├── .gitattributes
├── images
    ├── page-parser.jpeg
    └── page-parser-min.jpeg
├── requirements.txt
├── tests
    ├── __init__.py
    ├── parsers
    │   ├── __init__.py
    │   ├── baidu_parser_test.py
    │   ├── sina_parser_test.py
    │   └── kr36_parser_test.py
    └── page_parser_test.py
├── page_parser
    ├── parsers
    │   ├── __init__.py
    │   ├── baidu_parser.py
    │   ├── jobbole_parser.py
    │   ├── sina_parser.py
    │   ├── jandan_parser.py
    │   ├── lagou_parser.py
    │   ├── sogou_parser.py
    │   ├── qichacha_parser.py
    │   ├── douban_parser.py
    │   ├── xicidaili_parser.py
    │   └── kr36_parser.py
    ├── utils
    │   ├── __init__.py
    │   ├── request_util.py
    │   ├── time_util.py
    │   └── aes_cipher.py
    └── __init__.py
├── .travis.yml
├── demo_mini.py
├── test_parser.py
├── demo.py
├── setup.py
├── .gitignore
├── README.md
├── README-v0.0.4.md
└── README.rst


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.html linguist-language=Python


--------------------------------------------------------------------------------
/images/page-parser.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mouday/PageParser/HEAD/images/page-parser.jpeg


--------------------------------------------------------------------------------
/images/page-parser-min.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mouday/PageParser/HEAD/images/page-parser-min.jpeg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests >= 2.18.4
2 | parsel >= 1.4.0
3 | pycryptodome >= 3.20.0
4 | dateparser >= 0.7.2


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @File    : __init__.py
4 | @Date    : 2024-02-18
5 | """


--------------------------------------------------------------------------------
/tests/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @File    : __init__.py
4 | @Date    : 2024-02-18
5 | """


--------------------------------------------------------------------------------
/page_parser/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @File    : __init__.py
4 | @Date    : 2024-02-18
5 | """


--------------------------------------------------------------------------------
/page_parser/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @File    : __init__.py
4 | @Date    : 2024-02-18
5 | """


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | 
3 | install:
4 |   - pip install -r requirements.txt
5 | 
6 | # command to run tests
7 | script:
8 |   - pytest


--------------------------------------------------------------------------------
/demo_mini.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # page_parser 包使用示例：6行代码写爬虫
 3 | 
 4 | import requests
 5 | from page_parser.baidu_parser import BaiduParser
 6 | 
 7 | # 1、下载网页
 8 | response = requests.get("https://www.baidu.com/")
 9 | html = response.content.decode("utf-8")
10 | 
11 | # 2、解析网页
12 | items = BaiduParser().parse_index(html)
13 | 
14 | # 3、输出数据
15 | for item in items: print(item)
16 | # {'title': '百度一下，你就知道'}
17 | 


--------------------------------------------------------------------------------
/page_parser/utils/request_util.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @File    : request_util.py
 4 | @Date    : 2024-02-18
 5 | """
 6 | import requests
 7 | 
 8 | HEADERS = {
 9 |     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0"
10 | }
11 | 
12 | 
def request(url, timeout=30):
    """
    Download a page with a GET request and return its decoded text.

    The request is sent with the module-level HEADERS, and the response is
    decoded with the encoding detected from the body (apparent_encoding).

    :param url: {str} address of the page to download
    :param timeout: {float} seconds to wait for the server before giving up;
        without a timeout requests may block indefinitely
    :return: {str} the response body as text
    """
    response = requests.get(url=url, headers=HEADERS, timeout=timeout)
    response.encoding = response.apparent_encoding
    return response.text
17 | 


--------------------------------------------------------------------------------
/page_parser/utils/time_util.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @File    : time_util.py
 4 | @Date    : 2024-02-18
 5 | """
 6 | import dateparser
 7 | 
 8 | # 日期时间标准格式化
 9 | DATE_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
10 | 
11 | # 日期标准格式化
12 | DATE_FORMAT = "%Y-%m-%d"
13 | 
14 | # 时间标准格式化
15 | TIME_FORMAT = "%H:%M:%S"
16 | 
17 | 
def format_datetime(datetime_str):
    """
    Normalise a date/time string to the DATE_TIME_FORMAT layout.

    :param datetime_str: {str} date/time text to parse; may be None or empty
        (for example when a selector found nothing on the page)
    :return: {str} the formatted date/time, or None when the input is
        empty or dateparser cannot parse it
    """
    # Guard first: callers pass the raw result of a selector lookup, which is
    # None when the element is missing from the page.
    if not datetime_str:
        return None

    date_time = dateparser.parse(datetime_str)
    if date_time:
        return date_time.strftime(DATE_TIME_FORMAT)
    return None
22 | 


--------------------------------------------------------------------------------
/test_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2018-10-13
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | from page_parser import BaiduParser
 7 | import requests
 8 | 
 9 | 
10 | def test_parser():
11 |     response = requests.get("https://www.baidu.com/")
12 |     response.encoding = response.apparent_encoding
13 |     items = BaiduParser.parse_index(response.text)
14 |     for item in items:
15 |         print(item)
16 | 
17 |     print("test ok")
18 | 
19 | 
20 | if __name__ == '__main__':
21 |     test_parser()
22 | 


--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2018-10-15
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | # page_parser 包使用示例
 7 | 
 8 | import requests
 9 | from page_parser.baidu_parser import BaiduParser
10 | 
11 | # 1、下载网页
12 | url = "https://www.baidu.com/"
13 | response = requests.get(url)
14 | response.encoding = response.apparent_encoding
15 | 
16 | # 2、解析网页
17 | parser = BaiduParser()
18 | items = parser.parse_index(response.text)
19 | 
20 | # 3、输出数据
21 | for item in items:
22 |     print(item)
23 | 
24 |     # {'title': '百度一下，你就知道'}
25 | 


--------------------------------------------------------------------------------
/page_parser/parsers/baidu_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | baidu_parser.py
 4 | @Date    : 2018-10-13
 5 | @Author  : Peng Shiyu
 6 | """
 7 | 
 8 | from parsel import Selector
 9 | 
10 | 
def parse_index(html):
    """
    Parse the Baidu home page: https://www.baidu.com/

    :param html: {str} page HTML text
    :return: {dict} the extracted content, e.g.
        {
            'title': '百度一下，你就知道'
        }
    """
    # The title is None when the document has no <title> text.
    page_title = Selector(html).css("title::text").extract_first()
    return {"title": page_title}
28 | 


--------------------------------------------------------------------------------
/tests/page_parser_test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @File    : page_parser_test.py
 4 | @Date    : 2024-02-18
 5 | """
 6 | import json
 7 | import unittest
 8 | 
 9 | import page_parser
10 | 
11 | 
class PageParserTest(unittest.TestCase):
    """Network-backed smoke test for the top-level ``page_parser.parse`` entry point."""

    def test_parse(self):
        """Parse every sample URL and check that each one produces a result."""
        urls = [
            'https://www.baidu.com/',
            'https://36kr.com/p/2652091684060295'
        ]

        for url in urls:
            # subTest reports every failing URL instead of stopping at the first.
            with self.subTest(url=url):
                print('url:', url)
                data = page_parser.parse(url)
                print(json.dumps(data, ensure_ascii=False, indent=2))
                self.assertIsNotNone(data)
23 | 


--------------------------------------------------------------------------------
/tests/parsers/baidu_parser_test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | baidu_parser_test.py
 4 | @Date    : 2018-10-13
 5 | @Author  : Peng Shiyu
 6 | """
 7 | 
 8 | import unittest
 9 | 
10 | import requests
11 | 
12 | from page_parser.parsers import baidu_parser
13 | from page_parser.utils import request_util
14 | 
15 | 
class BaiduParserTest(unittest.TestCase):
    """Network-backed test for ``baidu_parser.parse_index``."""

    def test_parse_index(self):
        """Download the Baidu home page and check that a title entry is extracted."""
        content = request_util.request("https://www.baidu.com/")

        # parse_index returns one dict; iterating it would only yield its keys.
        item = baidu_parser.parse_index(content)
        print(item)
        # {'title': '百度一下，你就知道'}

        self.assertIn("title", item)
25 | 


--------------------------------------------------------------------------------
/tests/parsers/sina_parser_test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @File    : sina_parser_test.py
 4 | @Date    : 2024-02-18
 5 | """
 6 | 
 7 | 
 8 | import json
 9 | import unittest
10 | 
11 | from page_parser.parsers import sina_parser
12 | from page_parser.utils import request_util
13 | 
14 | 
class SinaParserTest(unittest.TestCase):
    """Network-backed test for ``sina_parser.parse_detail``."""

    def test_parse_detail(self):
        """Download one Sina finance article and check the shape of the parsed item."""
        url = "https://finance.sina.com.cn/roll/2024-02-17/doc-inaikkvc4492655.shtml"
        content = request_util.request(url)

        data = sina_parser.parse_detail(content)
        print(json.dumps(data, ensure_ascii=False, indent=2))

        # parse_detail always builds these keys; their values may be None when
        # the page lacks the corresponding element.
        for key in ("url", "title", "content", "publish_time"):
            self.assertIn(key, data)
24 | 


--------------------------------------------------------------------------------
/tests/parsers/kr36_parser_test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | kr36_parser_test.py
 4 | @Date    : 2018-10-13
 5 | @Author  : Peng Shiyu
 6 | """
 7 | import json
 8 | import unittest
 9 | 
10 | import requests
11 | 
12 | from page_parser.parsers import baidu_parser, kr36_parser
13 | from page_parser.utils import request_util
14 | 
15 | 
class kr36ParserTest(unittest.TestCase):
    """Network-backed test for ``kr36_parser.parse_detail``."""

    def test_parse_detail(self):
        """Download one 36kr article and check that the parser returns a result."""
        url = "https://36kr.com/p/2652091684060295"
        content = request_util.request(url)

        data = kr36_parser.parse_detail(content)
        print(json.dumps(data, ensure_ascii=False, indent=2))

        # NOTE(review): the exact shape of the result is defined in
        # kr36_parser; only its presence is checked here.
        self.assertIsNotNone(data)
25 | 


--------------------------------------------------------------------------------
/page_parser/parsers/jobbole_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2019-03-20
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | 
 7 | import requests
 8 | from parsel import Selector
 9 | 
10 | 
def parse_python(html):
    """
    Jobbole (http://www.jobbole.com/): extract every article link from the
    Python column page http://python.jobbole.com/ (page layout as of 2019-03-20).

    :param html: {str} page HTML text
    :return: {list} one dict per ".meta-title" element, with the keys
        "title" (link text) and "url" (href); missing values are ""
    """
    sel = Selector(text=html)

    # The collected items were previously built but never returned, so
    # callers always received None.
    return [
        {
            "title": article.css("::text").extract_first(""),
            "url": article.css("::attr(href)").extract_first(""),
        }
        for article in sel.css(".meta-title")
    ]
31 | 
32 | 
if __name__ == '__main__':
    url = "http://python.jobbole.com/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0"
    }
    # headers must be passed by keyword: the second positional argument of
    # requests.get is `params`, which would put the header into the query string.
    response = requests.get(url, headers=headers)

    # Print what was parsed so running the module shows a result.
    for item in parse_python(response.text) or []:
        print(item)
39 | 


--------------------------------------------------------------------------------
/page_parser/__init__.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from .parsers import baidu_parser, kr36_parser
 4 | from .parsers.jandan_parser import JandanParser
 5 | from .parsers.qichacha_parser import QichachaParser
 6 | from .parsers.sogou_parser import SogouParser
 7 | from .parsers.xicidaili_parser import XicidailiParser
 8 | from .parsers.douban_parser import DoubanParser
 9 | from .parsers.lagou_parser import LagouParser
10 | from .utils import request_util
11 | 
# Maps a URL regular expression to the parser callable for that page.
# Patterns are applied with re.match, i.e. anchored at the start of the URL
# only; literal dots are escaped so they do not match arbitrary characters.
parse_config = {
    r'.*://www\.baidu\.com/?': baidu_parser.parse_index,
    # The Douban movie page has its own parser (it yields one dict per movie);
    # it was previously routed to the Baidu parser.
    r'.*://movie\.douban\.com/?': DoubanParser.parse_movie,
    r'.*://36kr\.com/p/.*': kr36_parser.parse_detail,
}
17 | 
18 | 
def get_parse_function(url):
    """
    Look up the parser registered for a URL.

    :param url: {str} address of the page
    :return: the callable of the first parse_config entry whose pattern
        matches the start of the URL, or None when no pattern matches
    """
    candidates = (
        handler
        for pattern, handler in parse_config.items()
        if re.match(pattern, url)
    )
    return next(candidates, None)
24 | 
25 | 
def parse(url, content=None):
    """
    Parse a page with the parser registered for its URL.

    :param url: {str} address of the page; selects the parser
    :param content: {str} page HTML; downloaded from url when empty or omitted
    :return: whatever the selected parser returns for the page
    :raises Exception: when no parser is registered for the URL
    """
    handler = get_parse_function(url)
    if not handler:
        raise Exception("not found parse function")

    # Only hit the network when the caller did not supply the HTML.
    html = content if content else request_util.request(url)
    return handler(html)
42 | 


--------------------------------------------------------------------------------
/page_parser/parsers/sina_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @File    : sina_parser.py
 4 | @Date    : 2024-02-18
 5 | """
 6 | from parsel import Selector
 7 | 
 8 | from page_parser.utils import time_util
 9 | 
10 | 
def parse_detail(content):
    """
    Parse a Sina article detail page.

    :param content: {str} page HTML text
    :return: {dict} with the keys url, image_url, title, content,
        description, author and publish_time; a value is None when the
        page lacks the corresponding element
    """
    sel = Selector(content)

    title = sel.css('meta[property="og:title"]::attr(content)').extract_first()
    description = sel.css('meta[property="og:description"]::attr(content)').extract_first()
    url = sel.css('meta[property="og:url"]::attr(content)').extract_first()
    image_url = sel.css('meta[property="og:image"]::attr(content)').extract_first()
    publish_time = sel.css("#top_bar_wrap .date::text").extract_first()
    author = sel.css("#top_bar_wrap .source::text").extract_first()
    # Outer HTML of the article body element.
    body_html = sel.css("#artibody").get()

    # extract_first() returns None when og:image is absent, so check before
    # calling a string method; protocol-relative URLs get an explicit scheme.
    if image_url and image_url.startswith('//'):
        image_url = 'https:' + image_url

    # Only format a date that was actually found on the page.
    if publish_time:
        publish_time = time_util.format_datetime(publish_time)

    item = {
        "url": url,
        "image_url": image_url,
        "title": title,
        "content": body_html,
        "description": description,
        "author": author,
        "publish_time": publish_time,
    }

    return item
38 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2018-10-15
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | import setuptools
 7 | import os
 8 | import requests
 9 | 
10 | 
# Convert a Markdown file to reStructuredText
def md_to_rst(from_file, to_file):
    """
    Convert a Markdown file to reStructuredText through the docverter service.

    The conversion is best effort: when the source file cannot be read, the
    service cannot be reached, or it answers with an error status, nothing
    is written and the function simply returns.

    :param from_file: path of the Markdown file to upload
    :param to_file: path the converted reStructuredText is written to
    :return: None
    """
    try:
        # The context manager closes the upload handle once the request is done.
        with open(from_file, 'rb') as source:
            r = requests.post(url='http://c.docverter.com/convert',
                              data={'to': 'rst', 'from': 'markdown'},
                              files={'input_files[]': source},
                              timeout=30)
    except (OSError, requests.RequestException) as err:
        # A missing file or an unreachable service must not abort setup.py.
        print("md_to_rst skipped: %s" % err)
        return

    if r.ok:
        with open(to_file, "wb") as f:
            f.write(r.content)
19 | 
20 | 
md_to_rst("README.md", "README.rst")

# Use README.rst as the long description when it exists.
long_description = 'Add a fallback short description here'
if os.path.exists('README.rst'):
    with open('README.rst', encoding="utf-8") as readme_file:
        long_description = readme_file.read()

# One requirement per non-blank line of requirements.txt.
requirements = []
if os.path.exists('requirements.txt'):
    with open('requirements.txt', encoding="utf-8") as requirements_file:
        requirements = [line.strip() for line in requirements_file if line.strip()]

setuptools.setup(
    name="page_parser",
    version="0.0.4",
    author="Peng Shiyu",
    author_email="pengshiyuyx@gmail.com",
    description="web crawler or spider parse page",
    long_description=long_description,
    long_description_content_type="text/x-rst",
    url="https://github.com/mouday/PageParser",
    packages=setuptools.find_packages(),
    # setuptools expects a list here, not a tuple.
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=requirements,

    # Commonly used
    package_data={
        # Ship every file found in the package's source/ directory.
        'page_parser': ['source/*.*'],
    }
)
55 | 


--------------------------------------------------------------------------------
/page_parser/parsers/jandan_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2018-10-17
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | from parsel import Selector
 7 | 
 8 | 
 9 | class JandanParser(object):
10 |     """
11 |     煎蛋网：http://jandan.net/
12 |     """
13 | 
14 |     @staticmethod
15 |     def parse_index(html):
16 |         """
17 |         解析主页：http://jandan.net/
18 |         :param html: {str} 网页文本
19 |         :return: {iterator} 抽取的内容
20 |         """
21 |         sel = Selector(html)
22 |         posts = sel.css("#content .list-post")
23 |         for post in posts:
24 |             image = post.css(".thumbs_b img::attr(src)").extract_first("")
25 |             if image == "":
26 |                 image = post.css(".thumbs_b img::attr(data-original)").extract_first("")
27 | 
28 |             title = post.css("h2 a::text").extract_first("")
29 |             url = post.css("h2 a::attr(href)").extract_first("")
30 |             author = post.css(".time_s a::text").extract_first("")
31 |             tag = post.css(".time_s strong a::text").extract_first("")
32 |             summary = "".join(post.css(".indexs::text").extract()).strip()
33 | 
34 |             if image.startswith("//"):
35 |                 image = "http:" + image
36 | 
37 |             item = {
38 |                 "title": title,
39 |                 "author": author,
40 |                 "tag": tag,
41 |                 "summary": summary,
42 |                 "image": image,
43 |                 "url": url
44 |             }
45 |             yield item
46 | 
47 | 
if __name__ == '__main__':
    import requests

    # Manual check: download the live home page and print every parsed post.
    page = requests.get("http://jandan.net/")
    for entry in JandanParser().parse_index(page.text):
        print(entry)
55 | 


--------------------------------------------------------------------------------
/page_parser/parsers/lagou_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2018-10-13
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | from parsel import Selector
 7 | 
 8 | 
 9 | class LagouParser(object):
10 |     """
11 |     拉勾网：https://www.lagou.com
12 |     """
13 | 
14 |     @staticmethod
15 |     def parse_zhaopin(html):
16 |         """
17 |         解析招聘职位列表页: https://www.lagou.com/zhaopin/
18 |         """
19 |         sel = Selector(text=html)
20 |         lis = sel.css("#s_position_list .item_con_list li")
21 |         for li in lis:
22 |             position_link = li.css(".position_link::attr(href)").extract_first("")
23 |             position = li.css("h3::text").extract_first("")
24 |             money = li.css(".money::text").extract_first("")
25 |             company = li.css(".company_name a::text").extract_first("")
26 |             logo = li.css(".com_logo img::attr(src)").extract_first("")
27 |             position_tempt = li.css(".list_item_bot .li_b_r::text").extract_first("")
28 |             key_words = "".join(li.css(".list_item_bot .li_b_l span::text").extract())
29 |             position_require = "".join(li.css(".p_bot .li_b_l::text").extract()).strip()
30 |             post_time = li.css(".format-time::text").extract_first("")
31 |             industry = li.css(".industry::text").extract_first("").strip()
32 | 
33 |             item = {
34 |                 "position": position,
35 |                 "money": money,
36 |                 "company": company,
37 |                 "industry": industry,
38 |                 "logo": logo,
39 |                 "key_words": key_words,
40 |                 "position_link": position_link,
41 |                 "position_tempt": position_tempt,
42 |                 "position_require": position_require,
43 |                 "post_time": post_time
44 |             }
45 | 
46 |             yield item
47 | 
48 | 
if __name__ == '__main__':
    # The page cannot be downloaded directly, so a locally saved copy is used.
    # The context manager closes the file handle after reading.
    with open("source/lagou-zhaopin.html") as saved_page:
        html = saved_page.read()

    items = LagouParser().parse_zhaopin(html)
    for item in items:
        print(item)
55 | 


--------------------------------------------------------------------------------
/page_parser/parsers/sogou_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2018-10-13
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | from parsel import Selector
 7 | 
 8 | 
 9 | class SogouParser(object):
10 |     """
11 |     搜狗搜索：https://www.sogou.com/
12 |     """
13 | 
14 |     @staticmethod
15 |     def parse_weixin_name(html):
16 |         """
17 |         解析搜狗微信公众号搜索，搜到的微信信息结果
18 |         https://weixin.sogou.com/weixin?type=1&query=百度
19 |         """
20 |         sel = Selector(text=html)
21 |         lst = sel.css(".news-list2 li")
22 |         for li in lst:
23 |             img = li.css(".img-box img::attr(src)").extract_first("")
24 |             if img.startswith("//"):
25 |                 img = "http:" + img
26 |             title = li.css(".tit").xpath("string(.)").extract_first("").strip()
27 |             name = li.css(".info label::text").extract_first("")
28 |             name = name.replace("微信号：", "")
29 |             introduce_title = li.xpath("./dl[1]/dt/text()").extract_first("")
30 |             if "功能介绍" in introduce_title:
31 |                 introduce = li.xpath("./dl[1]/dd").xpath("string(.)").extract_first("")
32 |             else:
33 |                 introduce = ""
34 |             authentication_title = li.xpath("./dl[2]/dt").xpath("string(.)").extract_first("")
35 |             if "认证" in authentication_title:
36 |                 authname = li.xpath("./dl[2]/dd/text()").extract_first("")
37 |             else:
38 |                 authname = ""
39 |             qrcode = "http://open.weixin.qq.com/qr/code?username=" + name
40 | 
41 |             item = dict()
42 |             item["title"] = title
43 |             item["name"] = name
44 |             item["introduce"] = introduce
45 |             item["authname"] = authname
46 |             item["img"] = img
47 |             item["qrcode"] = qrcode
48 | 
49 |             yield item
50 | 
51 | 
if __name__ == '__main__':
    import requests

    # Manual check: run one live search and print every parsed account.
    search_url = "https://weixin.sogou.com/weixin?type=1&query=百度"
    page = requests.get(search_url)
    for entry in SogouParser().parse_weixin_name(page.text):
        print(entry)
60 | 


--------------------------------------------------------------------------------
/page_parser/parsers/qichacha_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2018-10-13
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | 
 7 | from parsel import Selector
 8 | 
 9 | 
class QichachaParser(object):
    """
    Qichacha: https://www.qichacha.com/
    """

    @staticmethod
    def parse_financing(html):
        """
        Parse the financing events page: https://www.qichacha.com/elib_financing

        :param html: {str} page HTML text
        :return: {iterator} one dict per table row after the first
        """
        # Output key -> XPath of the value, relative to a table row.
        columns = (
            ("product_image", "./td[1]/img/@src"),
            ("product_name", "./td[2]/a/text()"),
            ("product_link", "./td[2]/a/@href"),
            ("company", "./td[3]/text()"),
            ("investor", "./td[4]/text()"),
            ("financing_stage", "./td[5]/text()"),
            ("financing_money", "./td[6]/text()"),
            ("financing_time", "./td[7]/text()"),
        )

        # The first row of the table is skipped.
        data_rows = Selector(html).css(".ntable tr")[1:]
        for row in data_rows:
            yield {
                key: row.xpath(path).extract_first("").strip()
                for key, path in columns
            }
46 | 
47 | 
if __name__ == '__main__':
    import requests

    # Manual check against the live page, sent with a browser-like user agent.
    page = requests.get(
        "https://www.qichacha.com/elib_financing",
        headers={
            "user-agent": "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        },
    )
    page.encoding = page.apparent_encoding
    for record in QichachaParser().parse_financing(page.text):
        print(record)
59 | 


--------------------------------------------------------------------------------
/page_parser/parsers/douban_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # @Date    : 2018-10-16
 4 | # @Author  : Peng Shiyu
 5 | 
 6 | from parsel import Selector
 7 | 
 8 | 
 9 | class DoubanParser(object):
10 |     """
11 |     豆瓣网：https://www.douban.com/
12 |     """
13 | 
14 |     @staticmethod
15 |     def parse_movie(html):
16 |         """
17 |         豆瓣电影 正在热映：https://movie.douban.com/
18 | 
19 |         以下两个板块是接口不做解析
20 |         最近热门电影：
21 |             https://movie.douban.com/j/search_subjects?type=movie&tag=热门&page_limit=50&page_start=0
22 |         最近热门电视剧：
23 |             https://movie.douban.com/j/search_subjects?type=tv&tag=热门&page_limit=50&page_start=0
24 |         """
25 | 
26 |         sel = Selector(html)
27 |         rows = sel.css("#screening li.ui-slide-item")
28 |         for row in rows:
29 |             title = row.xpath("./@data-title").extract_first("")
30 |             release = row.xpath("./@data-release").extract_first("")
31 |             rate = row.xpath("./@data-rate").extract_first("")
32 |             star = row.xpath("./@data-star").extract_first("")
33 |             duration = row.xpath("./@data-duration").extract_first("")
34 |             region = row.xpath("./@data-region").extract_first("")
35 |             director = row.xpath("./@data-director").extract_first("")
36 |             actors = row.xpath("./@data-actors").extract_first("")
37 |             image = row.css(".poster img::attr(src)").extract_first("")
38 |             href = row.css(".poster a::attr(href)").extract_first("")
39 | 
40 |             item = {
41 |                 "title": title,
42 |                 "release": release,
43 |                 "rate": rate,
44 |                 "star": star,
45 |                 "duration": duration,
46 |                 "region": region,
47 |                 "director": director,
48 |                 "actors": actors,
49 |                 "image": image,
50 |                 "href": href,
51 |             }
52 |             yield item
53 | 
54 | 
if __name__ == '__main__':
    import requests

    # Manual check: download the live page and print every parsed movie.
    page = requests.get("https://movie.douban.com/")
    for movie in DoubanParser().parse_movie(page.text):
        print(movie)
62 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.gitignore.io/api/python
  3 | 
  4 | ### Python ###
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # celery beat schedule file
 88 | celerybeat-schedule
 89 | 
 90 | # SageMath parsed files
 91 | *.sage.py
 92 | 
 93 | # Environments
 94 | .env
 95 | .venv
 96 | env/
 97 | venv/
 98 | ENV/
 99 | env.bak/
100 | venv.bak/
101 | 
102 | # Spyder project settings
103 | .spyderproject
104 | .spyproject
105 | 
106 | # Rope project settings
107 | .ropeproject
108 | 
109 | # mkdocs documentation
110 | /site
111 | 
112 | # mypy
113 | .mypy_cache/
114 | .dmypy.json
115 | dmypy.json
116 | 
117 | ### Python Patch ###
118 | .venv/
119 | 
120 | ### Python.VirtualEnv Stack ###
121 | # Virtualenv
122 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
123 | [Bb]in
124 | [Ii]nclude
125 | [Ll]ib
126 | [Ll]ib64
127 | [Ll]ocal
128 | [Ss]cripts
129 | pyvenv.cfg
130 | pip-selfcheck.json
131 | 
132 | 
133 | # End of https://www.gitignore.io/api/python
134 | 
135 | # pycharm
136 | .idea
137 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PageParser
  2 | 
  3 | [![Build Status](https://travis-ci.org/mouday/PageParser.svg?branch=master)](https://travis-ci.org/mouday/PageParser)
  4 | ![GitHub](https://img.shields.io/github/license/mashape/apistatus.svg)
  5 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/page-parser.svg)
  6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/page-parser.svg)
  7 | ![PyPI](https://img.shields.io/pypi/v/page-parser.svg)
  8 | ![GitHub last commit](https://img.shields.io/github/last-commit/mouday/PageParser.svg)
  9 | ![PyPI - Format](https://img.shields.io/pypi/format/page-parser.svg)
 10 | 
 11 | ## 项目简介
 12 | 
 13 | 项目名称：六行代码写爬虫
 14 | 
 15 | 英文名称：PageParser
 16 | 
 17 | 项目简介：一个爬虫使用的网页解析包，实现最大限度的代码复用
 18 | 
 19 | 项目目标：不懂网页解析也能写爬虫
 20 | 
 21 | > 注意：本项目仅用于学习交流，不可用于商业项目
 22 | 
 23 | ## 安装模块
 24 | ```
 25 | pip install page-parser
 26 | ```
 27 | 
 28 | 最小项目示例：
 29 | 
 30 | ```python
 31 | import page_parser
 32 | 
 33 | # 1、指定网页
 34 | url = "https://www.baidu.com/"
 35 | 
 36 | # 2、解析网页
 37 | items = page_parser.parse(url)
 38 | 
 39 | # 3、输出数据
 40 | for item in items: print(item)
 41 | # {'title': '百度一下，你就知道'}
 42 | ```
 43 | 
 44 | ## 支持网页
 45 | 
 46 | | 序号 |网站 | 网页名称 |网页地址 |
 47 | | - |- | - | - |
 48 | | 1 |百度 | 主页  | https://www.baidu.com/ |
 49 | | 2 |豆瓣 | 电影 正在热映 | https://movie.douban.com/ |
 50 | | 3 |拉勾 | 招聘职位列表页  | https://www.lagou.com/zhaopin/ |
 51 | | 4 |企查查 | 融资事件页  | https://www.qichacha.com/elib_financing |
 52 | | 5 |西刺代理 | 主页  | http://www.xicidaili.com/ |
 53 | | 6 |西刺代理 | 国内高匿代理 | http://www.xicidaili.com/nn/ |
 54 | | 7 |西刺代理 | 国内普通代理 | http://www.xicidaili.com/nt/ |
 55 | | 8 |西刺代理 | 国内HTTPS代理 | http://www.xicidaili.com/wn/ |
 56 | | 9 |西刺代理 | 国内HTTP代理 | http://www.xicidaili.com/wt/ |
 57 | | 10 |搜狗搜索 | 微信公众号搜索页  | https://weixin.sogou.com/weixin?type=1&query=百度 |
 58 | | 11 | 煎蛋网 | 主页列表 | http://jandan.net/|
 59 | |12| 伯乐在线 | python栏目 | http://python.jobbole.com/|
 60 | 
 61 | 
 62 | ## 网络爬虫工作流程：
 63 | 
 64 | ```
 65 | 页面下载器 -> 页面解析器 -> 数据存储
 66 | 
 67 | ```
 68 | 
 69 | `页面下载器`: 主要涉及防爬攻破，方法各异，爬虫的难点也在此
 70 | 
 71 | `页面解析器`: 一般页面在一段时间内是固定的，每个人下载页面后都需要解析出页面内容，属于重复工作
 72 | 
 73 | `数据存储`: 不管是存储到什么文件或数据库，主要看业务需求
 74 | 
 75 | 此项目就是将这项工作抽离出来，让网络爬虫程序重点关注于：网页下载，而不是重复的网页解析
 76 | 
 77 | ## 项目说明
 78 | 
 79 | 此项目可以和python 的requests 和scrapy 配合使用
 80 | 
 81 | 当然如果要和其他编程语言使用，可以使用flask等网络框架再次对此项目进行封装，提供网络接口即可
 82 | 
 83 | 发起人：mouday
 84 | 
 85 | 发起时间：2018-10-13
 86 | 
 87 | 需要更多的人一起来维护
 88 | 
 89 | ## 贡献代码
 90 | 
 91 | 贡献的代码统一放入文件夹：page_parser
 92 | 
 93 | 代码示例，如没有更好的理由，应该按照下面示例文件的格式，便于使用者调用
 94 | 
 95 | 示例文件：page_parser/parsers/baidu_parser.py
 96 | 
 97 | ## 说明：
 98 | 
 99 | ### 原则：
100 | 
101 | 1. 按照网站分类建立解析类
102 | 
103 | 2. 解析方法包含在解析类中，为方便调用，需要使用静态方法
104 | 
105 | 3. 因为网页解析有时效性，所以必须`注明日期`
106 | 
107 | 
108 | ### 命名规则：
109 | 例如:
110 | ```
111 | 文件名：baidu_parser
112 | 类名：BaiduParser
113 | 方法名：parse_index
114 | ```
115 | 
116 | ### 其他
117 | 
118 | 1. 必要的代码注释
119 | 
120 | 2. 必要的测试代码
121 | 
122 | 3. 其他必要的代码
123 | 
124 | 
125 | ## 加入我们
126 | ### 基本要求
127 | 1. python的基本语法 + 面向对象 + 迭代器（yield）
128 | 2. 掌握的库：requests、parsel、scrapy（了解即可）
129 | 3. 解析库统一使用parsel（基于xpath），简单高效，与scrapy无缝衔接
130 | 4. 不太懂也没关系，自己看参考文章，只要愿意学就会，瞬间提升自己
131 | 
132 | 参考文章：
133 | 
134 | 1. [Python编程：class类面向对象](https://blog.csdn.net/mouday/article/details/79002712)
135 | 
136 | 2. [Python编程：生成器yield与yield from区别简单理解](https://blog.csdn.net/mouday/article/details/80760973)
137 | 
138 | 3. [Python爬虫：requests库基本使用](https://blog.csdn.net/mouday/article/details/80087627)
139 | 
140 | 4. [Python网络爬虫之scrapy框架](https://blog.csdn.net/mouday/article/details/79736108)
141 | 
142 | 5. [Python爬虫：xpath常用方法示例](https://blog.csdn.net/mouday/article/details/80364436)
143 | 
144 | 6. [python爬虫：scrapy框架xpath和css选择器语法](https://blog.csdn.net/mouday/article/details/80455560)
145 | 
146 | ### 联系方式
147 | 
148 | PageParser QQ群号: 932301512
149 | 
150 | ![](images/page-parser-min.jpeg)
151 | 


--------------------------------------------------------------------------------
/page_parser/parsers/xicidaili_parser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # @Date    : 2018-10-15
  4 | # @Author  : Peng Shiyu
  5 | 
  6 | 
  7 | from parsel import Selector
  8 | 
  9 | 
 10 | class XicidailiParser(object):
 11 |     """
 12 |     西刺代理：http://www.xicidaili.com/
 13 |     """
 14 | 
 15 |     @staticmethod
 16 |     def parse_index(html):
 17 |         """
 18 |         解析主页：http://www.xicidaili.com/
 19 |         """
 20 |         sel = Selector(html)
 21 |         trs = sel.css("#ip_list tr")
 22 |         for tr in trs:
 23 |             country = tr.xpath("./td[1]/img/@alt").extract_first("")
 24 |             ip = tr.xpath("./td[2]/text()").extract_first("")
 25 |             port = tr.xpath("./td[3]/text()").extract_first("")
 26 |             server_address = tr.xpath("./td[4]/text()").extract_first("")
 27 |             hide_type = tr.xpath("./td[5]/text()").extract_first("")
 28 |             scheme_type = tr.xpath("./td[6]/text()").extract_first("")
 29 |             live_time = tr.xpath("./td[7]/text()").extract_first("")
 30 |             verify_time = tr.xpath("./td[8]/text()").extract_first("")
 31 | 
 32 |             item = {
 33 |                 "country": country,
 34 |                 "ip": ip,
 35 |                 "port": port,
 36 |                 "server_address": server_address,
 37 |                 "hide_type": hide_type,
 38 |                 "scheme_type": scheme_type,
 39 |                 "live_time": live_time,
 40 |                 "verify_time": verify_time,
 41 |             }
 42 |             if country != "":
 43 |                 yield item
 44 |             else:
 45 |                 continue
 46 | 
 47 |     def parse_list(self, html):
 48 |         """
 49 |         解析代理列表页：
 50 |         1. 国内高匿代理: http://www.xicidaili.com/nn/
 51 |         2. 国内普通代理: http://www.xicidaili.com/nt/
 52 |         3. 国内HTTPS代理: http://www.xicidaili.com/wn/
 53 |         4. 国内HTTP代理: http://www.xicidaili.com/wt/
 54 |         """
 55 |         sel = Selector(html)
 56 |         trs = sel.css("#ip_list tr")
 57 |         for tr in trs:
 58 |             country = tr.xpath("./td[1]/img/@alt").extract_first("")
 59 |             ip = tr.xpath("./td[2]/text()").extract_first("")
 60 |             port = tr.xpath("./td[3]/text()").extract_first("")
 61 |             server_address = tr.xpath("./td[4]/a/text()").extract_first("")
 62 |             hide_type = tr.xpath("./td[5]/text()").extract_first("")
 63 |             scheme_type = tr.xpath("./td[6]/text()").extract_first("")
 64 |             speed = tr.xpath("./td[7]/div/@title").extract_first("")
 65 |             connect_time = tr.xpath("./td[8]/div/@title").extract_first("")
 66 |             live_time = tr.xpath("./td[9]/text()").extract_first("")
 67 |             verify_time = tr.xpath("./td[10]/text()").extract_first("")
 68 | 
 69 |             item = {
 70 |                 "country": country,
 71 |                 "ip": ip,
 72 |                 "port": port,
 73 |                 "server_address": server_address,
 74 |                 "hide_type": hide_type,
 75 |                 "speed": speed,
 76 |                 "connect_time": connect_time,
 77 |                 "scheme_type": scheme_type,
 78 |                 "live_time": live_time,
 79 |                 "verify_time": verify_time,
 80 |             }
 81 |             if country != "":
 82 |                 yield item
 83 |             else:
 84 |                 continue
 85 | 
 86 | 
 87 | if __name__ == '__main__':
 88 |     import requests
 89 | 
 90 |     headers = {
 91 |         "user-agent": "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
 92 |     }
 93 | 
 94 |     # url="http://www.xicidaili.com/"  # 首页
 95 |     # url = "http://www.xicidaili.com/nn/"  # 国内高匿代理
 96 |     # url = "http://www.xicidaili.com/nt/" # 国内普通代理
 97 |     # url= "http://www.xicidaili.com/wn/"# 国内HTTPS代理
 98 |     url = "http://www.xicidaili.com/wt/"  # 国内HTTP代理
 99 | 
100 |     response = requests.get(url, headers=headers)
101 | 
102 |     items = XicidailiParser().parse_list(response.text)
103 | 
104 |     for item in items:
105 |         print(item)
106 | 


--------------------------------------------------------------------------------
/page_parser/utils/aes_cipher.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import base64
 3 | 
 4 | from Crypto.Cipher import AES
 5 | from Crypto.Util.Padding import pad, unpad
 6 | 
 7 | 
 8 | class AESCipher(object):
 9 |     """
10 |     pip3 install pycryptodome
11 |     """
12 | 
13 |     def __init__(self, key, mode, **kwargs):
14 |         """
15 |         :param key:
16 |             16 (AES-128)
17 |             24 (AES-192)
18 |             32 (AES-256)
19 |         :param mode: 模式
20 |         :param kwargs:
21 |             iv 初始向量 MODE_CBC 模式使用 必须是16字节
22 | 
23 |         """
24 |         self.key = key
25 |         self.mode = mode
26 |         self.kwargs = kwargs
27 | 
28 |     def _get_aes(self):
29 |         """TypeError: decrypt() cannot be called after encrypt()"""
30 |         return AES.new(self.key.encode('utf-8'), self.mode, **self.kwargs)
31 | 
32 |     def encrypt(self, plain_text):
33 |         # 选择pkcs7补全
34 |         pad_pkcs7 = pad(plain_text.encode('utf-8'), AES.block_size)
35 |         encrypt_data = self._get_aes().encrypt(pad_pkcs7)
36 |         return str(base64.b64encode(encrypt_data), encoding='utf-8')
37 | 
38 |     def decrypt(self, cipher_text):
39 |         padded_data = self._get_aes().decrypt(base64.b64decode(cipher_text.encode('utf-8')))
40 |         return str(unpad(padded_data, AES.block_size), encoding='utf-8')
41 | 
42 | 
43 | def main():
44 |     md5_key = "BGdYBvGedpAdpVHM"
45 |     cipher_text = "IDj5xARMogOh88Z\/yAuFX539gW9\/\/4ZVjSjXmDUUu0BYeDYv0vNHgj3fU7YGROrpSmJMnBxYontO+KUqkt2WHKsuq2gyP+ijHOObDmVm0R7Fwd52RW1izzYwODLFnJeixqHLe1Ai56+Mre3ChxzMIWk973DovQU\/wg4hWvDkRnG95Bfb7O\/XDC3Gg53NE5sKx6LbfP4ufZG0l5mc5Ra287b62AqRQE7E05s+zrerV5een6zx4jzge4w9duQFKDtV5Lod5W1fdXepDzXndvlbNdUxrJyOCPN49l2UdiYIx+SbRo6Juu70f48Ixqm6Hh4wQ1jO1UxIG7mOtvWgaXvQ0gTE79yVwGaugdi3sDo9yz3OXjl2vBgIK3MxvK6A9pmLzCYUKrKNUfM+uQSjznuWaxzYiWv3gVMIywGUsBRqDoEHwf4HH40crPSrG2LBHY14UD6QXgHQT\/TtBnsFr+saReqLlWUAzJEHMYTOeqJNr91NWGN2KBFYcGnOKof3uo+sUwHsgyKXi\/IirrVp+Fq5ljW\/LKjdGRlHt0DVDXGBCUoKnggXPPgLFq20ooKIqwW7CBAlKhsZUVeayRFxGVCQP4oPRPTbr3BYEDCNuTEgYBigB9Ic6BiHQ1bVCvHbUzbybjMascWGYOz7FB4j0DuLAugzyKgWylfSfKQuo\/qE4+wYFZizBPYOgewnYc8LD8g1qc3y0R8UePzCTEq+jHguMkUpfdngVzA38VDvtCULud6nGjn4MuB+zyevGLzYy\/FzBjX8uI9UzeTME77h8qkS1ug55BQpSH2Z9wb5L3ZSzcxOnkh7Pt52JCt9I+ctjlp8K2wCuOAR7a+q+wkkjBPfAoWifm1\/+HrPdB\/2WV8cDAMyrkYQ9g4LAeKeelSK9CD+bjMascWGYOz7FB4j0DuLAlHZG1AqpRka9aa5JsJdsQQYFZizBPYOgewnYc8LD8g1IuTbx4ySvBGifx7CmawNRd4ESBC+9xgSPMVEwcMvcuT+Ff+W9Y3cSCfLwVEPmC2iVsdUHiMF2LQQIe7CYT5DpOg55BQpSH2Z9wb5L3ZSzczRNczLk1JVH1ZZhXuU+dFgW19dAumhAiUmvbds\/5qstPFMPkmBtu7S7WJXgj4I3Gs4OLrj7CDWJWCXiyyodrbpbjMascWGYOz7FB4j0DuLAp1ngcqJ2qc16r32EiKBTor+Ff+W9Y3cSCfLwVEPmC2iw43uzfJHk+0nJwpyc+esPdq5SOeP\/Y0oDglfBmFZzZCDKxxFonL6PqXEahWV8+HjQ9tDAEHJQdQKEHyMCwZ+nOg55BQpSH2Z9wb5L3ZSzcwYiLZ2lUcwuP+vN3igbKST\/j6BJZ4EEnRMMiG0zTc4LymvL7jnSid\/dZfp8k446C38WcIOVMxip3tKOruvTeT2nciLy9edjqGu+H77n\/CoOCmzPx1upTn0yQMm9hTVN8w4\/U+Cqrk7r4SjWUCtavGtbjMascWGYOz7FB4j0DuLAqB7Cy+sBxLMHgpZD25ZersYFZizBPYOgewnYc8LD8g1Y1ZtQm2r2T5Fi493+siC4X8ZfMY8bGYGcvF7ESr7CiZSiQgd+X96cgKZJ2c6aiXbhHvgApWa2axOfVSg73xtRug55BQpSH2Z9wb5L3ZSzcyOipWHjuxKTt3n14uPGkIrNN4cy7FvauFu7DQwM\/uRqymvL7jnSid\/dZfp8k446C31Ga4mvcgs9CTSLbR8i16RqAwrqcDqr+VPnO1mwMArEzTAchyf+5Ve5fsuh5sgItUYFZizBPYOgewnYc8LD8g1pB0ra78tWSVciUSfr9u8Dyd3UZ9Z1djk4xsU\/fnSFD3wvBnO\/wYamT2OH10xhGCILHJdEv0hrPZc0zrD3U+PpSmvL7jnSid\/dZfp8k446C1OmfaBkp919UZbMpqqI5frqAwrqcDqr+VPnO1mwMArE+OvmDA\/AmgI
U85EeSd+H6fPXJRxeetu3nRxwPOxyAvabjMascWGYOz7FB4j0DuLAv6dPAxS5InExzdfOHLKWG0YFZizBPYOgewnYc8LD8g1fD4kgLdwV5UBco05Pmv3nPj5jAiQaS1uDkfKLqVh6hf+Ff+W9Y3cSCfLwVEPmC2inBDWSa7JB4mJHoIOJscHhug55BQpSH2Z9wb5L3ZSzcwxlCssatMnin4QqIyPu\/EPvWlnWnfyiFOTe\/aSuaC12YdCesupyFzD5B8Fqr8SRB4rTjP\/x3elHaia0x0\/UzFpqAwrqcDqr+VPnO1mwMArE4HtGY7wd8HnyWlhmc5FLVKew73gvTduwuq746ZMW+tubjMascWGYOz7FB4j0DuLAt1i3o+RgfTlYz+1kObhjIYYFZizBPYOgewnYc8LD8g1CxXkRSWKpX+gpfaDdXGRzusPP576SzPUfB6+K48Pxhj+Ff+W9Y3cSCfLwVEPmC2iarP\/Z8gxMryrEfBiADA53JtTjM2EzcRUCiQKXE9T1TYGj+ZfMPTEOg5kJlHQhzENjcTYjoy6K\/TfeiE9PkQtpymvL7jnSid\/dZfp8k446C2lKjN464Xu+OX6dTD9\/PE8qAwrqcDqr+VPnO1mwMArE61JDAgcZoWpV5LIBTyzFB4lMbzZ3gLMpsFyBotlqPUPbjMascWGYOz7FB4j0DuLAtUFq2CcBUK53C2CGVwUCxUYFZizBPYOgewnYc8LD8g1tYiKdBornoCDheKE3SDWYc5tclov9V7wr0JdBoxPRNP+Ff+W9Y3cSCfLwVEPmC2iUCk1PuTK39r8AQ0GtUBlRQfpJSJXHiG56CVPQQMOSlGYh\/P56TyYcTOFRikZXmkBmIZrSGGMJl8ew1IjKIjBuBbq\/k1q6A57TWrb4L7aS0YGxzJRmIVSfT3XW630DcEJ"
46 | 
47 |     # ECB 模式
48 |     ecb_cipher = AESCipher(md5_key, mode=AES.MODE_ECB)
49 | 
50 |     # 7J0VfbEYF0XdLnLuA1b4Fw==
51 |     print(ecb_cipher.decrypt(cipher_text))
52 | 
53 | 
if __name__ == '__main__':
    # Run the decryption demo only when this file is executed as a script.
    main()
56 | 


--------------------------------------------------------------------------------
/README-v0.0.4.md:
--------------------------------------------------------------------------------
  1 | # PageParser
  2 | 
  3 | [![Build Status](https://travis-ci.org/mouday/PageParser.svg?branch=master)](https://travis-ci.org/mouday/PageParser)
  4 | ![GitHub](https://img.shields.io/github/license/mashape/apistatus.svg)
  5 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/page-parser.svg)
  6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/page-parser.svg)
  7 | ![PyPI](https://img.shields.io/pypi/v/page-parser.svg)
  8 | ![GitHub last commit](https://img.shields.io/github/last-commit/mouday/PageParser.svg)
  9 | ![PyPI - Format](https://img.shields.io/pypi/format/page-parser.svg)
 10 | 
 11 | ## 项目简介
 12 | 
 13 | 项目名称：六行代码写爬虫
 14 | 
 15 | 英文名称：PageParser
 16 | 
 17 | 项目简介：一个爬虫使用的网页解析包，实现最大限度的代码复用
 18 | 
 19 | 项目目标：不懂网页解析也能写爬虫
 20 | 
 21 | 
 22 | ## 安装模块
 23 | ```
 24 | pip install page-parser
 25 | ```
 26 | 
 27 | 最小项目示例：
 28 | 
 29 | ```python
 30 | import requests
 31 | from page_parser import BaiduParser
 32 | 
 33 | # 1、下载网页
 34 | response = requests.get("https://www.baidu.com/")
 35 | html = response.content.decode("utf-8")
 36 | 
 37 | # 2、解析网页
 38 | items = BaiduParser.parse_index(html)
 39 | 
 40 | # 3、输出数据
 41 | for item in items: print(item)
 42 | # {'title': '百度一下，你就知道'}
 43 | ```
 44 | 
 45 | ## 支持网页
 46 | 
 47 | | 序号 |网站 | 网页名称 |网页地址 |
 48 | | - |- | - | - |
 49 | | 1 |百度 | 主页  | https://www.baidu.com/ |
 50 | | 2 |豆瓣 | 电影 正在热映 | https://movie.douban.com/ |
 51 | | 3 |拉勾 | 招聘职位列表页  | https://www.lagou.com/zhaopin/ |
 52 | | 4 |企查查 | 融资事件页  | https://www.qichacha.com/elib_financing |
 53 | | 5 |西刺代理 | 主页  |http://www.xicidaili.com/ |
 54 | | 6 |西刺代理 | 国内高匿代理 | http://www.xicidaili.com/nn/ |
 55 | | 7 |西刺代理 | 国内普通代理 |http://www.xicidaili.com/nt/ |
 56 | | 8 |西刺代理 | 国内HTTPS代理 |http://www.xicidaili.com/wn/ |
 57 | | 9 |西刺代理 | 国内HTTP代理 | http://www.xicidaili.com/wt/ |
 58 | | 10 |搜狗搜索 | 微信公众号搜索页  | https://weixin.sogou.com/weixin?type=1&query=百度 |
 59 | | 11 | 煎蛋网 | 主页列表 | http://jandan.net/|
 60 | |12| 伯乐在线 | python栏目 | http://python.jobbole.com/|
 61 | 
 62 | ## 使用示例
 63 | ```python
 64 | # -*- coding: utf-8 -*-
 65 | 
 66 | import requests
 67 | from page_parser import BaiduParser
 68 | 
 69 | # 1、下载网页
 70 | url = "https://www.baidu.com/"
 71 | response = requests.get(url)
 72 | response.encoding = response.apparent_encoding
 73 | 
 74 | # 2、解析网页
 75 | items = BaiduParser.parse_index(response.text)
 76 | 
 77 | # 3、输出数据
 78 | for item in items:
 79 |     print(item)
 80 | 
 81 | # {'title': '百度一下，你就知道'}
 82 | 
 83 | ```
 84 | 
 85 | ## 网络爬虫工作流程：
 86 | 
 87 | ```
 88 | 页面下载器 -> 页面解析器 -> 数据存储
 89 | 
 90 | ```
 91 | 
 92 | `页面下载器`: 主要涉及防爬攻破，方法各异，爬虫的难点也在此
 93 | 
 94 | `页面解析器`: 一般页面在一段时间内是固定的，每个人下载页面后都需要解析出页面内容，属于重复工作
 95 | 
 96 | `数据存储`: 不管是存储到什么文件或数据库，主要看业务需求
 97 | 
 98 | 此项目就是将这项工作抽离出来，让网络爬虫程序重点关注于：网页下载，而不是重复的网页解析
 99 | 
100 | ## 项目说明
101 | 
102 | 此项目可以和python 的requests 和scrapy 配合使用
103 | 
104 | 当然如果要和其他编程语言使用，可以使用flask等网络框架再次对此项目进行封装，提供网络接口即可
105 | 
106 | 发起人：mouday
107 | 
108 | 发起时间：2018-10-13
109 | 
110 | 需要更多的人一起来维护
111 | 
112 | ## 贡献代码
113 | 
114 | 贡献的代码统一放入文件夹：page_parser
115 | 
116 | 代码示例，如没有更好的理由，应该按照下面的格式，便于使用者调用
117 | 
118 | baidu_parser.py
119 | 
120 | ```python
121 | 
122 | # -*- coding: utf-8 -*-
123 | 
124 | # @Date    : 2018-10-13
125 | # @Author  : Peng Shiyu
126 | 
127 | from parsel import Selector
128 | 
129 | 
130 | class BaiduParser(object):
131 |     """
132 |     百度网：https://www.baidu.com/
133 |     """
134 | 
135 |     @staticmethod
136 |     def parse_index(html):
137 |         """
138 |         解析主页：https://www.baidu.com/
139 |         2018-10-13 pengshiyuyx@gmail.com
140 |         :param html: {str} 网页文本
141 |         :return: {iterator} 抽取的内容
142 |         """
143 |         sel = Selector(html)
144 |         title = sel.css("title::text").extract_first()
145 |         item = {
146 |             "title": title
147 |         }
148 |         yield item
149 | 
150 | 
151 | if __name__ == '__main__':
152 |     import requests
153 |     response = requests.get("https://www.baidu.com/")
154 |     response.encoding = response.apparent_encoding
155 |     items = BaiduParser.parse_index(response.text)
156 |     for item in items:
157 |         print(item)
158 | 
159 |     # {'title': '百度一下，你就知道'}
160 | 
161 | ```
162 | ## 说明：
163 | 
164 | ### 原则：
165 | 
166 | 1. 按照网站分类建立解析类
167 | 
168 | 2. 解析方法包含在解析类中，为方便调用，需要使用静态方法
169 | 
170 | 3. 因为网页解析有时效性，所以必须`注明日期`
171 | 
172 | 
173 | ### 命名规则：
174 | 例如:
175 | ```
176 | 文件名：baidu_parser
177 | 类名：BaiduParser
178 | 方法名：parse_index
179 | ```
180 | 
181 | ### 其他
182 | 
183 | 1. 必要的代码注释
184 | 
185 | 2. 必要的测试代码
186 | 
187 | 3. 其他必要的代码
188 | 
189 | 
190 | ## 加入我们
191 | ### 基本要求
192 | 1. python的基本语法 + 面向对象 + 迭代器（yield）
193 | 2. 掌握的库：requests、parsel、scrapy（了解即可）
194 | 3. 解析库统一使用parsel（基于xpath），简单高效，与scrapy无缝衔接
195 | 4. 不太懂也没关系，自己看参考文章，只要愿意学就会，瞬间提升自己
196 | 
197 | 参考文章：
198 | 
199 | 1. [Python编程：class类面向对象](https://blog.csdn.net/mouday/article/details/79002712)
200 | 
201 | 2. [Python编程：生成器yield与yield from区别简单理解](https://blog.csdn.net/mouday/article/details/80760973)
202 | 
203 | 3. [Python爬虫：requests库基本使用](https://blog.csdn.net/mouday/article/details/80087627)
204 | 
205 | 4. [Python网络爬虫之scrapy框架](https://blog.csdn.net/mouday/article/details/79736108)
206 | 
207 | 5. [Python爬虫：xpath常用方法示例](https://blog.csdn.net/mouday/article/details/80364436)
208 | 
209 | 6. [python爬虫：scrapy框架xpath和css选择器语法](https://blog.csdn.net/mouday/article/details/80455560)
210 | 
211 | ### 联系方式
212 | 
213 | PageParser QQ群号: 932301512
214 | 
215 | ![](images/page-parser-min.jpeg)
216 | 


--------------------------------------------------------------------------------
/page_parser/parsers/kr36_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @File    : kr36_parser.py
 4 | @Date    : 2024-02-18
 5 | """
 6 | import json
 7 | import re
 8 | 
 9 | from page_parser.utils.aes_cipher import AESCipher, AES
10 | 
11 | 
def decode_text(data, key="efabccee-b754-4c"):
    """
    Decrypt an encrypted text payload with AES in ECB mode.

    The work is delegated to ``AESCipher.decrypt``.

    :param data: {str} cipher text to decrypt
    :param key: {str} AES key; defaults to the key this parser has always
        used, and can be overridden if a different key is needed
    :return: {str} decrypted text
    """
    ecb_cipher = AESCipher(key, mode=AES.MODE_ECB)
    return ecb_cipher.decrypt(data)
24 | 
25 | 
def parse_detail(content):
    """
    Parse a 36kr article detail page.

    eg: https://36kr.com/p/2652091684060295

    The page embeds its data as ``window.initialState={...}``. The ``state``
    field of that JSON object is encrypted; it is decoded with
    ``decode_text`` and the article data is read from the decoded JSON.

    :param content: {str} HTML text of the article page
    :return: {dict} article data, eg (long values shortened):
    {
      "itemId": 2652091684060295,
      "widgetTitle": "Sora的出现给企业数字化转型的启示",
      "summary": "Sora的出现给企业数字化转型的启示",
      "author": "湘江数评-老杨",
      "authorId": 5803347,
      "authorFace": "https://img.36krcdn.com/...",
      "authorRoute": "detail_author?userId=5803347",
      "publishTime": 1708230930102,
      "widgetContent": "<p>...article body as HTML...</p>",
      "sourceType": "contribution",
      "hasBanEclub": 0,
      "popinImage": "https://img.36krcdn.com/...",
      "userType": 3,
      "companyCertifyNick": "湘江数评-老杨官方企业号"
    }
    :raises ValueError: if ``window.initialState`` is not found in
        ``content`` or the embedded JSON cannot be decoded
    """
    # Raw string: "\." and "\{" are regex escapes, not string escapes.
    ret = re.search(r"window\.initialState=(\{.*\})", content)
    if ret is None:
        raise ValueError("window.initialState not found in page content")

    res = json.loads(ret.group(1))

    raw_state = decode_text(res['state'])

    return json.loads(raw_state)['articleDetail']['articleDetailData']['data']
58 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | PageParser
  2 | ==========
  3 | 
  4 | |Build Status| |GitHub|
  5 | 
  6 | 项目简介
  7 | --------
  8 | 
  9 | 项目名称：六行代码写爬虫
 10 | 
 11 | 英文名称：PageParser
 12 | 
 13 | 项目简介：一个爬虫使用的网页解析包，实现最大限度的代码复用
 14 | 
 15 | 项目目标：不懂网页解析也能写爬虫
 16 | 
 17 | 安装模块
 18 | --------
 19 | 
 20 | ::
 21 | 
 22 |     pip install page-parser
 23 | 
 24 | 最小项目示例：
 25 | 
 26 | .. code:: python
 27 | 
 28 |     import requests
 29 |     from page_parser import BaiduParser
 30 | 
 31 |     # 1、下载网页
 32 |     response = requests.get("https://www.baidu.com/")
 33 |     html = response.content.decode("utf-8")
 34 | 
 35 |     # 2、解析网页
 36 |     items = BaiduParser.parse_index(html)
 37 | 
 38 |     # 3、输出数据
 39 |     for item in items: print(item)
 40 |     # {'title': '百度一下，你就知道'}
 41 | 
 42 | 支持网页
 43 | --------
 44 | 
 45 | +--------+------------+--------------------+-----------------------------------------------------+
 46 | | 序号   | 网站       | 网页名称           | 网页地址                                            |
 47 | +========+============+====================+=====================================================+
 48 | | 1      | 百度       | 主页               | https://www.baidu.com/                              |
 49 | +--------+------------+--------------------+-----------------------------------------------------+
 50 | | 2      | 豆瓣       | 电影 正在热映      | https://movie.douban.com/                           |
 51 | +--------+------------+--------------------+-----------------------------------------------------+
 52 | | 3      | 拉勾       | 招聘职位列表页     | https://www.lagou.com/zhaopin/                      |
 53 | +--------+------------+--------------------+-----------------------------------------------------+
 54 | | 4      | 企查查     | 融资事件页         | https://www.qichacha.com/elib\_financing            |
 55 | +--------+------------+--------------------+-----------------------------------------------------+
 56 | | 5      | 西刺代理   | 主页               | http://www.xicidaili.com/                           |
 57 | +--------+------------+--------------------+-----------------------------------------------------+
 58 | | 6      | 西刺代理   | 国内高匿代理       | http://www.xicidaili.com/nn/                        |
 59 | +--------+------------+--------------------+-----------------------------------------------------+
 60 | | 7      | 西刺代理   | 国内普通代理       | http://www.xicidaili.com/nt/                        |
 61 | +--------+------------+--------------------+-----------------------------------------------------+
 62 | | 8      | 西刺代理   | 国内HTTPS代理      | http://www.xicidaili.com/wn/                        |
 63 | +--------+------------+--------------------+-----------------------------------------------------+
 64 | | 9      | 西刺代理   | 国内HTTP代理       | http://www.xicidaili.com/wt/                        |
 65 | +--------+------------+--------------------+-----------------------------------------------------+
 66 | | 10     | 搜狗搜索   | 微信公众号搜索页   | https://weixin.sogou.com/weixin?type=1&query=百度   |
 67 | +--------+------------+--------------------+-----------------------------------------------------+
 68 | | 11     | 煎蛋网     | 主页列表           | http://jandan.net/                                  |
 69 | +--------+------------+--------------------+-----------------------------------------------------+
 70 | | 12     | 伯乐在线   | python栏目         | http://python.jobbole.com/                          |
 71 | +--------+------------+--------------------+-----------------------------------------------------+
 72 | 
 73 | 使用示例
 74 | --------
 75 | 
 76 | .. code:: python
 77 | 
 78 |     # -*- coding: utf-8 -*-
 79 | 
 80 |     import requests
 81 |     from page_parser import BaiduParser
 82 | 
 83 |     # 1、下载网页
 84 |     url = "https://www.baidu.com/"
 85 |     response = requests.get(url)
 86 |     response.encoding = response.apparent_encoding
 87 | 
 88 |     # 2、解析网页
 89 |     items = BaiduParser.parse_index(response.text)
 90 | 
 91 |     # 3、输出数据
 92 |     for item in items:
 93 |         print(item)
 94 | 
 95 |     # {'title': '百度一下，你就知道'}
 96 | 
 97 | 网络爬虫工作流程：
 98 | ------------------
 99 | 
100 | ::
101 | 
102 |     页面下载器 -> 页面解析器 -> 数据存储
103 | 
104 | ``页面下载器``: 主要涉及防爬攻破，方法各异，爬虫的难点也在此
105 | 
106 | ``页面解析器``:
107 | 一般页面在一段时间内是固定的，每个人下载页面后都需要解析出页面内容，属于重复工作
108 | 
109 | ``数据存储``: 不管是存储到什么文件或数据库，主要看业务需求
110 | 
111 | 此项目就是将这项工作抽离出来，让网络爬虫程序重点关注于：网页下载，而不是重复的网页解析
112 | 
113 | 项目说明
114 | --------
115 | 
116 | 此项目可以和python 的requests 和scrapy 配合使用
117 | 
118 | 当然如果要和其他编程语言使用，可以使用flask等网络框架再次对此项目进行封装，提供网络接口即可
119 | 
120 | 发起人：mouday
121 | 
122 | 发起时间：2018-10-13
123 | 
124 | 需要更多的人一起来维护
125 | 
126 | 贡献代码
127 | --------
128 | 
129 | 贡献的代码统一放入文件夹：page\_parser
130 | 
131 | 代码示例，如没有更好的理由，应该按照下面的格式，便于使用者调用
132 | 
133 | baidu\_parser.py
134 | 
135 | .. code:: python
136 | 
137 | 
138 |     # -*- coding: utf-8 -*-
139 | 
140 |     # @Date    : 2018-10-13
141 |     # @Author  : Peng Shiyu
142 | 
143 |     from parsel import Selector
144 | 
145 | 
146 |     class BaiduParser(object):
147 |         """
148 |         百度网：https://www.baidu.com/
149 |         """
150 | 
151 |         @staticmethod
152 |         def parse_index(html):
153 |             """
154 |             解析主页：https://www.baidu.com/
155 |             2018-10-13 pengshiyuyx@gmail.com
156 |             :param html: {str} 网页文本
157 |             :return: {iterator} 抽取的内容
158 |             """
159 |             sel = Selector(html)
160 |             title = sel.css("title::text").extract_first()
161 |             item = {
162 |                 "title": title
163 |             }
164 |             yield item
165 | 
166 | 
167 |     if __name__ == '__main__':
168 |         import requests
169 |         response = requests.get("https://www.baidu.com/")
170 |         response.encoding = response.apparent_encoding
171 |         items = BaiduParser.parse_index(response.text)
172 |         for item in items:
173 |             print(item)
174 | 
175 |         # {'title': '百度一下，你就知道'}
176 | 
177 | 说明：
178 | ------
179 | 
180 | 原则：
181 | ~~~~~~
182 | 
183 | 1. 按照网站分类建立解析类
184 | 
185 | 2. 解析方法包含在解析类中，为方便调用，需要使用静态方法
186 | 
187 | 3. 因为网页解析有时效性，所以必须\ ``注明日期``
188 | 
189 | 命名规则：
190 | ~~~~~~~~~~
191 | 
192 | 例如:
193 | 
194 | ::
195 | 
196 |     文件名：baidu_parser
197 |     类名：BaiduParser
198 |     方法名：parse_index
199 | 
200 | 其他
201 | ~~~~
202 | 
203 | 1. 必要的代码注释
204 | 
205 | 2. 必要的测试代码
206 | 
207 | 3. 其他必要的代码
208 | 
209 | 加入我们
210 | --------
211 | 
212 | 基本要求
213 | ~~~~~~~~
214 | 
215 | 1. python的基本语法 + 面向对象 + 迭代器（yield）
216 | 2. 掌握的库：requests、parsel、scrapy（了解即可）
217 | 3. 解析库统一使用parsel（基于xpath），简单高效，与scrapy无缝衔接
218 | 4. 不太懂也没关系，自己看参考文章，只要愿意学就会，瞬间提升自己
219 | 
220 | 参考文章：
221 | 
222 | 1. `Python编程：class类面向对象 <https://blog.csdn.net/mouday/article/details/79002712>`__
223 | 
224 | 2. `Python编程：生成器yield与yield
225 |    from区别简单理解 <https://blog.csdn.net/mouday/article/details/80760973>`__
226 | 
227 | 3. `Python爬虫：requests库基本使用 <https://blog.csdn.net/mouday/article/details/80087627>`__
228 | 
229 | 4. `Python网络爬虫之scrapy框架 <https://blog.csdn.net/mouday/article/details/79736108>`__
230 | 
231 | 5. `Python爬虫：xpath常用方法示例 <https://blog.csdn.net/mouday/article/details/80364436>`__
232 | 
233 | 6. `python爬虫：scrapy框架xpath和css选择器语法 <https://blog.csdn.net/mouday/article/details/80455560>`__
234 | 
235 | 联系方式
236 | ~~~~~~~~
237 | 
238 | PageParser QQ群号: 932301512
239 | 
240 | .. figure:: images/page-parser-min.jpeg
241 |    :alt: 
242 | 
243 | .. |Build Status| image:: https://travis-ci.org/mouday/PageParser.svg?branch=master
244 |    :target: https://travis-ci.org/mouday/PageParser
245 | .. |GitHub| image:: https://img.shields.io/github/license/mashape/apistatus.svg
246 | 


--------------------------------------------------------------------------------