├── README.md
├── .gitignore
├── spider.py
└── spider2.py
/README.md:
--------------------------------------------------------------------------------
1 | # ScrapeStatic1
2 |
3 | Spider for https://ssr1.scrape.center/
4 |
5 | ## PyQuery + MongoDB + multiprocessing version
6 |
7 | See [spider.py](spider.py)
8 |
9 | ## Regular expressions + text files + multiprocessing version
10 |
11 | See [spider2.py](spider2.py)
12 |
--------------------------------------------------------------------------------
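Both spider versions extract the same per-movie fields (see `parse_detail` in spider.py). A minimal sketch of one extracted record; the field names come from the code, while the sample values below are placeholders rather than real site data:

```python
# Shape of a single record returned by parse_detail in spider.py.
# Field names match the code; the values are illustrative placeholders only.
record = {
    'cover': 'https://example.com/poster.jpg',  # src of the img.cover element
    'name': 'Example Movie',                    # title text from the detail page
    'categories': ['Drama', 'Action'],          # one entry per category button
    'published_at': '1994-09-14',               # 'YYYY-MM-DD' matched by regex, or None
    'drama': 'A short plot synopsis.',          # text of the .drama paragraph
    'score': 9.5,                               # float score, or None if missing
}
```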
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # pipenv
90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
93 | # install all needed dependencies.
94 | #Pipfile.lock
95 |
96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
97 | __pypackages__/
98 |
99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
133 | /.idea
134 | /.vscode
135 | /results
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import logging
3 | import re
4 | import pymongo
5 | from pyquery import PyQuery as pq
6 | from urllib.parse import urljoin
7 | import multiprocessing
8 |
9 |
10 | logging.basicConfig(level=logging.INFO,
11 | format='%(asctime)s - %(levelname)s: %(message)s')
12 |
13 | BASE_URL = 'https://ssr1.scrape.center'
14 | TOTAL_PAGE = 10
15 | MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
16 | MONGO_DB_NAME = 'movies'
17 | MONGO_COLLECTION_NAME = 'movies'
18 |
19 | client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
20 | db = client[MONGO_DB_NAME]
21 | collection = db[MONGO_COLLECTION_NAME]
22 |
23 |
24 | def scrape_page(url):
25 | """
26 | scrape page by url and return its html
27 | :param url: page url
28 | :return: html of page
29 | """
30 | logging.info('scraping %s...', url)
31 | try:
32 | response = requests.get(url)
33 | if response.status_code == 200:
34 | return response.text
35 | logging.error('get invalid status code %s while scraping %s', response.status_code, url)
36 | except requests.RequestException:
37 | logging.error('error occurred while scraping %s', url, exc_info=True)
38 |
39 |
40 | def scrape_index(page):
41 | """
42 | scrape index page and return its html
43 |     :param page: page number of the index page
44 | :return: html of index page
45 | """
46 | index_url = f'{BASE_URL}/page/{page}'
47 | return scrape_page(index_url)
48 |
49 |
50 | def parse_index(html):
51 | """
52 | parse index page
53 | :param html: html of index page
54 | :return: generator of detail page url
55 | """
56 | doc = pq(html)
57 | links = doc('.el-card .name')
58 | for link in links.items():
59 | href = link.attr('href')
60 | detail_url = urljoin(BASE_URL, href)
61 | logging.info('get detail url %s', detail_url)
62 | yield detail_url
63 |
64 |
65 | def scrape_detail(url):
66 | """
67 | scrape detail page and return its html
68 |     :param url: url of detail page
69 | :return: html of detail page
70 | """
71 | return scrape_page(url)
72 |
73 |
74 | def parse_detail(html):
75 | """
76 | parse detail page
77 | :param html: html of detail page
78 | :return: data
79 | """
80 | doc = pq(html)
81 | cover = doc('img.cover').attr('src')
82 | name = doc('a > h2').text()
83 | categories = [item.text() for item in doc('.categories button span').items()]
84 | published_at = doc('.info:contains(上映)').text()
85 |     published_at = re.search(r'(\d{4}-\d{2}-\d{2})', published_at).group(1) \
86 |         if published_at and re.search(r'\d{4}-\d{2}-\d{2}', published_at) else None
87 | drama = doc('.drama p').text()
88 | score = doc('p.score').text()
89 | score = float(score) if score else None
90 | return {
91 | 'cover': cover,
92 | 'name': name,
93 | 'categories': categories,
94 | 'published_at': published_at,
95 | 'drama': drama,
96 | 'score': score
97 | }
98 |
99 |
100 | def save_data(data):
101 | """
102 | save to mongodb
103 | :param data:
104 | :return:
105 | """
106 | collection.update_one({
107 | 'name': data.get('name')
108 | }, {
109 | '$set': data
110 | }, upsert=True)
111 |
112 |
113 | def main(page):
114 | """
115 | main process
116 | :return:
117 | """
118 | index_html = scrape_index(page)
119 | detail_urls = parse_index(index_html)
120 | for detail_url in detail_urls:
121 | detail_html = scrape_detail(detail_url)
122 | data = parse_detail(detail_html)
123 | logging.info('get detail data %s', data)
124 | logging.info('saving data to mongodb')
125 | save_data(data)
126 | logging.info('data saved successfully')
127 |
128 |
129 | if __name__ == '__main__':
130 | pool = multiprocessing.Pool()
131 | pages = range(1, TOTAL_PAGE + 1)
132 | pool.map(main, pages)
133 |     pool.close()
134 |     pool.join()
135 |
--------------------------------------------------------------------------------
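spider.py upserts each movie into MongoDB keyed by its name. A minimal sketch for inspecting what was stored, assuming a local MongoDB instance at the connection string used in spider.py and that the spider has already been run:

```python
# Quick check of the data written by spider.py.
# Connection string, database and collection names match the constants in spider.py.
import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client['movies']['movies']

print(collection.count_documents({}))      # number of movies stored so far
for doc in collection.find().limit(3):     # sample a few stored records
    print(doc['name'], doc.get('score'))
```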
/spider2.py:
--------------------------------------------------------------------------------
1 | import json
2 | from os import makedirs
3 | from os.path import exists
4 | import requests
5 | import logging
6 | import re
7 | from urllib.parse import urljoin
8 | import multiprocessing
9 |
10 | logging.basicConfig(level=logging.INFO,
11 | format='%(asctime)s - %(levelname)s: %(message)s')
12 |
13 | BASE_URL = 'https://ssr1.scrape.center'
14 | TOTAL_PAGE = 10
15 |
16 | RESULTS_DIR = 'results'
17 | exists(RESULTS_DIR) or makedirs(RESULTS_DIR)
18 |
19 |
20 | def scrape_page(url):
21 | """
22 | scrape page by url and return its html
23 | :param url: page url
24 | :return: html of page
25 | """
26 | logging.info('scraping %s...', url)
27 | try:
28 | response = requests.get(url)
29 | if response.status_code == 200:
30 | return response.text
31 | logging.error('get invalid status code %s while scraping %s',
32 | response.status_code, url)
33 | except requests.RequestException:
34 | logging.error('error occurred while scraping %s', url, exc_info=True)
35 |
36 |
37 | def scrape_index(page):
38 | """
39 | scrape index page and return its html
40 |     :param page: page number of the index page
41 | :return: html of index page
42 | """
43 | index_url = f'{BASE_URL}/page/{page}'
44 | return scrape_page(index_url)
45 |
46 |
47 | def parse_index(html):
48 | """
49 | parse index page and return detail url
50 | :param html: html of index page
51 | """
52 | pattern = re.compile('