├── README.md
├── .gitignore
├── spider.py
└── spider2.py

/README.md:
--------------------------------------------------------------------------------
# ScrapeStatic1

Spider for https://ssr1.scrape.center/

## PyQuery + MongoDB + multiprocessing version

See [spider.py](spider.py)

## Regular expressions + text files + multiprocessing version

See [spider2.py](spider2.py)
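## Quick start

A minimal, single-page smoke test of both spiders (sketch only; it assumes the dependencies `requests`, `pyquery` and `pymongo` are installed and, for `spider.py`, that a MongoDB instance is reachable at the hard-coded `mongodb://localhost:27017`):

```python
# Sketch: scrape index page 1 with each spider, without the process pool.
import spider    # PyQuery parsing; results are upserted into the "movies" collection
import spider2   # regex parsing; results are written to results/<movie name>.json

spider.main(1)   # scrape index page 1 and every detail page it links to
spider2.main(1)
```

The full crawl over all `TOTAL_PAGE` index pages is started with `python spider.py` or `python spider2.py`, which spread the pages over a `multiprocessing.Pool`.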
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

/.idea
/.vscode
/results
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
import requests
import logging
import re
import pymongo
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import multiprocessing


logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

BASE_URL = 'https://ssr1.scrape.center'
TOTAL_PAGE = 10
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'
MONGO_COLLECTION_NAME = 'movies'

client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]


def scrape_page(url):
    """
    scrape page by url and return its html
    :param url: page url
    :return: html of page
    """
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error('get invalid status code %s while scraping %s',
                      response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    """
    scrape index page and return its html
    :param page: page number of index page
    :return: html of index page
    """
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def parse_index(html):
    """
    parse index page
    :param html: html of index page
    :return: generator of detail page urls
    """
    doc = pq(html)
    links = doc('.el-card .name')
    for link in links.items():
        href = link.attr('href')
        detail_url = urljoin(BASE_URL, href)
        logging.info('get detail url %s', detail_url)
        yield detail_url


def scrape_detail(url):
    """
    scrape detail page and return its html
    :param url: url of detail page
    :return: html of detail page
    """
    return scrape_page(url)


def parse_detail(html):
    """
    parse detail page
    :param html: html of detail page
    :return: data
    """
    doc = pq(html)
    cover = doc('img.cover').attr('src')
    name = doc('a > h2').text()
    categories = [item.text() for item in doc('.categories button span').items()]
    published_at = doc('.info:contains(上映)').text()
    published_at = re.search(r'(\d{4}-\d{2}-\d{2})', published_at).group(1) \
        if published_at and re.search(r'\d{4}-\d{2}-\d{2}', published_at) else None
    drama = doc('.drama p').text()
    score = doc('p.score').text()
    score = float(score) if score else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }


def save_data(data):
    """
    save data to mongodb, keyed by movie name
    :param data: movie data dict
    :return:
    """
    collection.update_one({
        'name': data.get('name')
    }, {
        '$set': data
    }, upsert=True)


def main(page):
    """
    main process: scrape one index page and all of its detail pages
    :param page: page number of index page
    :return:
    """
    index_html = scrape_index(page)
    detail_urls = parse_index(index_html)
    for detail_url in detail_urls:
        detail_html = scrape_detail(detail_url)
        data = parse_detail(detail_html)
        logging.info('get detail data %s', data)
        logging.info('saving data to mongodb')
        save_data(data)
        logging.info('data saved successfully')


if __name__ == '__main__':
    pool = multiprocessing.Pool()
    pages = range(1, TOTAL_PAGE + 1)
    pool.map(main, pages)
    pool.close()
    pool.join()
--------------------------------------------------------------------------------
/spider2.py:
--------------------------------------------------------------------------------
import json
from os import makedirs
from os.path import exists
import requests
import logging
import re
from urllib.parse import urljoin
import multiprocessing

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

BASE_URL = 'https://ssr1.scrape.center'
TOTAL_PAGE = 10

RESULTS_DIR = 'results'
exists(RESULTS_DIR) or makedirs(RESULTS_DIR)


def scrape_page(url):
    """
    scrape page by url and return its html
    :param url: page url
    :return: html of page
    """
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error('get invalid status code %s while scraping %s',
                      response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    """
    scrape index page and return its html
    :param page: page number of index page
    :return: html of index page
    """
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def parse_index(html):
    """
    parse index page and yield detail urls
    :param html: html of index page
    :return: generator of detail page urls
    """
    # every movie card links to its detail page via <a href="/detail/..." class="name">
    pattern = re.compile('<a.*?href="(.*?)".*?class="name">')
    items = re.findall(pattern, html)
    if not items:
        return
    for item in items:
        detail_url = urljoin(BASE_URL, item)
        logging.info('get detail url %s', detail_url)
        yield detail_url


def scrape_detail(url):
    """
    scrape detail page and return its html
    :param url: url of detail page
    :return: html of detail page
    """
    return scrape_page(url)


def parse_detail(html):
    """
    parse detail page
    :param html: html of detail page
    :return: data
    """
    # each pattern captures one field from the server-rendered detail page
    cover_pattern = re.compile(
        'class="item.*?<img.*?src="(.*?)".*?class="cover">', re.S)
    name_pattern = re.compile('<h2.*?class="m-b-sm">(.*?)</h2>')
    categories_pattern = re.compile(
        '<button.*?category.*?<span>(.*?)</span>.*?</button>', re.S)
    published_at_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s?上映')
    drama_pattern = re.compile('<div.*?drama.*?>.*?<p.*?>(.*?)</p>', re.S)
    score_pattern = re.compile('<p.*?score.*?>(.*?)</p>', re.S)

    cover = re.search(cover_pattern, html).group(
        1).strip() if re.search(cover_pattern, html) else None
    name = re.search(name_pattern, html).group(
        1).strip() if re.search(name_pattern, html) else None
    categories = re.findall(categories_pattern, html) if re.findall(
        categories_pattern, html) else []
    published_at = re.search(published_at_pattern, html).group(
        1) if re.search(published_at_pattern, html) else None
    drama = re.search(drama_pattern, html).group(
        1).strip() if re.search(drama_pattern, html) else None
    score = float(re.search(score_pattern, html).group(1).strip()
                  ) if re.search(score_pattern, html) else None

    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }


def save_data(data):
    """
    save data to a json file named after the movie
    :param data: movie data dict
    :return:
    """
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main(page):
    """
    main process: scrape one index page and all of its detail pages
    :param page: page number of index page
    :return:
    """
    index_html = scrape_index(page)
    detail_urls = parse_index(index_html)
    for detail_url in detail_urls:
        detail_html = scrape_detail(detail_url)
        data = parse_detail(detail_html)
        logging.info('get detail data %s', data)
        logging.info('saving data to json file')
        save_data(data)
        logging.info('data saved successfully')


if __name__ == '__main__':
    pool = multiprocessing.Pool()
    pages = range(1, TOTAL_PAGE + 1)
    pool.map(main, pages)
    pool.close()
    pool.join()
--------------------------------------------------------------------------------