├── .gitattributes ├── config.json ├── img ├── 01.png ├── 02.png ├── 03.png └── 04.png ├── requirements.txt ├── .travis.yml ├── luogu ├── __init__.py ├── LuoguException.py ├── IO.py ├── LuoguUser.py └── LuoguBrowser.py ├── .github └── ISSUE_TEMPLATE │ └── bug-report.md ├── example_browser.py ├── unit_tests.py ├── LICENSE ├── README.md ├── .gitignore ├── example_codedownload.py ├── example_userInfoCrawler.py └── user_info_download.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py filter=user_info_download -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "uid": 10086, 3 | "cookie": 10086 4 | } 5 | -------------------------------------------------------------------------------- /img/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/himself65/LuoguCrawler/HEAD/img/01.png -------------------------------------------------------------------------------- /img/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/himself65/LuoguCrawler/HEAD/img/02.png -------------------------------------------------------------------------------- /img/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/himself65/LuoguCrawler/HEAD/img/03.png -------------------------------------------------------------------------------- /img/04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/himself65/LuoguCrawler/HEAD/img/04.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.0 2 | openpyxl==2.5.1 3 | urllib3==1.22 4 | PyMySQL==0.8.0 5 | mysqlclient==1.3.12 -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | install: 5 | pip install -r requirements.txt 6 | script: 7 | - python unit_tests.py -------------------------------------------------------------------------------- /luogu/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .LuoguBrowser import LuoguBrowser 4 | from .LuoguUser import LuoguUser 5 | 6 | from .LuoguException import * 7 | from .IO import * -------------------------------------------------------------------------------- /luogu/LuoguException.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*- coding:utf-8 -*- 3 | """ 4 | 洛谷浏览器的异常处理部分 5 | """ 6 | 7 | from __future__ import absolute_import 8 | 9 | 10 | class BrowserException: 11 | pass -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | A clear and concise description of what the bug is. 9 | 10 | **Expected behavior** 11 | A clear and concise description of what you expected to happen. 12 | 13 | **Screenshots** 14 | If applicable, add screenshots to help explain your problem. 15 | 16 | **Other** 17 | 18 | - OS: [e.g. Windows10, Windows7, Macos] 19 | - Version [e.g. Python3.6] 20 | 21 | **Additional context** 22 | Add any other context about the problem here. 23 | -------------------------------------------------------------------------------- /luogu/IO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*- coding:utf-8 -*- 3 | """ 4 | 读写文件部分 5 | """ 6 | from __future__ import absolute_import 7 | 8 | import os 9 | import json 10 | 11 | 12 | def saveToFile(fileLocation, content): 13 | """ 14 | 保存内容到指定目录 15 | """ 16 | if fileLocation == None: 17 | raise AttributeError("fileLocation not found") 18 | f = open(fileLocation, mode='w') 19 | f.write(content) 20 | f.close() 21 | 22 | 23 | def getJson(fileLocation='config.json'): 24 | """ 25 | 获取Json文件内容,并转义到字典 26 | """ 27 | datas = open(fileLocation, mode='r') 28 | datas = json.loads(datas.read()) 29 | return datas -------------------------------------------------------------------------------- /example_browser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from luogu import * 4 | 5 | import ssl 6 | import json 7 | 8 | 9 | def getCookie(name="cookie.json"): 10 | datas = open(name) 11 | datas = json.loads(datas.read()) 12 | return datas['cookie'] 13 | 14 | 15 | defaultURL = "https://www.luogu.org" 16 | 17 | userNumber = [] 18 | userUrl = [] 19 | 20 | # MARK -- 参考答案:https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error 21 | ssl._create_default_https_context = ssl._create_unverified_context 22 | 23 | browser = LuoguBrowser() 24 | cookie = getCookie() 25 | browser.insert_headers('cookie', cookie) 26 | browser.openURL(defaultURL) 27 | data = browser.getData() 28 | print(data) -------------------------------------------------------------------------------- /luogu/LuoguUser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from __future__ import absolute_import 4 | import MySQLdb 5 | 6 | 7 | class LuoguUser(object): 8 | """ 9 | LuoguUser 10 | """ 11 | 12 | def __init__(self, name: str, contribution: int, active: int, 13 | integral: int, ac_num: int, submit_num: int, *args, **kwargs): 14 | """ 15 | Args: 16 | name: User's name 17 | contribution: emmmmm 18 | active: emmmmm 19 | integral: emmmmm 20 | ac_num: User's accepted topics number in all 21 | submit_num: User's submit topics number in all 22 | """ 23 | this.name = name 24 | this.contribution = contribution 25 | this.active = active 26 | this.integral = integral 27 | this.ac_num = ac_num 28 | this.submit_num = submit_num 29 | 30 | for key, value in kwargs.items(): 31 | setattr(self, key, value) -------------------------------------------------------------------------------- /unit_tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | 单元测试部分 3 | """ 4 | from luogu import LuoguBrowser, IO 5 | 6 | import ssl 7 | 8 | defaultURL = "https://www.luogu.org" 9 | 10 | 11 | def BrowserDefaultTest(): 12 | """ 13 | 单元测试部分 14 | 测试内容:LuoguBrowser是否可以正常使用 15 | """ 16 | # MARK -- 参考答案:https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error 17 | ssl._create_default_https_context = ssl._create_unverified_context 18 | # Init browser 19 | browser = LuoguBrowser() 20 | ## View Web 21 | browser.openURL(defaultURL) 22 | ## getData 23 | data = browser.getData() 24 | data = LuoguBrowser.ungzip(data) 25 | print(data) 26 | 27 | 28 | def OITest(): 29 | """ 30 | OI部分测试 31 | """ 32 | json = IO.getJson() 33 | print(json) 34 | print(json['cookie']) 35 | 36 | 37 | def main(): 38 | print('BrowserDefaultTest is Running...') 39 | BrowserDefaultTest() 40 | print('BrowserDefaultTest Sucess!') 41 | 42 | print('OITest is Running...') 43 | OITest() 44 | print('OITest Sucess') 45 | 46 | 47 | if __name__: 48 | main() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 扩散性百万甜面包 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LuoguCrawler | 洛谷爬虫 2 | 3 | [![Build Status](https://travis-ci.org/Himself65/LuoguCrawler.svg?branch=master)](https://travis-ci.org/Himself65/LuoguCrawler) [![LICENSE](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) ![language](https://img.shields.io/badge/language-python3-blue.svg) 4 | 5 | 【停止更新】lin_toto 明确禁止了洛谷的脚本使用 6 | 7 | ## Quick Start 8 | 9 | 注意:luogu/ 文件夹为必备内容,**请不要直接拷贝 example\*.py 代码直接运行** 10 | 11 | 1. 下载至本地,并解压 12 | 13 | 2. 运行相关脚本 14 | 15 | ```bash 16 | cd LuoguCrawler 17 | 18 | py example_download.py 19 | ``` 20 | 21 | ## Scripts 22 | 23 | - 下载你的所有 AC 代码 24 | 25 | 效果图: 26 | 27 | 快速下载你所有的 AC 代码 28 | 29 | ![04](/img/04.png) 30 | 31 | [脚本地址](/example_codedownload.py) 32 | 33 | - 洛谷用户爬取 34 | 35 | 多线程爬下用户信息 36 | 37 | ![03](/img/03.png) 38 | 39 | [脚本地址](/example_userInfoCrawler.py) 40 | 41 | - 其他脚本截图 42 | 43 | ![01](/img/01.png) 44 | 45 | ![02](/img/02.png) 46 | 47 | ## Config 48 | 49 | - 修改 config.json 内容 50 | 51 | - 某些特定脚本或许有需要配置的地方(将会修复) 52 | 53 | ```python 54 | myUrl = "https://www.luogu.org/space/show?uid=72813" 55 | myID = 72813 56 | ``` 57 | 58 | ## TODO 59 | 60 | 详见[TODO.md](TODO.md) 61 | 62 | ### Credits 63 | 64 | 感谢洛谷开发组提供的灵感,也感谢使用我 Repo 的各位 65 | 66 | ## LICENSE 67 | 68 | LowguNotice is available under the MIT license. See the [LICENSE](LICENSE) file for more information. 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # local 104 | .vscode 105 | cookie.json 106 | test.json 107 | 108 | temp.* 109 | *.xlsx 110 | download/ 111 | 112 | .DS_Store 113 | */.DS_Store 114 | */*/.DS_Store 115 | .test 116 | *.icloud 117 | *.db 118 | 119 | himself65 120 | .idea -------------------------------------------------------------------------------- /example_codedownload.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from luogu import * 3 | from bs4 import BeautifulSoup 4 | 5 | import asyncio 6 | import os 7 | import ssl 8 | 9 | cookie = 'UM_distinctid=1628f18d8fa568-0c77f61d2d6685-336c7b05-100200-1628f18d8fb74e; __client_id=4481c1bb80e250e3e1b43eb658c1f4882b4c98a5; CNZZDATA5476811=cnzz_eid%3D657620256-1522818985-%26ntime%3D1528115158' 10 | ID = 72813 # 这里写你的id 11 | mainUrl = 'https://www.luogu.org' 12 | pageUrl = 'https://www.luogu.org/recordnew/lists?uid=' + str(ID) + '&page=' 13 | downloadPath = 'download/' 14 | codePath = downloadPath + 'code/' 15 | 16 | DEBUG = True 17 | 18 | # browser 19 | browser = LuoguBrowser() 20 | browser.insert_headers('cookie', cookie) 21 | 22 | 23 | def getPageUrl(pageNum): 24 | return pageUrl + str(pageNum) 25 | 26 | 27 | def saveLocal(fileName, text): 28 | path = codePath + fileName + '.cpp' 29 | if os.path.exists(path): 30 | return 31 | f = open(path, mode='w') 32 | f.write(text) 33 | f.close() 34 | 35 | 36 | def downloadCode(url): 37 | browser.openURL(url) 38 | data = browser.getData() 39 | html = browser.ungzip(data).decode() 40 | soup = BeautifulSoup(html, 'html.parser') 41 | try: 42 | text = soup.find('code').get_text() 43 | name = soup.find('h1').get_text() 44 | saveLocal(name, text) 45 | print('下载完成:%s' % url) 46 | return True 47 | except AttributeError: 48 | print('下载异常:%s' % url) 49 | return False 50 | 51 | 52 | def searchPage(start, end): 53 | """ [start, end) 54 | """ 55 | count = 0 56 | for i in range(start, end): 57 | if DEBUG: 58 | print("现在是第%d页" % i) 59 | url = getPageUrl(i) 60 | browser.openURL(url) 61 | data = browser.getData() 62 | html = browser.ungzip(data).decode() 63 | soup = BeautifulSoup(html, 'html.parser') 64 | items = soup.find('div', { 65 | 'class': 'lg-content-table-left' 66 | }).find_all('div', {'class': 'am-g lg-table-bg0 lg-table-row'}) 67 | for item in items: 68 | point = item.find('strong', {'class': 'lg-fg-green'}) 69 | if point is None: 70 | continue 71 | acurl = item.find_all('a', { 72 | 'target': '_blank', 73 | 'data-pjax': '' 74 | })[0]['href'] 75 | import re 76 | if re.search(acurl, '/record/show?rid=*'): 77 | if DEBUG: 78 | print(acurl) 79 | continue 80 | if downloadCode(mainUrl + acurl): 81 | count += 1 82 | print('代码共', count) 83 | 84 | 85 | def main(): 86 | page = 1 # start Page 87 | url = getPageUrl(page) 88 | browser.openURL(url) 89 | data = browser.getData() 90 | html = browser.ungzip(data).decode() 91 | soup = BeautifulSoup(html, 'html.parser') 92 | items = soup.find('ul', { 93 | 'class': 'am-pagination am-pagination-centered' 94 | }).find_all('li') 95 | lastestItem = items[-1] 96 | maxPage = lastestItem.find('a')['data-ci-pagination-page'] 97 | # 找到最大页码 98 | print('最大页数', maxPage) 99 | searchPage(1, int(maxPage) + 1) 100 | 101 | 102 | def init(): 103 | print('初始化中') 104 | if not os.path.exists(downloadPath): 105 | print('正在创建文件夹download...') 106 | os.makedirs(downloadPath) 107 | print('done...') 108 | if not os.path.exists(codePath): 109 | print('正在创建task文件') 110 | os.makedirs(codePath) 111 | print('done...') 112 | print('初始化完成') 113 | 114 | 115 | if __name__: 116 | ssl._create_default_https_context = ssl._create_unverified_context 117 | init() 118 | main() -------------------------------------------------------------------------------- /luogu/LuoguBrowser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | #-*- coding:utf-8 -*- 3 | """ 4 | 简单的爬虫项目来访问洛谷官网 5 | """ 6 | from __future__ import absolute_import 7 | from http import cookiejar 8 | from urllib import parse, request 9 | from bs4 import BeautifulStoneSoup 10 | import json 11 | import socket 12 | import gzip 13 | 14 | __author__ = "Himself65" 15 | __license__ = "MIT" 16 | 17 | defaultURL = "https://www.luogu.org" 18 | 19 | from random import sample 20 | user_agents = [ 21 | 'Opera/9.25 (Windows NT 5.1; U; en)', 22 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 23 | 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', 24 | 'Mozilla/5.0 (X11; U; linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12', 25 | 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9' 26 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 27 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 28 | ] 29 | 30 | 31 | def get_agent(): 32 | """ 33 | 每次随机返回一个user-agent 34 | """ 35 | return sample(user_agents, 1)[0] 36 | 37 | 38 | class LuoguBrowser(object): 39 | """ 40 | """ 41 | _headers = {} 42 | 43 | def __init__(self): 44 | """ 45 | 初始化访问洛谷 46 | """ 47 | self._headers['user-agent'] = get_agent() 48 | self._headers['accept-encoding'] = "gzip, deflate, br" 49 | self._headers['accept-language'] = "zh,en;q=0.9,zh-CN;q=0.8,ja;q=0.7" 50 | self.setOpener() 51 | 52 | def insert_headers(self, key, value): 53 | """ 54 | 插入值到请求头 55 | 每次插入后会自动setOpener 56 | """ 57 | self._headers[key] = value 58 | self.setOpener() 59 | 60 | def setOpener(self): 61 | """ 初始化opener 62 | """ 63 | cj = cookiejar.CookieJar() 64 | pro = request.HTTPCookieProcessor(cj) 65 | self.opener = request.build_opener(pro) 66 | header = [] 67 | for key, value in self._headers.items(): 68 | elem = (key, value) 69 | header.append(elem) 70 | self.opener.addheaders = header 71 | 72 | def openURL(self, url, data=None, timeout=None): 73 | """ 74 | 访问地址 75 | """ 76 | import socket 77 | if timeout is None: 78 | timeout = socket._GLOBAL_DEFAULT_TIMEOUT 79 | if url is None: 80 | raise AttributeError('url is none') 81 | # url = url.encode('UTF8', errors='strict') 82 | self.response = self.opener.open(url, data=data, timeout=timeout) 83 | 84 | def getData(self): 85 | """ 86 | 获取response内容 87 | """ 88 | return self.response.read() 89 | 90 | def getResponse(self): 91 | """ 92 | 获取response 93 | """ 94 | return self.response 95 | 96 | @staticmethod 97 | def create_query_string_message(dictionary): 98 | """ 99 | 创建请求地址 100 | Args: 101 | dictionary -> 字典,例如: 102 | 103 | Returns -> String 104 | 例如: 105 | { 106 | "id": 761282619, 107 | "name": "himself65" 108 | } 109 | Return -> "id=761282619&name=himself65" 110 | """ 111 | s = "" 112 | for (key, value) in dictionary.items(): 113 | s = s + str(key) + "=" + str(value) + "&" 114 | return s.rstrip("&") 115 | 116 | @staticmethod 117 | def getDataFromResponse(response, data='more'): 118 | """ 119 | 获取response请求中特定内容 120 | Args: 121 | response -> Response 122 | data -> String, 需要获得的数据,默认为'more' 123 | Returns -> String 124 | """ 125 | messages = json.loads(response) 126 | return messages[data] 127 | 128 | @staticmethod 129 | def check_Accessible(data, name='code', accessStatus=200): 130 | """ 131 | 检查状态值是否成功 132 | Args: 133 | data -> Dictionary, 为Response返回的请求 134 | Returns -> Bool 135 | 136 | """ 137 | data_json = json.loads(data) 138 | return data_json[name] == accessStatus if True else False 139 | 140 | @staticmethod 141 | def ungzip(data): 142 | """ 143 | ungzip the data 144 | """ 145 | try: 146 | ungzipData = gzip.decompress(data) 147 | except: 148 | print("解压失败,返回原数据") 149 | return data 150 | return ungzipData 151 | -------------------------------------------------------------------------------- /example_userInfoCrawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 注意: 5 | 1. 服务端上需自行配置代理 6 | 2. start_num end_num 为需手动填写 7 | """ 8 | from luogu import * 9 | from openpyxl import Workbook 10 | from openpyxl import load_workbook 11 | from bs4 import BeautifulSoup 12 | from urllib import request, error 13 | from queue import Queue 14 | 15 | import time 16 | import queue 17 | import os 18 | import ssl 19 | import json 20 | import threading 21 | import linecache 22 | 23 | # 必填内容 24 | start_num = 1 25 | end_num = 1000 26 | 27 | # 洛谷网站 28 | defaultURL = "https://www.luogu.org" 29 | userURL = "https://www.luogu.org/space/show?uid=" 30 | # 此处不建议修改 31 | title = ['id', '名字', '头像', '总提交数', 'AC数', '贡献', '活跃', '积分', '用户类型', '注册时间'] 32 | wbName = 'luogu2.xlsx' 33 | wsName = '1' 34 | downloadPath = 'download/' 35 | imagePath = downloadPath + 'img/' 36 | taskPath = downloadPath + 'task/' 37 | 38 | 39 | def download_img(url, userName): 40 | """ 下载图片到download/文件夹下 41 | """ 42 | loc = imagePath + userName + '.png' 43 | if os.path.exists(loc): 44 | return 45 | try: 46 | # 下载图片 47 | request.urlretrieve(url, filename=loc) 48 | except: 49 | print("\n无法下载文件") 50 | 51 | 52 | def crawler(taskque, que): 53 | """ get task from taskque 54 | """ 55 | try: 56 | # Init browser 57 | browser = LuoguBrowser() 58 | browser.openURL(defaultURL) 59 | except Exception as e: 60 | print("无法创建") 61 | print(e) 62 | return 63 | while True: 64 | try: 65 | i = taskque.get(block=True, timeout=1) 66 | except queue.Empty: 67 | print('无更多任务') 68 | print('请等待结束') 69 | return 70 | try: 71 | # Get messageURL 72 | messageURL = userURL + str(i) 73 | ## View Web 74 | browser.openURL(messageURL) 75 | ## getData 76 | html = browser.getData() 77 | html = LuoguBrowser.ungzip(html) 78 | soup = BeautifulSoup(html, 'html.parser') 79 | # print(soup) 80 | board = soup.find( 81 | 'ul', {'class': 'am-list am-list-static lg-summary-list'}) 82 | items = board.find_all("li") 83 | # 0 84 | userName = soup.find('span', {'name': 'username'}).get_text() 85 | avatar = items[0].find('img')['src'] 86 | # 1 87 | allPost = items[1].find_all('span', {'class': 'lg-bignum-num'}) 88 | Num = allPost[0].get_text() 89 | ACNum = allPost[1].get_text() 90 | # 2 91 | Acts = items[4].find('span', {'class': 'lg-right'}).get_text() 92 | Acts = Acts.split('/') 93 | contribute = Acts[0] 94 | active = Acts[1] 95 | integral = Acts[2] 96 | # 3 97 | Type = items[5].find('span', {'class': 'lg-right'}).get_text() 98 | # 4 99 | registeredTime = items[6].find('span', { 100 | 'class': 'lg-right' 101 | }).get_text() 102 | # make t 103 | t = [ 104 | i, userName, avatar, Num, ACNum, contribute, active, integral, 105 | Type, registeredTime 106 | ] 107 | # 下载图片 108 | download_img(avatar, str(i)) 109 | # finish 110 | taskque.task_done() 111 | que.put(t) 112 | except AttributeError: 113 | que.put([i, '无此人']) 114 | print('找不到id:', i) 115 | except Exception as e: 116 | print(e) 117 | 118 | 119 | def saveThread(que, sheet): 120 | while True: 121 | try: 122 | t = que.get(block=True, timeout=60) 123 | if t[1] != '-1': 124 | sheet.append(t) 125 | path = taskPath + str(t[0]) 126 | if os.path.exists(path): 127 | os.remove(path) 128 | except queue.Empty: 129 | return 130 | que.task_done() 131 | 132 | 133 | def getLine(num): 134 | """ 返回是否为true 135 | """ 136 | if os.path.exists(taskPath + str(num)): 137 | return True 138 | return False 139 | 140 | 141 | def getTaskThread(que, filePath): 142 | """ 创建任务列队 143 | """ 144 | # thread = threading.current_thread() 145 | tgroup = os.listdir(taskPath) 146 | for item in tgroup: 147 | try: 148 | que.put(int(item)) 149 | except ValueError: 150 | print(item) 151 | print('剩余任务数量:', que.qsize()) 152 | 153 | 154 | def init(): 155 | print('初始化中') 156 | if not os.path.exists(downloadPath): 157 | print('正在创建文件夹download...') 158 | os.makedirs(downloadPath) 159 | print('done...') 160 | if not os.path.exists(taskPath): 161 | print('正在创建task文件') 162 | os.makedirs(taskPath) 163 | # 第一次跑脚本时候使用 164 | taskMaker(start=start_num, end=end_num) 165 | print('done...') 166 | if not os.path.exists(imagePath): 167 | print('正在创建文件夹image...') 168 | os.makedirs(imagePath) 169 | print('done...') 170 | if not os.path.exists(wbName): 171 | print('正在创建Excel...') 172 | wb = Workbook() 173 | wb.create_sheet(title=wsName) 174 | wb.save(wbName) 175 | print('done...') 176 | print('初始化完成') 177 | 178 | 179 | def taskMaker(start=1, end=100): 180 | """ 初始化任务表 181 | """ 182 | if not os.path.exists(taskPath): 183 | os.makedirs(taskPath) 184 | for i in range(start, end): 185 | f = open(taskPath + str(i), mode='w') 186 | f.close() 187 | return 188 | 189 | 190 | def backgroundThread(saveQue, taskQue): 191 | while True: 192 | sz = saveQue.qsize() 193 | print('待保存量:', sz) 194 | sz = taskQue.qsize() 195 | print('剩余任务:', sz) 196 | time.sleep(30) 197 | 198 | 199 | def main(): 200 | # MARK -- 参考答案:https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error 201 | ssl._create_default_https_context = ssl._create_unverified_context 202 | # init 203 | init() 204 | # load data 205 | print('loading...') 206 | wb = load_workbook(wbName) 207 | sheet = wb[wsName] 208 | sheet.append(title) 209 | 210 | # thread 211 | saveQue = Queue() 212 | taskQue = Queue() 213 | thread = [] 214 | for i in range(0, 9): # 爬虫线程列队 215 | t = threading.Thread( 216 | target=crawler, name=str(i), args=(taskQue, saveQue)) 217 | thread.append(t) 218 | st = threading.Thread( 219 | target=saveThread, name='saveThread', args=(saveQue, sheet)) 220 | gt = threading.Thread( 221 | target=getTaskThread, name='getTaskThread', args=(taskQue, taskPath)) 222 | bg = threading.Thread( 223 | target=backgroundThread, 224 | name='backgroundThread', 225 | args=(saveQue, taskQue)) 226 | print('loading...') 227 | try: 228 | print('start!') 229 | gt.start() 230 | gt.join() 231 | for t in thread: 232 | t.start() 233 | st.start() 234 | bg.start() 235 | st.join() 236 | except: 237 | print("线程错误") 238 | finally: 239 | wb.save(wbName) 240 | 241 | 242 | if __name__: 243 | main() -------------------------------------------------------------------------------- /user_info_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import sqlite3 # 数据库 4 | import gc 5 | import urllib3 6 | import multiprocessing # 多线程模块 7 | import threading # 多线程模块 8 | import ssl 9 | import re # 正则表达式 10 | 11 | from time import time 12 | from queue import Queue 13 | from bs4 import BeautifulSoup 14 | from luogu import LuoguBrowser 15 | """ 16 | 重构洛谷用户数据爬取 17 | 多进程爬虫 + 数据库存储 + 缓存机制 + 内存优化 18 | 19 | 过程: 20 | 1.消息列队发送任务 21 | 2.爬虫列队抓取网页分析 22 | 3.保存列队保存到本地 23 | """ 24 | __author__ = 'himself65' 25 | __license__ = 'MIT' 26 | __email__ = 'himself6565@gmail.com' 27 | 28 | # SETTINGS 29 | DATABASE = { 30 | 'name': 'himself65', 31 | } 32 | Threads_Number = 10 # 默认10个爬虫列队 33 | DEBUG = True 34 | 35 | # END 36 | 37 | GET_TASKS_SQL = '''SELECT ID FROM TASK;''' 38 | 39 | 40 | def get_database(): 41 | """获取数据库""" 42 | name = DATABASE['name'] + '.db' 43 | database = sqlite3.connect(name) 44 | return database 45 | 46 | 47 | def task_producer(a, b): 48 | """生成任务 区间[a, b]""" 49 | db = get_database() 50 | cursor = db.cursor() 51 | for i in range(a, b + 1): 52 | create_task_sql = '''INSERT INTO TASK(ID) 53 | VALUES (%d);''' % (i) 54 | cursor.executescript(create_task_sql) 55 | db.close() 56 | 57 | 58 | def get_task(task_que: Queue): 59 | """任务队列,从数据库中加载任务放到task_que中""" 60 | db = get_database() 61 | cursor = db.cursor() 62 | cursor.execute(GET_TASKS_SQL) 63 | task_list = cursor.fetchall() 64 | if len(task_list) != 0: 65 | for item in task_que: 66 | time_1 = time() 67 | task_que.put(item) 68 | print("加载任务耗时: %lf" % (time() - time_1)) 69 | else: 70 | print("没有找到任务") 71 | db.close() 72 | 73 | 74 | def save_task(save_que: Queue): 75 | """存储队列 TODO""" 76 | db = get_database() 77 | cursor = db.cursor() 78 | while not save_que.empty(): 79 | item = save_que.get(timeout=5) 80 | uid = item['uid'] 81 | name = item['name'] 82 | sub_num = item['submit_num'] 83 | ac_num = item['ac_num'] 84 | contribute = item['contribute'] 85 | active = item['active'] 86 | integral = item['integral'] 87 | created_time = item['created_time'] 88 | save_user_sql = '''INSERT INTO USER(ID, 89 | NAME, 90 | SUBMIT_NUM, 91 | AC_NUM, 92 | CONTRIBUTE, 93 | ACTIVE, 94 | INTEGRAL, 95 | CREATED_TIME) 96 | VALUES(%d %s %d %d %d %d %d %s); 97 | DELETE FROM TASK WHERE ID = %d;''' % (uid, name, 98 | sub_num, 99 | ac_num, 100 | contribute, 101 | active, 102 | integral, 103 | created_time, 104 | uid) 105 | # Save 106 | cursor.execute(save_user_sql) 107 | save_que.task_done() 108 | 109 | 110 | def crawler(task_que: Queue, save_que: Queue): 111 | """爬虫部分""" 112 | import urllib 113 | import time 114 | user_url = "https://www.luogu.org/space/show?uid=" 115 | 116 | def get_url(user_id): 117 | """得到用户的url""" 118 | return '%s%s' % (user_url, user_id) 119 | 120 | while not task_que.empty(): 121 | user_id = task_que.get(block=True) 122 | url = get_url(user_id) 123 | start_time = time.time() 124 | html = urllib.request.urlopen(url) 125 | soup = BeautifulSoup(html, 'html.parser') 126 | if DEBUG: 127 | print('用户:%d 用时%lf' % (user_id, time.time() - start_time)) 128 | if re.match('提示', soup.title.string) is not None: 129 | # 判断用户是否存在 130 | print('不存在用户', user_id) 131 | continue 132 | """ 133 | 公开代码者有 5 个ul 134 | 1. avatar 135 | 2. ac_num submit_num 136 | 3. contribute active integral 137 | 4. utype 138 | 5. created_time 139 | 隐私保护者有 4 个ul 140 | 1. avatar 141 | 2. contribute active integral 142 | 3. utype 143 | 4. created_time 144 | """ 145 | # 找到信息板块 146 | board = soup.find('ul', 147 | {'class': 'am-list am-list-static lg-summary-list'}) 148 | board_items = board.find_all("li") 149 | # start 150 | user_name = soup.find('span', {'name': 'username'}).get_text() 151 | if len(board_items) == 7: 152 | """公开代码用户""" 153 | # 1 154 | allPost = board_items[1].find_all('span', 155 | {'class': 'lg-bignum-num'}) 156 | submit_num = allPost[0].get_text() 157 | ac_num = allPost[1].get_text() 158 | # 2 159 | acts = board_items[4].find('span', { 160 | 'class': 'lg-right' 161 | }).get_text() 162 | acts = acts.split('/') 163 | contribute = acts[0] 164 | active = acts[1] 165 | integral = acts[2] 166 | # 3 167 | utype = board_items[5].find('span', { 168 | 'class': 'lg-right' 169 | }).get_text() 170 | # 4 171 | created_time = board_items[6].find('span', { 172 | 'class': 'lg-right' 173 | }).get_text() 174 | luogu_user = { 175 | 'uid': user_id, 176 | 'name': user_name, 177 | 'submit_num': submit_num, 178 | 'ac_num': ac_num, 179 | 'contribute': contribute, 180 | 'active': active, 181 | 'integral': integral, 182 | 'created_time': created_time, 183 | } 184 | else: 185 | """隐私代码用户""" 186 | acts = board_items[1].find('span', { 187 | 'class': 'lg-right' 188 | }).get_text() 189 | acts = acts.split('/') 190 | contribute = acts[0] 191 | active = acts[1] 192 | integral = acts[2] 193 | created_time = board_items[3].find('span', { 194 | 'class': 'lg-right' 195 | }).get_text() 196 | luogu_user = { 197 | 'uid': user_id, 198 | 'name': user_name, 199 | 'submit_num': -1, 200 | 'ac_num': -1, 201 | 'contribute': contribute, 202 | 'active': active, 203 | 'integral': integral, 204 | 'created_time': created_time, 205 | } 206 | if DEBUG: 207 | print(luogu_user) 208 | 209 | # Finished 210 | save_que.put(luogu_user) 211 | task_que.task_done() 212 | 213 | 214 | def init(): 215 | db = get_database() 216 | cursor = db.cursor() 217 | # 创建表 218 | create_table_sql = '''CREATE TABLE IF NOT EXISTS USER( 219 | ID INT PRIMARY KEY NOT NULL, 220 | NAME TEXT NOT NULL, 221 | SUBMIT_NUM INT, 222 | AC_NUM INT, 223 | CONTRIBUTE INT, 224 | ACTIVE INT, 225 | INTEGRAL INT, 226 | CREATED_TIME TEXT 227 | );''' 228 | cursor.executescript(create_table_sql) 229 | create_table_sql = '''CREATE TABLE IF NOT EXISTS TASK( 230 | ID INT PRIMARY KEY NOT NULL 231 | );''' 232 | cursor.execute(create_table_sql) 233 | # 此处不用execute则无法获取,暂且不知道原因 234 | cursor.execute(GET_TASKS_SQL) 235 | exist_task = cursor.fetchone() 236 | if exist_task is None: 237 | print("no task exist!") 238 | exit() 239 | # Close DataBase 240 | db.close() 241 | 242 | 243 | def test(tq: Queue, sq: Queue): 244 | """Unit Test Part""" 245 | init() 246 | 247 | 248 | def main(tq: Queue, sq: Queue): 249 | # Create Theads 250 | threads_list = [] 251 | for td_id in range(1, Threads_Number + 1): 252 | td = threading.Thread(target=crawler, args=(tq, sq), name=str(td_id)) 253 | threads_list.append(td) 254 | 255 | for td in threads_list.index(): 256 | td.start() 257 | 258 | save_td = threading.Thread(target=save_task, args=(sq), name='Save_Que') 259 | save_td.start() 260 | save_td.join() 261 | print('Finished') 262 | 263 | 264 | if __name__: 265 | # MARK -- 参考答案:https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error 266 | ssl._create_default_https_context = ssl._create_unverified_context 267 | # 任务列表 268 | task_que = Queue() 269 | save_que = Queue() 270 | main(task_que, save_que) 271 | --------------------------------------------------------------------------------