├── .gitattributes
├── config.json
├── img
    ├── 01.png
    ├── 02.png
    ├── 03.png
    └── 04.png
├── requirements.txt
├── .travis.yml
├── luogu
    ├── __init__.py
    ├── LuoguException.py
    ├── IO.py
    ├── LuoguUser.py
    └── LuoguBrowser.py
├── .github
    └── ISSUE_TEMPLATE
    │   └── bug-report.md
├── example_browser.py
├── unit_tests.py
├── LICENSE
├── README.md
├── .gitignore
├── example_codedownload.py
├── example_userInfoCrawler.py
└── user_info_download.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py filter=user_info_download


--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "uid": 10086,
3 |   "cookie": 10086
4 | }
5 | 


--------------------------------------------------------------------------------
/img/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/himself65/LuoguCrawler/HEAD/img/01.png


--------------------------------------------------------------------------------
/img/02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/himself65/LuoguCrawler/HEAD/img/02.png


--------------------------------------------------------------------------------
/img/03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/himself65/LuoguCrawler/HEAD/img/03.png


--------------------------------------------------------------------------------
/img/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/himself65/LuoguCrawler/HEAD/img/04.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.6.0
2 | openpyxl==2.5.1
3 | urllib3==1.22
4 | PyMySQL==0.8.0
5 | mysqlclient==1.3.12


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |     - "3.6"
4 | install:
5 |   pip install -r requirements.txt
6 | script:
7 |   - python unit_tests.py


--------------------------------------------------------------------------------
/luogu/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | from .LuoguBrowser import LuoguBrowser
4 | from .LuoguUser import LuoguUser
5 | 
6 | from .LuoguException import *
7 | from .IO import *


--------------------------------------------------------------------------------
/luogu/LuoguException.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | #-*- coding:utf-8 -*-
 3 | """
 4 | 洛谷浏览器的异常处理部分
 5 | """
 6 | 
 7 | from __future__ import absolute_import
 8 | 
 9 | 
class BrowserException(Exception):
    """Base error for failures raised by the Luogu browser helpers.

    Derives from ``Exception`` so that it can be used with ``raise`` and
    ``except``; a plain class cannot be raised in Python 3.
    """


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | 
 5 | ---
 6 | 
 7 | **Describe the bug**
 8 | A clear and concise description of what the bug is.
 9 | 
10 | **Expected behavior**
11 | A clear and concise description of what you expected to happen.
12 | 
13 | **Screenshots**
14 | If applicable, add screenshots to help explain your problem.
15 | 
16 | **Other**
17 | 
18 |  - OS: [e.g. Windows10, Windows7, Macos]
19 |  - Version [e.g. Python3.6]
20 | 
21 | **Additional context**
22 | Add any other context about the problem here.
23 | 


--------------------------------------------------------------------------------
/luogu/IO.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | #-*- coding:utf-8 -*-
 3 | """
 4 | 读写文件部分
 5 | """
 6 | from __future__ import absolute_import
 7 | 
 8 | import os
 9 | import json
10 | 
11 | 
def saveToFile(fileLocation, content):
    """Write ``content`` to ``fileLocation``, replacing any existing file.

    Args:
        fileLocation: Path of the file to write.
        content: Text to write.

    Raises:
        AttributeError: If ``fileLocation`` is ``None``.
    """
    if fileLocation is None:
        raise AttributeError("fileLocation not found")
    # The context manager closes the handle even when write() raises.
    with open(fileLocation, mode='w') as f:
        f.write(content)
21 | 
22 | 
def getJson(fileLocation='config.json'):
    """Read a JSON file and return the decoded object.

    Args:
        fileLocation: Path of the JSON file; defaults to ``config.json``
            relative to the current working directory.

    Returns:
        The value produced by decoding the file's JSON content.
    """
    # Use a context manager so the handle is closed once parsing is done.
    with open(fileLocation, mode='r') as f:
        return json.load(f)


--------------------------------------------------------------------------------
/example_browser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from luogu import *
 4 | 
 5 | import ssl
 6 | import json
 7 | 
 8 | 
 9 | def getCookie(name="cookie.json"):
10 |     datas = open(name)
11 |     datas = json.loads(datas.read())
12 |     return datas['cookie']
13 | 
14 | 
# Page requested by the demo below.
defaultURL = "https://www.luogu.org"

# Not referenced again in this script.
userNumber = []
userUrl = []

# MARK -- reference: https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error
# Swaps the default HTTPS context factory for the unverified one, so later
# HTTPS requests made through urllib skip certificate verification.
ssl._create_default_https_context = ssl._create_unverified_context

# Demo: attach the cookie read from cookie.json, fetch the page and print the
# raw response body. All of this runs at module level.
browser = LuoguBrowser()
cookie = getCookie()
browser.insert_headers('cookie', cookie)
browser.openURL(defaultURL)
data = browser.getData()
print(data)


--------------------------------------------------------------------------------
/luogu/LuoguUser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | from __future__ import absolute_import
 4 | import MySQLdb
 5 | 
 6 | 
 7 | class LuoguUser(object):
 8 |     """
 9 |     LuoguUser
10 |     """
11 | 
12 |     def __init__(self, name: str, contribution: int, active: int,
13 |                  integral: int, ac_num: int, submit_num: int, *args, **kwargs):
14 |         """
15 |         Args:
16 |             name: User's name
17 |             contribution: emmmmm
18 |             active: emmmmm
19 |             integral: emmmmm
20 |             ac_num: User's accepted topics number in all
21 |             submit_num: User's submit topics number in all 
22 |         """
23 |         this.name = name
24 |         this.contribution = contribution
25 |         this.active = active
26 |         this.integral = integral
27 |         this.ac_num = ac_num
28 |         this.submit_num = submit_num
29 | 
30 |         for key, value in kwargs.items():
31 |             setattr(self, key, value)


--------------------------------------------------------------------------------
/unit_tests.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 单元测试部分
 3 | """
 4 | from luogu import LuoguBrowser, IO
 5 | 
 6 | import ssl
 7 | 
 8 | defaultURL = "https://www.luogu.org"
 9 | 
10 | 
def BrowserDefaultTest():
    """Smoke test: fetch ``defaultURL`` with LuoguBrowser and print the body.

    Certificate verification is switched off first by replacing the default
    HTTPS context factory, see
    https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error
    """
    ssl._create_default_https_context = ssl._create_unverified_context
    client = LuoguBrowser()
    client.openURL(defaultURL)
    # The body may be gzip-compressed; ungzip() hands back the original bytes
    # when decompression fails.
    payload = LuoguBrowser.ungzip(client.getData())
    print(payload)
26 | 
27 | 
def OITest():
    """IO smoke test: load the default JSON config and print its cookie."""
    config = IO.getJson()
    print(config)
    print(config['cookie'])
35 | 
36 | 
def main():
    """Run both smoke tests in order, printing a banner around each."""
    steps = (
        ('BrowserDefaultTest is Running...', BrowserDefaultTest,
         'BrowserDefaultTest Sucess!'),
        ('OITest is Running...', OITest, 'OITest Sucess'),
    )
    for banner, check, done in steps:
        print(banner)
        check()
        print(done)
45 | 
46 | 
# Run the checks only when this file is executed as a script. The previous
# ``if __name__:`` was always true, so importing the module also ran them.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 扩散性百万甜面包
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # LuoguCrawler | 洛谷爬虫
 2 | 
 3 | [![Build Status](https://travis-ci.org/Himself65/LuoguCrawler.svg?branch=master)](https://travis-ci.org/Himself65/LuoguCrawler) [![LICENSE](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) ![language](https://img.shields.io/badge/language-python3-blue.svg)
 4 | 
 5 | 【停止更新】lin_toto 明确禁止了洛谷的脚本使用
 6 | 
 7 | ## Quick Start
 8 | 
 9 | 注意：luogu/ 文件夹为必备内容，**请不要直接拷贝 example\*.py 代码直接运行**
10 | 
11 | 1.  下载至本地，并解压
12 | 
13 | 2.  运行相关脚本
14 | 
15 | ```bash
16 | cd LuoguCrawler
17 | 
 18 | py example_codedownload.py
19 | ```
20 | 
21 | ## Scripts
22 | 
23 | - 下载你的所有 AC 代码
24 | 
25 | 效果图：
26 | 
27 | 快速下载你所有的 AC 代码
28 | 
29 | ![04](/img/04.png)
30 | 
31 | [脚本地址](/example_codedownload.py)
32 | 
33 | - 洛谷用户爬取
34 | 
35 | 多线程爬下用户信息
36 | 
37 | ![03](/img/03.png)
38 | 
39 | [脚本地址](/example_userInfoCrawler.py)
40 | 
41 | - 其他脚本截图
42 | 
43 | ![01](/img/01.png)
44 | 
45 | ![02](/img/02.png)
46 | 
47 | ## Config
48 | 
49 | - 修改 config.json 内容
50 | 
51 | - 某些特定脚本或许有需要配置的地方（将会修复）
52 | 
53 | ```python
54 | myUrl = "https://www.luogu.org/space/show?uid=72813"
55 | myID = 72813
56 | ```
57 | 
58 | ## TODO
59 | 
60 | 详见[TODO.md](TODO.md)
61 | 
62 | ### Credits
63 | 
64 | 感谢洛谷开发组提供的灵感，也感谢使用我 Repo 的各位
65 | 
66 | ## LICENSE
67 | 
 68 | LuoguCrawler is available under the MIT license. See the [LICENSE](LICENSE) file for more information.
69 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
103 | # local
104 | .vscode
105 | cookie.json
106 | test.json
107 | 
108 | temp.*
109 | *.xlsx
110 | download/
111 | 
112 | .DS_Store
113 | */.DS_Store
114 | */*/.DS_Store
115 | .test
116 | *.icloud
117 | *.db
118 | 
119 | himself65
120 | .idea


--------------------------------------------------------------------------------
/example_codedownload.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from luogu import *
  3 | from bs4 import BeautifulSoup
  4 | 
  5 | import asyncio
  6 | import os
  7 | import ssl
  8 | 
# NOTE(review): this looks like a real session cookie committed to the
# repository; consider loading it from an untracked file instead.
cookie = 'UM_distinctid=1628f18d8fa568-0c77f61d2d6685-336c7b05-100200-1628f18d8fb74e; __client_id=4481c1bb80e250e3e1b43eb658c1f4882b4c98a5; CNZZDATA5476811=cnzz_eid%3D657620256-1522818985-%26ntime%3D1528115158'
ID = 72813  # put your own user id here
mainUrl = 'https://www.luogu.org'
# Record-list URL for the user above; the page number is appended to it.
pageUrl = 'https://www.luogu.org/recordnew/lists?uid=' + str(ID) + '&page='
downloadPath = 'download/'
codePath = downloadPath + 'code/'

# When true, the functions below print extra progress output.
DEBUG = True

# Shared browser; every request made by this script carries the cookie above.
browser = LuoguBrowser()
browser.insert_headers('cookie', cookie)
 21 | 
 22 | 
 23 | def getPageUrl(pageNum):
 24 |     return pageUrl + str(pageNum)
 25 | 
 26 | 
 27 | def saveLocal(fileName, text):
 28 |     path = codePath + fileName + '.cpp'
 29 |     if os.path.exists(path):
 30 |         return
 31 |     f = open(path, mode='w')
 32 |     f.write(text)
 33 |     f.close()
 34 | 
 35 | 
 36 | def downloadCode(url):
 37 |     browser.openURL(url)
 38 |     data = browser.getData()
 39 |     html = browser.ungzip(data).decode()
 40 |     soup = BeautifulSoup(html, 'html.parser')
 41 |     try:
 42 |         text = soup.find('code').get_text()
 43 |         name = soup.find('h1').get_text()
 44 |         saveLocal(name, text)
 45 |         print('下载完成:%s' % url)
 46 |         return True
 47 |     except AttributeError:
 48 |         print('下载异常:%s' % url)
 49 |         return False
 50 | 
 51 | 
def searchPage(start, end):
    """Walk record-list pages ``[start, end)`` and download the listed code.

    For every row on a page that contains a ``strong.lg-fg-green`` element,
    the row's first matching link is passed to ``downloadCode``. The number
    of successful downloads is printed at the end.
    """
    count = 0
    for i in range(start, end):
        if DEBUG:
            print("现在是第%d页" % i)
        url = getPageUrl(i)
        browser.openURL(url)
        data = browser.getData()
        html = browser.ungzip(data).decode()
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find('div', {
            'class': 'lg-content-table-left'
        }).find_all('div', {'class': 'am-g lg-table-bg0 lg-table-row'})
        for item in items:
            # Rows without the green marker are skipped.
            point = item.find('strong', {'class': 'lg-fg-green'})
            if point is None:
                continue
            acurl = item.find_all('a', {
                'target': '_blank',
                'data-pjax': ''
            })[0]['href']
            import re
            # NOTE(review): re.search takes (pattern, string); here the
            # scraped href is used as the pattern and the literal as the
            # text, which looks swapped. Whether matching links were meant
            # to be skipped or kept is unclear -- confirm before changing.
            if re.search(acurl, '/record/show?rid=*'):
                if DEBUG:
                    print(acurl)
                continue
            if downloadCode(mainUrl + acurl):
                count += 1
    print('代码共', count)
 83 | 
 84 | 
 85 | def main():
 86 |     page = 1  # start Page
 87 |     url = getPageUrl(page)
 88 |     browser.openURL(url)
 89 |     data = browser.getData()
 90 |     html = browser.ungzip(data).decode()
 91 |     soup = BeautifulSoup(html, 'html.parser')
 92 |     items = soup.find('ul', {
 93 |         'class': 'am-pagination am-pagination-centered'
 94 |     }).find_all('li')
 95 |     lastestItem = items[-1]
 96 |     maxPage = lastestItem.find('a')['data-ci-pagination-page']
 97 |     # 找到最大页码
 98 |     print('最大页数', maxPage)
 99 |     searchPage(1, int(maxPage) + 1)
100 | 
101 | 
def init():
    """Create the download and code directories when they are missing."""
    print('初始化中')
    wanted = (
        (downloadPath, '正在创建文件夹download...'),
        (codePath, '正在创建task文件'),
    )
    for folder, notice in wanted:
        if os.path.exists(folder):
            continue
        print(notice)
        os.makedirs(folder)
        print('done...')
    print('初始化完成')
113 | 
114 | 
# Run only when executed as a script. The previous ``if __name__:`` was always
# true, so importing this module also started the download.
if __name__ == "__main__":
    # Disables HTTPS certificate verification for later urllib requests.
    ssl._create_default_https_context = ssl._create_unverified_context
    init()
    main()


--------------------------------------------------------------------------------
/luogu/LuoguBrowser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | #-*- coding:utf-8 -*-
  3 | """
  4 | 简单的爬虫项目来访问洛谷官网
  5 | """
  6 | from __future__ import absolute_import
  7 | from http import cookiejar
  8 | from urllib import parse, request
  9 | from bs4 import BeautifulStoneSoup
 10 | import json
 11 | import socket
 12 | import gzip
 13 | 
 14 | __author__ = "Himself65"
 15 | __license__ = "MIT"
 16 | 
 17 | defaultURL = "https://www.luogu.org"
 18 | 
 19 | from random import sample
 20 | user_agents = [
 21 |     'Opera/9.25 (Windows NT 5.1; U; en)',
 22 |     'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
 23 |     'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
 24 |     'Mozilla/5.0 (X11; U; linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
 25 |     'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
 26 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
 27 |     "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
 28 | ]
 29 | 
 30 | 
 31 | def get_agent():
 32 |     """
 33 |     每次随机返回一个user-agent
 34 |     """
 35 |     return sample(user_agents, 1)[0]
 36 | 
 37 | 
 38 | class LuoguBrowser(object):
 39 |     """
 40 |     """
 41 |     _headers = {}
 42 | 
 43 |     def __init__(self):
 44 |         """ 
 45 |         初始化访问洛谷
 46 |         """
 47 |         self._headers['user-agent'] = get_agent()
 48 |         self._headers['accept-encoding'] = "gzip, deflate, br"
 49 |         self._headers['accept-language'] = "zh,en;q=0.9,zh-CN;q=0.8,ja;q=0.7"
 50 |         self.setOpener()
 51 | 
 52 |     def insert_headers(self, key, value):
 53 |         """
 54 |         插入值到请求头
 55 |         每次插入后会自动setOpener
 56 |         """
 57 |         self._headers[key] = value
 58 |         self.setOpener()
 59 | 
 60 |     def setOpener(self):
 61 |         """ 初始化opener
 62 |         """
 63 |         cj = cookiejar.CookieJar()
 64 |         pro = request.HTTPCookieProcessor(cj)
 65 |         self.opener = request.build_opener(pro)
 66 |         header = []
 67 |         for key, value in self._headers.items():
 68 |             elem = (key, value)
 69 |             header.append(elem)
 70 |         self.opener.addheaders = header
 71 | 
 72 |     def openURL(self, url, data=None, timeout=None):
 73 |         """
 74 |         访问地址
 75 |         """
 76 |         import socket
 77 |         if timeout is None:
 78 |             timeout = socket._GLOBAL_DEFAULT_TIMEOUT
 79 |         if url is None:
 80 |             raise AttributeError('url is none')
 81 |         # url = url.encode('UTF8', errors='strict')
 82 |         self.response = self.opener.open(url, data=data, timeout=timeout)
 83 | 
 84 |     def getData(self):
 85 |         """
 86 |         获取response内容
 87 |         """
 88 |         return self.response.read()
 89 | 
 90 |     def getResponse(self):
 91 |         """
 92 |         获取response
 93 |         """
 94 |         return self.response
 95 | 
 96 |     @staticmethod
 97 |     def create_query_string_message(dictionary):
 98 |         """
 99 |         创建请求地址
100 |         Args:
101 |             dictionary -> 字典，例如：
102 |             
103 |         Returns -> String
104 |             例如: 
105 |             { 
106 |                 "id": 761282619,
107 |                 "name": "himself65" 
108 |             }
109 |             Return -> "id=761282619&name=himself65"
110 |         """
111 |         s = ""
112 |         for (key, value) in dictionary.items():
113 |             s = s + str(key) + "=" + str(value) + "&"
114 |         return s.rstrip("&")
115 | 
116 |     @staticmethod
117 |     def getDataFromResponse(response, data='more'):
118 |         """
119 |         获取response请求中特定内容
120 |         Args:
121 |             response -> Response
122 |             data -> String, 需要获得的数据，默认为'more'
123 |         Returns -> String
124 |         """
125 |         messages = json.loads(response)
126 |         return messages[data]
127 | 
128 |     @staticmethod
129 |     def check_Accessible(data, name='code', accessStatus=200):
130 |         """
131 |         检查状态值是否成功
132 |         Args:
133 |             data -> Dictionary, 为Response返回的请求
134 |         Returns -> Bool
135 | 
136 |         """
137 |         data_json = json.loads(data)
138 |         return data_json[name] == accessStatus if True else False
139 | 
140 |     @staticmethod
141 |     def ungzip(data):
142 |         """
143 |         ungzip the data
144 |         """
145 |         try:
146 |             ungzipData = gzip.decompress(data)
147 |         except:
148 |             print("解压失败，返回原数据")
149 |             return data
150 |         return ungzipData
151 | 


--------------------------------------------------------------------------------
/example_userInfoCrawler.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 注意：
  5 |     1. 服务端上需自行配置代理
  6 |     2. start_num end_num 为需手动填写
  7 | """
  8 | from luogu import *
  9 | from openpyxl import Workbook
 10 | from openpyxl import load_workbook
 11 | from bs4 import BeautifulSoup
 12 | from urllib import request, error
 13 | from queue import Queue
 14 | 
 15 | import time
 16 | import queue
 17 | import os
 18 | import ssl
 19 | import json
 20 | import threading
 21 | import linecache
 22 | 
# Required settings: range of user ids handed to taskMaker() on the first run.
start_num = 1
end_num = 1000

# Luogu site
defaultURL = "https://www.luogu.org"
# Profile page URL; the user id is appended to it.
userURL = "https://www.luogu.org/space/show?uid="
# Changing the values below is not recommended.
# Header row appended to the worksheet by main().
title = ['id', '名字', '头像', '总提交数', 'AC数', '贡献', '活跃', '积分', '用户类型', '注册时间']
wbName = 'luogu2.xlsx'
wsName = '1'
downloadPath = 'download/'
imagePath = downloadPath + 'img/'
taskPath = downloadPath + 'task/'
 37 | 
 38 | 
 39 | def download_img(url, userName):
 40 |     """ 下载图片到download/文件夹下
 41 |     """
 42 |     loc = imagePath + userName + '.png'
 43 |     if os.path.exists(loc):
 44 |         return
 45 |     try:
 46 |         # 下载图片
 47 |         request.urlretrieve(url, filename=loc)
 48 |     except:
 49 |         print("\n无法下载文件")
 50 | 
 51 | 
def crawler(taskque, que):
    """Worker loop: take user ids from ``taskque`` and scrape their profiles.

    For every id the profile page is fetched and parsed, and one row is put
    on ``que`` in the same column order as ``title``. The worker returns
    when ``taskque`` stays empty for one second, or straight away when the
    browser cannot be created.

    Args:
        taskque: Queue of user ids to process.
        que: Queue that receives the scraped rows.
    """
    try:
        # Init browser
        browser = LuoguBrowser()
        browser.openURL(defaultURL)
    except Exception as e:
        print("无法创建")
        print(e)
        return
    while True:
        try:
            i = taskque.get(block=True, timeout=1)
        except queue.Empty:
            print('无更多任务')
            print('请等待结束')
            return
        try:
            # Get messageURL
            messageURL = userURL + str(i)
            ## View Web
            browser.openURL(messageURL)
            ## getData
            html = browser.getData()
            html = LuoguBrowser.ungzip(html)
            soup = BeautifulSoup(html, 'html.parser')
            board = soup.find(
                'ul', {'class': 'am-list am-list-static lg-summary-list'})
            items = board.find_all("li")
            # 0: user name and avatar URL
            userName = soup.find('span', {'name': 'username'}).get_text()
            avatar = items[0].find('img')['src']
            # 1: submission count and accepted count
            allPost = items[1].find_all('span', {'class': 'lg-bignum-num'})
            Num = allPost[0].get_text()
            ACNum = allPost[1].get_text()
            # 2: one "/"-separated field holding three values
            Acts = items[4].find('span', {'class': 'lg-right'}).get_text()
            Acts = Acts.split('/')
            contribute = Acts[0]
            active = Acts[1]
            integral = Acts[2]
            # 3: user type
            Type = items[5].find('span', {'class': 'lg-right'}).get_text()
            # 4: registration time
            registeredTime = items[6].find('span', {
                'class': 'lg-right'
            }).get_text()
            # Assemble the row in the same order as ``title``.
            t = [
                i, userName, avatar, Num, ACNum, contribute, active, integral,
                Type, registeredTime
            ]
            # Download the avatar image.
            download_img(avatar, str(i))
            # finish
            # NOTE(review): task_done() is reached only on this success path;
            # both except branches below skip it.
            taskque.task_done()
            que.put(t)
        except AttributeError:
            # Presumably raised when a lookup above found nothing and
            # returned None; the id is recorded as "no such user".
            que.put([i, '无此人'])
            print('找不到id:', i)
        except Exception as e:
            print(e)
117 | 
118 | 
def saveThread(que, sheet):
    """Move rows from ``que`` into ``sheet`` until the queue stays empty.

    Each row whose second field is not ``'-1'`` is appended to ``sheet``,
    and the task marker file named after the row's id is deleted if it
    exists. The function returns once no row arrives within 60 seconds.
    """
    while True:
        try:
            row = que.get(block=True, timeout=60)
            if row[1] != '-1':
                sheet.append(row)
                marker = taskPath + str(row[0])
                if os.path.exists(marker):
                    os.remove(marker)
        except queue.Empty:
            return
        else:
            que.task_done()
131 | 
132 | 
def getLine(num):
    """Return True when a task marker file for ``num`` exists in ``taskPath``."""
    return os.path.exists(taskPath + str(num))
139 | 
140 | 
def getTaskThread(que, filePath):
    """Queue one task per marker file found in ``taskPath``.

    Each file name is converted to an int and put on ``que``; names that
    are not integers are printed and skipped. ``filePath`` is accepted but
    not used; the module-level ``taskPath`` is read instead.
    """
    for entry in os.listdir(taskPath):
        try:
            que.put(int(entry))
        except ValueError:
            print(entry)
    print('剩余任务数量:', que.qsize())
152 | 
153 | 
def init():
    """Create the working directories and the workbook when missing.

    When the task directory has to be created, it is also filled with
    marker files for ``start_num``..``end_num`` through ``taskMaker``.
    """
    def ensure_dir(folder, notice, after_create=None):
        # Create ``folder`` if absent, then run the optional follow-up step.
        if os.path.exists(folder):
            return
        print(notice)
        os.makedirs(folder)
        if after_create is not None:
            after_create()
        print('done...')

    print('初始化中')
    ensure_dir(downloadPath, '正在创建文件夹download...')
    # Marker files are generated only on the first run.
    ensure_dir(taskPath, '正在创建task文件',
               after_create=lambda: taskMaker(start=start_num, end=end_num))
    ensure_dir(imagePath, '正在创建文件夹image...')
    if not os.path.exists(wbName):
        print('正在创建Excel...')
        book = Workbook()
        book.create_sheet(title=wsName)
        book.save(wbName)
        print('done...')
    print('初始化完成')
177 | 
178 | 
def taskMaker(start=1, end=100):
    """Create one empty marker file per id in ``[start, end)``.

    ``taskPath`` is created first when it does not exist. ``end`` itself
    is not included.
    """
    if not os.path.exists(taskPath):
        os.makedirs(taskPath)
    for uid in range(start, end):
        # Opening for writing creates (or truncates) the marker file.
        with open(taskPath + str(uid), mode='w'):
            pass
188 | 
189 | 
def backgroundThread(saveQue, taskQue):
    """Print both queue sizes every 30 seconds; this loop never returns."""
    while True:
        print('待保存量:', saveQue.qsize())
        print('剩余任务:', taskQue.qsize())
        time.sleep(30)
197 | 
198 | 
def main():
    """Run the crawl: load tasks, start the workers and save the workbook.

    The task-loading thread runs to completion first. Then nine crawler
    threads, the save thread and the status thread are started, and the
    function waits for the save thread. The workbook is saved on every
    exit path.
    """
    # MARK -- reference: https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error
    # Disables HTTPS certificate verification for later urllib requests.
    ssl._create_default_https_context = ssl._create_unverified_context
    # init
    init()
    # load data
    print('loading...')
    wb = load_workbook(wbName)
    sheet = wb[wsName]
    sheet.append(title)

    # thread
    saveQue = Queue()
    taskQue = Queue()
    # Crawler worker threads.
    thread = [
        threading.Thread(target=crawler, name=str(i), args=(taskQue, saveQue))
        for i in range(0, 9)
    ]
    st = threading.Thread(
        target=saveThread, name='saveThread', args=(saveQue, sheet))
    gt = threading.Thread(
        target=getTaskThread, name='getTaskThread', args=(taskQue, taskPath))
    # backgroundThread loops forever; as a daemon it no longer keeps the
    # process alive after the crawl has finished.
    bg = threading.Thread(
        target=backgroundThread,
        name='backgroundThread',
        args=(saveQue, taskQue),
        daemon=True)
    print('loading...')
    try:
        print('start!')
        # Fill the task queue completely before any crawler starts.
        gt.start()
        gt.join()
        for t in thread:
            t.start()
        st.start()
        bg.start()
        st.join()
    except Exception as exc:
        # KeyboardInterrupt/SystemExit propagate; ``finally`` still saves.
        print("线程错误")
        print(exc)
    finally:
        wb.save(wbName)
240 | 
241 | 
# Run only when executed as a script. The previous ``if __name__:`` was always
# true, so importing this module also started the crawl.
if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/user_info_download.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | import sqlite3  # 数据库
  4 | import gc
  5 | import urllib3
  6 | import multiprocessing  # 多线程模块
  7 | import threading  # 多线程模块
  8 | import ssl
  9 | import re  # 正则表达式
 10 | 
 11 | from time import time
 12 | from queue import Queue
 13 | from bs4 import BeautifulSoup
 14 | from luogu import LuoguBrowser
 15 | """
 16 | 重构洛谷用户数据爬取
 17 | 多进程爬虫 + 数据库存储 + 缓存机制 + 内存优化
 18 | 
 19 | 过程：
 20 |     1.消息列队发送任务
 21 |     2.爬虫列队抓取网页分析
 22 |     3.保存列队保存到本地
 23 | """
 24 | __author__ = 'himself65'
 25 | __license__ = 'MIT'
 26 | __email__ = 'himself6565@gmail.com'
 27 | 
# SETTINGS
# get_database() appends '.db' to this name to form the SQLite file name.
DATABASE = {
    'name': 'himself65',
}
Threads_Number = 10  # default: 10 crawler workers
DEBUG = True

# END

# Selects the id of every row in the TASK table.
GET_TASKS_SQL = '''SELECT ID FROM TASK;'''
 38 | 
 39 | 
 40 | def get_database():
 41 |     """获取数据库"""
 42 |     name = DATABASE['name'] + '.db'
 43 |     database = sqlite3.connect(name)
 44 |     return database
 45 | 
 46 | 
 47 | def task_producer(a, b):
 48 |     """生成任务 区间[a, b]"""
 49 |     db = get_database()
 50 |     cursor = db.cursor()
 51 |     for i in range(a, b + 1):
 52 |         create_task_sql = '''INSERT INTO TASK(ID)
 53 |                         VALUES (%d);''' % (i)
 54 |         cursor.executescript(create_task_sql)
 55 |     db.close()
 56 | 
 57 | 
 58 | def get_task(task_que: Queue):
 59 |     """任务队列，从数据库中加载任务放到task_que中"""
 60 |     db = get_database()
 61 |     cursor = db.cursor()
 62 |     cursor.execute(GET_TASKS_SQL)
 63 |     task_list = cursor.fetchall()
 64 |     if len(task_list) != 0:
 65 |         for item in task_que:
 66 |             time_1 = time()
 67 |             task_que.put(item)
 68 |             print("加载任务耗时: %lf" % (time() - time_1))
 69 |     else:
 70 |         print("没有找到任务")
 71 |     db.close()
 72 | 
 73 | 
 74 | def save_task(save_que: Queue):
 75 |     """存储队列 TODO"""
 76 |     db = get_database()
 77 |     cursor = db.cursor()
 78 |     while not save_que.empty():
 79 |         item = save_que.get(timeout=5)
 80 |         uid = item['uid']
 81 |         name = item['name']
 82 |         sub_num = item['submit_num']
 83 |         ac_num = item['ac_num']
 84 |         contribute = item['contribute']
 85 |         active = item['active']
 86 |         integral = item['integral']
 87 |         created_time = item['created_time']
 88 |         save_user_sql = '''INSERT INTO USER(ID, 
 89 |                                             NAME, 
 90 |                                             SUBMIT_NUM,
 91 |                                             AC_NUM, 
 92 |                                             CONTRIBUTE,
 93 |                                             ACTIVE,
 94 |                                             INTEGRAL,
 95 |                                             CREATED_TIME)
 96 |                             VALUES(%d %s %d %d %d %d %d %s);
 97 |                             DELETE FROM TASK WHERE ID = %d;''' % (uid, name,
 98 |                                                                   sub_num,
 99 |                                                                   ac_num,
100 |                                                                   contribute,
101 |                                                                   active,
102 |                                                                   integral,
103 |                                                                   created_time,
104 |                                                                   uid)
105 |         # Save
106 |         cursor.execute(save_user_sql)
107 |         save_que.task_done()
108 | 
109 | 
def _right_text(item):
    """Return the text of the ``lg-right`` span inside one summary list item."""
    return item.find('span', {'class': 'lg-right'}).get_text()


def _parse_user_page(soup, user_id):
    """Build the user record dict from a parsed profile page.

    The summary list has 7 ``li`` items when the user's code is public and is
    treated as the privacy-protected layout otherwise; in that case
    ``submit_num`` and ``ac_num`` are reported as -1.
    """
    board = soup.find('ul',
                      {'class': 'am-list am-list-static lg-summary-list'})
    board_items = board.find_all("li")
    user_name = soup.find('span', {'name': 'username'}).get_text()

    if len(board_items) == 7:
        # Public-code user: item 1 holds the submit / accepted counters.
        counters = board_items[1].find_all('span',
                                           {'class': 'lg-bignum-num'})
        submit_num = counters[0].get_text()
        ac_num = counters[1].get_text()
        acts_item, created_item = board_items[4], board_items[6]
    else:
        # Privacy-protected user: the counters are not shown.
        submit_num = ac_num = -1
        acts_item, created_item = board_items[1], board_items[3]

    # "contribute/active/integral" share one slash-separated span.
    acts = _right_text(acts_item).split('/')
    return {
        'uid': user_id,
        'name': user_name,
        'submit_num': submit_num,
        'ac_num': ac_num,
        'contribute': acts[0],
        'active': acts[1],
        'integral': acts[2],
        'created_time': _right_text(created_item),
    }


def crawler(task_que: Queue, save_que: Queue):
    """Worker loop: fetch each queued user page and queue the parsed record.

    Runs until ``task_que`` is empty. Every task taken from the queue is
    marked done, including users that do not exist and fetches that raise.

    Args:
        task_que: queue of user IDs to crawl.
        save_que: queue that receives one record dict per existing user.
    """
    import time
    import urllib.request  # plain `import urllib` does not load the submodule
    from queue import Empty

    user_url = "https://www.luogu.org/space/show?uid="

    while True:
        try:
            # Non-blocking get: an empty() check followed by a blocking get()
            # can hang forever when another worker takes the last task.
            user_id = task_que.get_nowait()
        except Empty:
            break
        try:
            url = '%s%s' % (user_url, user_id)
            start_time = time.time()
            # Close the HTTP response as soon as the page has been parsed.
            with urllib.request.urlopen(url) as response:
                soup = BeautifulSoup(response, 'html.parser')
            if DEBUG:
                print('用户:%d 用时%lf' % (user_id, time.time() - start_time))
            if re.match('提示', soup.title.string) is not None:
                # The site answers with a notice page for unknown users.
                print('不存在用户', user_id)
                continue
            luogu_user = _parse_user_page(soup, user_id)
            if DEBUG:
                print(luogu_user)
            save_que.put(luogu_user)
        finally:
            task_que.task_done()
212 | 
213 | 
def init():
    """Create the USER and TASK tables if needed and require pending tasks.

    Raises:
        SystemExit: after printing a message, when the TASK table is empty.
    """
    db = get_database()
    try:
        cursor = db.cursor()
        # Create the tables.
        cursor.execute('''CREATE TABLE IF NOT EXISTS USER(
                            ID INT  PRIMARY KEY NOT NULL,
                            NAME TEXT     NOT NULL,
                            SUBMIT_NUM         INT,
                            AC_NUM             INT,
                            CONTRIBUTE         INT,
                            ACTIVE             INT,
                            INTEGRAL           INT,
                            CREATED_TIME       TEXT
                        );''')
        cursor.execute('''CREATE TABLE IF NOT EXISTS TASK(
                            ID INT PRIMARY KEY NOT NULL
                        );''')
        db.commit()
        # The SELECT must go through execute(): executescript() runs the
        # statements but does not make their result rows available.
        cursor.execute(GET_TASKS_SQL)
        if cursor.fetchone() is None:
            print("no task exist!")
            # Same exit status as the `exit()` helper, without depending on
            # the `site` module; the connection is still closed below.
            raise SystemExit
    finally:
        # Close the database on every path, including the exit above.
        db.close()
241 | 
242 | 
def test(tq: Queue, sq: Queue):
    """Smoke-test entry point: only runs ``init()``.

    Args:
        tq: task queue (currently unused).
        sq: save queue (currently unused).
    """
    init()
246 | 
247 | 
def main(tq: Queue, sq: Queue):
    """Run the crawler threads and the saver thread until all work is done.

    Args:
        tq: queue of user IDs to crawl; must already be filled.
        sq: queue through which crawlers hand records to the saver.
    """
    # Create the crawler threads.
    threads_list = [
        threading.Thread(target=crawler, args=(tq, sq), name=str(td_id))
        for td_id in range(1, Threads_Number + 1)
    ]
    for td in threads_list:
        td.start()

    # `args` must be a tuple: `(sq)` is just `sq`, which Thread cannot unpack.
    save_td = threading.Thread(target=save_task, args=(sq,), name='Save_Que')
    save_td.start()

    # Wait for every crawler before declaring the run finished.
    for td in threads_list:
        td.join()
    save_td.join()

    # The saver may have stopped before the last records were queued;
    # flush whatever is left.
    if not sq.empty():
        save_task(sq)
    print('Finished')
262 | 
263 | 
# Run only when executed as a script; a bare `if __name__:` is always truthy
# and would also start the crawler whenever this module is imported.
if __name__ == '__main__':
    # MARK -- reference: https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error
    # NOTE(review): this disables HTTPS certificate verification process-wide.
    ssl._create_default_https_context = ssl._create_unverified_context
    # Make sure the tables exist and that there is something to crawl.
    init()
    # Task queues.
    task_que = Queue()
    save_que = Queue()
    # Fill the task queue from the TASK table before the workers start;
    # otherwise every crawler sees an empty queue and exits immediately.
    get_task(task_que)
    main(task_que, save_que)
271 | 


--------------------------------------------------------------------------------