├── HuaweiRank
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   ├── settings.cpython-36.pyc
│   │   └── middlewares.cpython-36.pyc
│   ├── spiders
│   │   ├── __pycache__
│   │   │   ├── rank.cpython-36.pyc
│   │   │   └── __init__.cpython-36.pyc
│   │   ├── __init__.py
│   │   └── rank.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── run.sh
├── README.md
└── scrapy.cfg

/HuaweiRank/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
scrapy crawl rank
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# HuaWeiRankList
Leaderboard for the Huawei Software Elite Challenge (华为软件精英挑战赛)

# Environment
python3
pip3 install scrapy
pip3 install pandas

# Run
sh run.sh
--------------------------------------------------------------------------------
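Note: run.sh simply invokes `scrapy crawl rank` from the project root. If you prefer to start the crawl from Python instead of the shell, a minimal sketch using Scrapy's CrawlerProcess is shown below; the file name run.py is hypothetical and not part of this repository.

# run.py (hypothetical) -- programmatic equivalent of `sh run.sh`.
# Run it from the project root so scrapy.cfg / HuaweiRank.settings are found.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads HuaweiRank.settings
process.crawl('rank')                             # the spider's `name` attribute
process.start()                                   # blocks until the crawl finishes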
/HuaweiRank/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = HuaweiRank.settings

[deploy]
#url = http://localhost:6800/
project = HuaweiRank
--------------------------------------------------------------------------------
/HuaweiRank/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class HuaweirankItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    division = scrapy.Field()
    score = scrapy.Field()
    # rank = scrapy.Field()
    team_name = scrapy.Field()
    users = scrapy.Field()
    submit_time = scrapy.Field()
--------------------------------------------------------------------------------
/HuaweiRank/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pandas as pd


class HuaweirankPipeline(object):
    def open_spider(self, spider):
        self.df_result = []

    def process_item(self, item, spider):
        self.df_result.append(item)
        return item

    def close_spider(self, spider):
        result = pd.DataFrame(self.df_result)
        result.to_csv("rank.csv", index=False, encoding="utf_8_sig")
--------------------------------------------------------------------------------
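HuaweirankPipeline buffers every scraped item in memory and only writes rank.csv when the spider closes, so the CSV appears once the crawl has finished. A small sketch of inspecting that output with pandas (it assumes a crawl has already produced rank.csv; the column names follow the fields declared in items.py):

import pandas as pd

# rank.csv is written by HuaweirankPipeline.close_spider()
df = pd.read_csv("rank.csv")

# Highest score first within each division.
top = df.sort_values(["division", "score"], ascending=[True, False])
print(top[["division", "team_name", "score", "submit_time"]].head(20))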
/HuaweiRank/spiders/rank.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
import json
from HuaweiRank.items import HuaweirankItem


class RankSpider(scrapy.Spider):
    name = 'rank'
    allowed_domains = ['competition.huaweicloud.com']

    def start_requests(self):
        # One (ranking id, stage_id) pair per division; the order matches
        # the `divisions` list below.
        nav_list = [
            (1000036574, 136710),
            (1000036576, 136712),
            (1000036577, 136713),
            (1000036578, 136714),
            (1000036579, 136715),
            (1000036580, 136716),
            (1000036581, 136717),
            (1000036582, 136718),
            (1000036583, 136719)
        ]
        urls = ['https://competition.huaweicloud.com/competition/v1/competitions/ranking/{0}?stage_id={1}&page_no=1&page_size=64'.format(
            it[0], it[1]) for it in nav_list]
        divisions = ["京津东北赛区", "上合赛区", "杭厦赛区", "江山赛区",
                     "成渝赛区", "西北赛区", "武长赛区", "粤港澳赛区", "海外赛区"]
        for url, division in zip(urls, divisions):
            yield scrapy.Request(url=url, callback=self.parse, meta={"division": division})

    def parse(self, response):
        division = response.meta.get('division', 'null')
        print(division)
        data = json.loads(response.body)
        # Use empty dicts as defaults so the chained .get() calls do not
        # raise AttributeError when a key is missing from the payload.
        result = data.get('result', {})
        teamRankingList = result.get('teamRankingList', {})
        results = teamRankingList.get('results', [])
        for it in results:
            item = HuaweirankItem()
            item['division'] = division
            item['score'] = it.get('score', -1)
            # item['rank'] = it.get('ranking', -1)
            item['team_name'] = it.get('teamName', "null")
            item['submit_time'] = it.get('submitTime', "null")
            users = []
            for user in it.get('userList', []):
                users.append(user.get('domainName', 'null'))
            item['users'] = users
            yield item
--------------------------------------------------------------------------------
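parse() expects the ranking endpoint to return JSON shaped roughly as result -> teamRankingList -> results -> [team entries]; the field names below are inferred from what the code reads, not from any API documentation. A minimal offline sketch that feeds a hand-built payload through parse() (run it from the project root so the HuaweiRank package is importable):

import json
from scrapy.http import Request, TextResponse
from HuaweiRank.spiders.rank import RankSpider

# Illustrative payload only -- it mirrors the keys parse() looks up.
payload = {
    "result": {
        "teamRankingList": {
            "results": [
                {
                    "score": 99.5,
                    "teamName": "demo-team",
                    "submitTime": "2020-01-01 00:00:00",
                    "userList": [{"domainName": "alice"}, {"domainName": "bob"}],
                }
            ]
        }
    }
}

url = "https://competition.huaweicloud.com/competition/v1/competitions/ranking/0?stage_id=0"
request = Request(url, meta={"division": "demo"})
response = TextResponse(url=url, body=json.dumps(payload).encode("utf-8"),
                        request=request, encoding="utf-8")

for item in RankSpider().parse(response):
    print(dict(item))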
/HuaweiRank/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for HuaweiRank project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'HuaweiRank'

SPIDER_MODULES = ['HuaweiRank.spiders']
NEWSPIDER_MODULE = 'HuaweiRank.spiders'
LOG_LEVEL = "WARNING"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'HuaweiRank (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'HuaweiRank.middlewares.HuaweirankSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'HuaweiRank.middlewares.HuaweirankDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'HuaweiRank.pipelines.HuaweirankPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
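settings.py disables robots.txt handling and leaves the download-delay and AutoThrottle options commented out. If you wanted to crawl more politely without touching the project-wide settings, Scrapy also lets a spider carry its own overrides via the custom_settings class attribute; the subclass below is a hypothetical sketch, not part of this repository.

from HuaweiRank.spiders.rank import RankSpider


class PoliteRankSpider(RankSpider):
    # Hypothetical variant: same requests and parsing as RankSpider,
    # but with per-spider setting overrides applied by Scrapy at startup.
    name = 'rank_polite'
    custom_settings = {
        'DOWNLOAD_DELAY': 1,           # pause between requests to the same site
        'AUTOTHROTTLE_ENABLED': True,  # let Scrapy adapt the delay automatically
    }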
/HuaweiRank/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class HuaweirankSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class HuaweirankDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
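HuaweirankDownloaderMiddleware is enabled in settings.py but is still the unmodified startproject template, so it currently changes nothing about requests or responses. If it were meant to do real work, for example sending a fixed User-Agent because the USER_AGENT setting is commented out, a middleware along the lines below would do it; this is a hypothetical sketch, not code from the repository, and the header value is illustrative.

class UserAgentDownloaderMiddleware(object):
    """Hypothetical example: attach a default User-Agent to every request."""

    def process_request(self, request, spider):
        # Only set the header if nothing else has set one already.
        request.headers.setdefault(
            'User-Agent', 'Mozilla/5.0 (compatible; HuaweiRank crawler)')
        return None  # continue normal processing of the request

It would be enabled the same way HuaweirankDownloaderMiddleware is, by listing it under DOWNLOADER_MIDDLEWARES in settings.py.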