├── HuaweiRank
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   ├── settings.cpython-36.pyc
│   │   └── middlewares.cpython-36.pyc
│   ├── spiders
│   │   ├── __pycache__
│   │   │   ├── rank.cpython-36.pyc
│   │   │   └── __init__.cpython-36.pyc
│   │   ├── __init__.py
│   │   └── rank.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── run.sh
├── README.md
└── scrapy.cfg

/HuaweiRank/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
scrapy crawl rank
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# HuaWeiRankList
Leaderboard for the Huawei Software Elite Challenge (华为软件精英挑战赛)

# Environment
python3
pip3 install scrapy
pip3 install pandas

# Run
sh run.sh
--------------------------------------------------------------------------------
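Note: run.sh simply invokes `scrapy crawl rank` from the project root. If you prefer to start the crawl from Python instead of the shell, a minimal sketch using Scrapy's CrawlerProcess is shown below; the file name run.py is hypothetical and not part of this repository.

# run.py (hypothetical) -- programmatic equivalent of `sh run.sh`.
# Run it from the project root so scrapy.cfg / HuaweiRank.settings are found.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads HuaweiRank.settings
process.crawl('rank')                             # the spider's `name` attribute
process.start()                                   # blocks until the crawl finishes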
/HuaweiRank/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = HuaweiRank.settings

[deploy]
#url = http://localhost:6800/
project = HuaweiRank
--------------------------------------------------------------------------------
/HuaweiRank/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class HuaweirankItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    division = scrapy.Field()
    score = scrapy.Field()
    # rank = scrapy.Field()
    team_name = scrapy.Field()
    users = scrapy.Field()
    submit_time = scrapy.Field()
--------------------------------------------------------------------------------
/HuaweiRank/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import pandas as pd


class HuaweirankPipeline(object):
    def open_spider(self, spider):
        self.df_result = []

    def process_item(self, item, spider):
        self.df_result.append(item)
        return item

    def close_spider(self, spider):
        result = pd.DataFrame(self.df_result)
        result.to_csv("rank.csv", index=False, encoding="utf_8_sig")
--------------------------------------------------------------------------------
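HuaweirankPipeline buffers every scraped item in memory and only writes rank.csv when the spider closes, so the CSV appears once the crawl has finished. A small sketch of inspecting that output with pandas (it assumes a crawl has already produced rank.csv; the column names follow the fields declared in items.py):

import pandas as pd

# rank.csv is written by HuaweirankPipeline.close_spider()
df = pd.read_csv("rank.csv")

# Highest score first within each division.
top = df.sort_values(["division", "score"], ascending=[True, False])
print(top[["division", "team_name", "score", "submit_time"]].head(20))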
/HuaweiRank/spiders/rank.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
import json
from HuaweiRank.items import HuaweirankItem


class RankSpider(scrapy.Spider):
    name = 'rank'
    allowed_domains = ['competition.huaweicloud.com']

    def start_requests(self):
        # One (ranking id, stage_id) pair per division; the order matches
        # the `divisions` list below.
        nav_list = [
            (1000036574, 136710),
            (1000036576, 136712),
            (1000036577, 136713),
            (1000036578, 136714),
            (1000036579, 136715),
            (1000036580, 136716),
            (1000036581, 136717),
            (1000036582, 136718),
            (1000036583, 136719)
        ]
        urls = ['https://competition.huaweicloud.com/competition/v1/competitions/ranking/{0}?stage_id={1}&page_no=1&page_size=64'.format(
            it[0], it[1]) for it in nav_list]
        divisions = ["京津东北赛区", "上合赛区", "杭厦赛区", "江山赛区",
                     "成渝赛区", "西北赛区", "武长赛区", "粤港澳赛区", "海外赛区"]
        for url, division in zip(urls, divisions):
            yield scrapy.Request(url=url, callback=self.parse, meta={"division": division})

    def parse(self, response):
        division = response.meta.get('division', 'null')
        print(division)
        data = json.loads(response.body)
        # Use empty dicts as defaults so the chained .get() calls do not
        # raise AttributeError when a key is missing from the payload.
        result = data.get('result', {})
        teamRankingList = result.get('teamRankingList', {})
        results = teamRankingList.get('results', [])
        for it in results:
            item = HuaweirankItem()
            item['division'] = division
            item['score'] = it.get('score', -1)
            # item['rank'] = it.get('ranking', -1)
            item['team_name'] = it.get('teamName', "null")
            item['submit_time'] = it.get('submitTime', "null")
            users = []
            for user in it.get('userList', []):
                users.append(user.get('domainName', 'null'))
            item['users'] = users
            yield item
--------------------------------------------------------------------------------
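parse() expects the ranking endpoint to return JSON shaped roughly as result -> teamRankingList -> results -> [team entries]; the field names below are inferred from what the code reads, not from any API documentation. A minimal offline sketch that feeds a hand-built payload through parse() (run it from the project root so the HuaweiRank package is importable):

import json
from scrapy.http import Request, TextResponse
from HuaweiRank.spiders.rank import RankSpider

# Illustrative payload only -- it mirrors the keys parse() looks up.
payload = {
    "result": {
        "teamRankingList": {
            "results": [
                {
                    "score": 99.5,
                    "teamName": "demo-team",
                    "submitTime": "2020-01-01 00:00:00",
                    "userList": [{"domainName": "alice"}, {"domainName": "bob"}],
                }
            ]
        }
    }
}

url = "https://competition.huaweicloud.com/competition/v1/competitions/ranking/0?stage_id=0"
request = Request(url, meta={"division": "demo"})
response = TextResponse(url=url, body=json.dumps(payload).encode("utf-8"),
                        request=request, encoding="utf-8")

for item in RankSpider().parse(response):
    print(dict(item))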
/HuaweiRank/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for HuaweiRank project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'HuaweiRank'

SPIDER_MODULES = ['HuaweiRank.spiders']
NEWSPIDER_MODULE = 'HuaweiRank.spiders'
LOG_LEVEL = "WARNING"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'HuaweiRank (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'HuaweiRank.middlewares.HuaweirankSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'HuaweiRank.middlewares.HuaweirankDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'HuaweiRank.pipelines.HuaweirankPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
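settings.py disables robots.txt handling and leaves the download-delay and AutoThrottle options commented out. If you wanted to crawl more politely without touching the project-wide settings, Scrapy also lets a spider carry its own overrides via the custom_settings class attribute; the subclass below is a hypothetical sketch, not part of this repository.

from HuaweiRank.spiders.rank import RankSpider


class PoliteRankSpider(RankSpider):
    # Hypothetical variant: same requests and parsing as RankSpider,
    # but with per-spider setting overrides applied by Scrapy at startup.
    name = 'rank_polite'
    custom_settings = {
        'DOWNLOAD_DELAY': 1,           # pause between requests to the same site
        'AUTOTHROTTLE_ENABLED': True,  # let Scrapy adapt the delay automatically
    }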
/HuaweiRank/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class HuaweirankSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class HuaweirankDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
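HuaweirankDownloaderMiddleware is enabled in settings.py but is still the unmodified startproject template, so it currently changes nothing about requests or responses. If it were meant to do real work, for example sending a fixed User-Agent because the USER_AGENT setting is commented out, a middleware along the lines below would do it; this is a hypothetical sketch, not code from the repository, and the header value is illustrative.

class UserAgentDownloaderMiddleware(object):
    """Hypothetical example: attach a default User-Agent to every request."""

    def process_request(self, request, spider):
        # Only set the header if nothing else has set one already.
        request.headers.setdefault(
            'User-Agent', 'Mozilla/5.0 (compatible; HuaweiRank crawler)')
        return None  # continue normal processing of the request

It would be enabled the same way HuaweirankDownloaderMiddleware is, by listing it under DOWNLOADER_MIDDLEWARES in settings.py.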