├── Pinnacle
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── pinnacle.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── .gitignore
├── scrapy.cfg
├── LICENSE
└── README.md
/Pinnacle/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *iml
3 | geckodriver.log
4 | *json
5 | __pycache__
--------------------------------------------------------------------------------
/Pinnacle/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/Pinnacle/items.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 |
3 |
4 | class Event(scrapy.Item):
5 |     game = scrapy.Field()
6 |     date = scrapy.Field()
7 |     player1 = scrapy.Field()
8 |     odds1 = scrapy.Field()
9 |     player2 = scrapy.Field()
10 |     odds2 = scrapy.Field()
11 |
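12 | # Usage sketch (hypothetical values) - scrapy.Item instances behave like dicts:
13 | #     event = Event(game='CS:GO', player1='Fnatic', odds1=1.862)
14 | #     event['player2'] = 'Team Liquid'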
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = Pinnacle.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = Pinnacle
12 |
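13 | # A deployment sketch (an assumption - requires the scrapyd-client package and a
14 | # running scrapyd instance): uncomment `url` above, then run `scrapyd-deploy`
15 | # from this directory.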
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018, Alexander Shums'kii
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/Pinnacle/pipelines.py:
--------------------------------------------------------------------------------
1 | """
2 | This pipeline will do next:
3 | - transform date of the single event to UTC format
4 | - Drop all events/games that already passed
5 | - Drop all events/games that not a primary(events, where you betting on 'first blood', '1st map winner' etc)
6 |
7 | Enter your own value for :var:*TIME_DIFFERENCE*
8 | """
9 | import datetime
10 |
11 | from scrapy.exceptions import DropItem
12 |
13 | # It seems all dates on the site are shown in the GMT-8 zone (for me).
14 | # For now the difference between UTC and site time is -8 hours (but this needs monitoring).
15 | # Put your own value here, with sign, if it's different for you.
16 | TIME_DIFFERENCE = -8
17 |
18 |
19 | # just don't forget to activate this pipeline in settings.py (ITEM_PIPELINES)
20 | class PinnaclePipeline(object):
21 |
22 |     def process_item(self, item, spider):
23 |
24 |         # transforming the string date to a UTC datetime
25 |         current_time_utc = datetime.datetime.utcnow()
26 |         time_string = item['date']  # it looks like this: "Sat 03/02 15.15"
27 |         _, day_month, hour_minute = time_string.split(" ")
28 |         day, month = day_month.split("/")
29 |         hour, minute = hour_minute.split(".")
30 |         if (current_time_utc.month == 12) and (int(month) < 11):  # not ideal, but works for the most part
31 |             year = current_time_utc.year + 1
32 |         else:
33 |             year = current_time_utc.year
34 |         site_time = datetime.datetime(year=year, month=int(month), day=int(day),
35 |                                       hour=int(hour), minute=int(minute))
36 |         game_time_utc = site_time - datetime.timedelta(hours=TIME_DIFFERENCE)
37 |
38 |         if current_time_utc > game_time_utc:
39 |             raise DropItem("Event already passed or in progress")
40 |         item['date'] = game_time_utc
41 |
42 |         # Now selecting primary events:
43 |         # non-primary events have parentheses '()' in the player names: "Los Angeles Valiant (map 1)"
44 |         if "(" in item['player1']:
45 |             raise DropItem("Not a primary event: {}".format(item))
46 |         if "select matches" in item['player1'].lower():
47 |             raise DropItem("Not an event")
48 |
49 |         return item
50 |
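51 | # A quick worked check of the offset arithmetic above, using a hypothetical
52 | # date (run this module directly): the site shows "Sat 03/02 15.15" in GMT-8,
53 | # and 15:15 minus (-8 hours) is 23:15 UTC on the same day.
54 | if __name__ == '__main__':
55 |     example_site_time = datetime.datetime(2018, 3, 2, 15, 15)
56 |     print(example_site_time - datetime.timedelta(hours=TIME_DIFFERENCE))  # 2018-03-02 23:15:00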
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pinnacle-Scraper
2 | Scraping esports betting information from the website www.pinnacle.com using Scrapy and Selenium.
3 |
4 | Take note: this script was created for educational purposes to demonstrate the usage of Scrapy *Pipelines*,
5 | *LinkExtractors*, *Rules*, *Generic Spiders*, *Items*, and *XPath selectors*.
6 |
7 | So, what exactly does this spider do (general algorithm)?
8 | 1. Gathers links to betting pages for each esports event (using an appropriate set of rules).
9 | 2. Follows each extracted link and scrapes the esports data.
10 | 3. Filters the gathered data in the pipeline.
11 |
12 | After all processes have finished, we get information about each upcoming esports event. Events that
13 | have already passed (or are in progress) are not included, nor is betting data for non-primary events
14 | (such as betting on "first blood", "second map winner", etc.). Also, the event/game time is converted
15 | to UTC. (If you want to include all events and keep the original "site time", comment out the code
16 | inside "pipelines.py" or disable the pipeline in "settings.py".)
17 |
18 | Keys and descriptions for each returned item:
19 | - 'date' - date of the event/game as a datetime converted to UTC (best effort);
20 | - 'game' - name of the game (CS:GO, League of Legends, Dota 2, etc.);
21 | - 'player1' - name of the first participant (or team name, like "Fnatic" or "Team Liquid");
22 | - 'player2' - name of the second participant;
23 | - 'odds1' - betting odds on the first player (a float value, like 1.862);
24 | - 'odds2' - betting odds on the second player (a float value).
25 |
26 | This script was written in Python 3.6 (for Scrapy 1.5) and tested on a Windows machine. Before running it,
27 | you'll need to install:
28 | - Scrapy (on Windows you'll need the appropriate C++ build tools to run Twisted - check their docs);
29 | - Selenium (with geckodriver for Windows machines);
30 | - Firefox browser.
31 |
32 | After installing all requirements, copy the "Pinnacle" folder to your machine/device. Open "pipelines.py"
33 | and set the variable "TIME_DIFFERENCE" to your own value (if needed).
34 |
35 | To run the spider, change your terminal location to the Scrapy project folder and type:
36 | ```scrapy crawl pinnacle```
37 | To save the data to a .json file (for example), type:
38 | ```scrapy crawl pinnacle -o yourfile.json```
39 |
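40 | ## Example output
41 |
42 | A single scraped item (hypothetical values, shown the way Scrapy logs items) looks like:
43 |
44 | ```python
45 | {'date': datetime.datetime(2018, 3, 2, 23, 15),
46 |  'game': 'CS:GO',
47 |  'player1': 'Fnatic', 'odds1': 1.862,
48 |  'player2': 'Team Liquid', 'odds2': 2.010}
49 | ```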
--------------------------------------------------------------------------------
/Pinnacle/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for Pinnacle project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'Pinnacle'
13 |
14 | SPIDER_MODULES = ['Pinnacle.spiders']
15 | NEWSPIDER_MODULE = 'Pinnacle.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'Pinnacle (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'Pinnacle.middlewares.PinnacleSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'Pinnacle.middlewares.PinnacleDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'Pinnacle.pipelines.PinnaclePipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/Pinnacle/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class PinnacleSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 |
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 |
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 |
27 |         # Should return None or raise an exception.
28 |         return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 |
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 |
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn't have a response associated.
50 |
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 |
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class PinnacleDownloaderMiddleware(object):
60 |     # Not all methods need to be defined. If a method is not defined,
61 |     # scrapy acts as if the downloader middleware does not modify the
62 |     # passed objects.
63 |
64 |     @classmethod
65 |     def from_crawler(cls, crawler):
66 |         # This method is used by Scrapy to create your spiders.
67 |         s = cls()
68 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 |         return s
70 |
71 |     def process_request(self, request, spider):
72 |         # Called for each request that goes through the downloader
73 |         # middleware.
74 |
75 |         # Must either:
76 |         # - return None: continue processing this request
77 |         # - or return a Response object
78 |         # - or return a Request object
79 |         # - or raise IgnoreRequest: process_exception() methods of
80 |         #   installed downloader middleware will be called
81 |         return None
82 |
83 |     def process_response(self, request, response, spider):
84 |         # Called with the response returned from the downloader.
85 |
86 |         # Must either:
87 |         # - return a Response object
88 |         # - return a Request object
89 |         # - or raise IgnoreRequest
90 |         return response
91 |
92 |     def process_exception(self, request, exception, spider):
93 |         # Called when a download handler or a process_request()
94 |         # (from other downloader middleware) raises an exception.
95 |
96 |         # Must either:
97 |         # - return None: continue processing this exception
98 |         # - return a Response object: stops process_exception() chain
99 |         # - return a Request object: stops process_exception() chain
100 |         pass
101 |
102 |     def spider_opened(self, spider):
103 |         spider.logger.info('Spider opened: %s' % spider.name)
104 |
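105 | # These template middlewares stay inactive by default; a sketch of enabling the
106 | # spider middleware (matching the commented entry in settings.py):
107 | #
108 | #     SPIDER_MIDDLEWARES = {
109 | #         'Pinnacle.middlewares.PinnacleSpiderMiddleware': 543,
110 | #     }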
--------------------------------------------------------------------------------
/Pinnacle/spiders/pinnacle.py:
--------------------------------------------------------------------------------
1 | """
2 | This scrapy spider will scrape betting data about esports events from web site "pinnacle.com"
3 |
4 | Take note: script was created for educational purposes to demonstrate usage of scrapy *Pipelines*,
5 | *LinkExtractors*, "Rules", *Generic Spiders*, *Items*, *xpath selectors*.
6 |
7 | So, what does this spider exactly doing(general algorithm):
8 | 1. Gather links to betting pages for each esport event(using appropriate set of rules).
9 | 2. Follow each extracted link and scrape esport data.
10 | 3. Filter gathered data in the pipeline.
11 | After all processes finished we will get information about each single esport event to come. But, we will
12 | not include events, that already passed(or in progress), and betting data for not primary events(such as betting
13 | on "first blood", "second map winner" etc). Also, event/game time will be converted to UTC format. (If you want
14 | include all events and keep original "site time" - comment code inside "pipelines.py" file or exclude pipelines
15 | in "setting.py").
16 |
17 | Keys and description for each returning line of information:
18 | - 'date' - date of the single event/game in timedate format converted to UTC time(or tried to);
19 | - 'game' - name of the game(CS:GO, League of Legends, Dota 2 etc);
20 | - 'player1' - name of the first participant(or team name, like: "Fnatic" or "Team Liquid" etc);
21 | - 'player2' - name of the second participant;
22 | - 'odds1' - bet rate on the first player(float value, like: 1.862);
23 | - 'odds2' - bet rate on the second player(float value).
24 |
25 | This script was written in Python 3.6(for scrapy 1.5) and tested on Windows machine. Before running it,
26 | you'll need to install:
27 | - Scrapy (on Windows machine you'll need appropriate C++ SDK to run Twisted - check their docs);
28 | - Selenium (with geckodriver for Windows machines);
29 | - Firefox browser.
30 | After installing all requirements - copy "Pinnacle" folder to your machine/device. Open "pipelines.py" file
31 | and set variable "TIME_DIFFERENCE" to your own value (if needed).
32 |
33 | To run a spider - change your location in terminal to scrapy project folder and type: scrapy crawl pinnacle
34 | To save data to .json file(for example), type: scrapy crawl pinnacle -o yourfile.json
35 | """
36 |
37 | import time
38 |
39 | from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
40 | from scrapy.spiders import Rule, CrawlSpider
41 | from scrapy import Selector
42 | from Pinnacle.items import Event
43 |
44 | from selenium import webdriver
45 |
46 |
47 | class Pinnacle(CrawlSpider):
48 |     """
49 |     Spider for extracting links, following them and parsing data from the responses.
50 |     Note: here we use the generic Scrapy spider "CrawlSpider" (instead of "scrapy.Spider")
51 |     and a set of rules to extract only the "required" urls.
52 |     """
53 |
54 |     name = 'pinnacle'
55 |     allowed_domains = ["www.pinnacle.com"]
56 |     start_urls = ["https://www.pinnacle.com/en/"]
57 |     # Our esports events always have this part in their links: "odds/match/e-sports/"
58 |     rules = (
59 |         Rule(LxmlLinkExtractor(allow="odds/match/e-sports/",
60 |                                allow_domains=allowed_domains,
61 |                                restrict_css="ul li.level-2", unique=True), callback='parse_item'),
62 |     )
63 |
64 |     def load_page(self, url, sleeptime):
65 |         """Load the page with Selenium and return its source after the dynamic content has loaded"""
66 |         driver = webdriver.Firefox()
67 |         driver.get(url)
68 |         time.sleep(sleeptime)  # a fixed delay; an explicit wait would be more robust
69 |         source = Selector(text=driver.page_source)
70 |         driver.quit()  # quit() (not close()) also terminates the geckodriver process
71 |         return source
72 |
73 |     def parse_item(self, response):
74 |         sleeptime = 2
75 |         game_name = response.xpath('//header//div[@class="breadcrumbs"]/a[3]/text()').extract_first()
76 |         # ok, for the 'New Markets' category we need to get the game name from another place
77 |         if game_name == "New Markets":
78 |             # we get something like this: "eSports CS:GO - GOTV.GG Invitational Odds"
79 |             game_name = response.xpath('//h1[@class="sport-title"]/text()').extract_first()
80 |             # take the 2nd word from that string
81 |             game_name = game_name.split(" ")[1]
82 |
83 |         # getting the dynamically loaded content:
84 |         source = self.load_page(response.url, sleeptime)
85 |         # Now find all tables with events on the current page and loop through them
86 |         events_table = source.xpath('//div[@ng-repeat="date in currentPeriod.dates"]')
87 |         for table in events_table:
88 |             # get all rows in the current table and loop through them (don't forget the relative '.' in the xpath):
89 |             rows = table.xpath('.//table[@class="odds-data"]//tbody')
90 |             date_string = table.xpath('.//div[@class="toolbar"]//span[2]/text()').extract_first()
91 |             for row in rows:
92 |                 item = Event()  # a fresh item per row, so previously yielded items are not mutated
93 |                 time_string = row.xpath('.//tr[1]//td[@class="game-time ng-scope"]//span/text()').extract_first()
94 |                 site_date_string = date_string + time_string
95 |
96 |                 player1 = row.xpath('.//tr[1]//td[@class="game-name name"]//span/text()').extract_first()
97 |                 odds1 = row.xpath('.//tr[1]//td[@class="oddTip game-moneyline"]//span/text()').extract_first()
98 |                 try:
99 |                     odds1 = float(odds1.strip())
100 |                 except (AttributeError, ValueError):  # odds missing or not numeric - keep the raw value
101 |                     pass
102 |
103 |                 player2 = row.xpath('.//tr[2]//td[@class="game-name name"]//span/text()').extract_first()
104 |                 odds2 = row.xpath('.//tr[2]//td[@class="oddTip game-moneyline"]//span/text()').extract_first()
105 |                 try:
106 |                     odds2 = float(odds2.strip())
107 |                 except (AttributeError, ValueError):  # odds missing or not numeric - keep the raw value
108 |                     pass
109 |
110 |                 item['game'] = game_name
111 |                 item['date'] = site_date_string
112 |                 item['player1'] = player1
113 |                 item['odds1'] = odds1
114 |                 item['player2'] = player2
115 |                 item['odds2'] = odds2
116 |                 yield item
117 |
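118 | # A minimal sketch (assuming it is run from the project root, next to scrapy.cfg)
119 | # of launching this spider from a plain Python script instead of the CLI:
120 | #
121 | #     from scrapy.crawler import CrawlerProcess
122 | #     from scrapy.utils.project import get_project_settings
123 | #
124 | #     process = CrawlerProcess(get_project_settings())
125 | #     process.crawl('pinnacle')
126 | #     process.start()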
--------------------------------------------------------------------------------