├── Pinnacle
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── pinnacle.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── .gitignore
├── scrapy.cfg
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/Pinnacle/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
*.iml
geckodriver.log
*.json
__pycache__

--------------------------------------------------------------------------------
/Pinnacle/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/Pinnacle/items.py:
--------------------------------------------------------------------------------
import scrapy


class Event(scrapy.Item):
    """Container for a single esports betting event."""
    game = scrapy.Field()
    date = scrapy.Field()
    player1 = scrapy.Field()
    odds1 = scrapy.Field()
    player2 = scrapy.Field()
    odds2 = scrapy.Field()
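A note on usage: `scrapy.Item` subclasses behave like dicts, which is how the spider and pipeline read and write these fields. A minimal sketch (the values here are invented for illustration):

```python
from Pinnacle.items import Event

item = Event()
item['game'] = 'Dota 2'   # fields are accessed like dict keys
item['odds1'] = 1.862
print(dict(item))         # {'game': 'Dota 2', 'odds1': 1.862}
```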
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = Pinnacle.settings

[deploy]
#url = http://localhost:6800/
project = Pinnacle

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2018, Alexander Shums'kii
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/Pinnacle/pipelines.py:
--------------------------------------------------------------------------------
"""
This pipeline does the following:
- converts each event's date to a UTC datetime
- drops events/games that have already passed
- drops events/games that are not primary (i.e. events where you bet on 'first blood', '1st map winner', etc.)

Set your own value for :var:`TIME_DIFFERENCE`.
"""
import datetime

from scrapy.exceptions import DropItem

# It seems all dates on the site are shown in the GMT-8 zone (at least for me).
# For now the difference between UTC and site time is -8 hours (but this needs monitoring).
# Put your own value here (with sign) if it differs for you.
TIME_DIFFERENCE = -8


# Don't forget to activate this pipeline in settings.py (see ITEM_PIPELINES).
class PinnaclePipeline(object):

    def process_item(self, item, spider):

        # Transform the scraped date string into a UTC datetime.
        current_time_utc = datetime.datetime.utcnow()
        time_string = item['date']  # looks like: "Sat 03/02 15.15"
        _, date_part, time_part = time_string.split(" ")
        day, month = date_part.split("/")
        hour, minute = time_part.split(".")
        # Rough year-rollover heuristic: if we scrape in December and the event
        # month is early in the year, assume the event happens next year.
        if (current_time_utc.month == 12) and (int(month) < 11):
            year = current_time_utc.year + 1
        else:
            year = current_time_utc.year
        site_time = datetime.datetime(year=year, month=int(month), day=int(day),
                                      hour=int(hour), minute=int(minute))
        game_time_utc = site_time - datetime.timedelta(hours=TIME_DIFFERENCE)

        if current_time_utc > game_time_utc:
            raise DropItem("Event already passed or in progress")
        item['date'] = game_time_utc

        # Keep only primary events:
        # non-primary events have brackets '()' in player names, e.g. "Los Angeles Valiant (map 1)".
        if "(" in item['player1']:
            raise DropItem("Not a primary event: {}".format(item))
        if "select matches" in item['player1'].lower():
            raise DropItem("Not an event")

        return item
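To make the date handling above concrete, here is the same conversion as a standalone sketch (illustrative only: the year is hardcoded and the site offset is assumed to be -8, as in the pipeline):

```python
import datetime

TIME_DIFFERENCE = -8  # assumed site offset, as in pipelines.py

time_string = "Sat 03/02 15.15"  # a date string as scraped from the site
_, date_part, time_part = time_string.split(" ")
day, month = date_part.split("/")
hour, minute = time_part.split(".")
site_time = datetime.datetime(2018, int(month), int(day), int(hour), int(minute))
game_time_utc = site_time - datetime.timedelta(hours=TIME_DIFFERENCE)
print(game_time_utc)  # 2018-02-03 23:15:00
```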
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pinnacle-Scraper
Scraping esports betting information from the website www.pinnacle.com using Scrapy and Selenium.

Take note: this script was created for educational purposes, to demonstrate the usage of Scrapy *Pipelines*,
*LinkExtractors*, *Rules*, *Generic Spiders*, *Items* and *XPath selectors*.

What exactly does this spider do (general algorithm):
1. Gather links to the betting pages for each esports event (using an appropriate set of rules).
2. Follow each extracted link and scrape the esports data.
3. Filter the gathered data in the pipeline.

After all processing has finished, we get information about each upcoming esports event. Events that have
already passed (or are in progress) are excluded, as is betting data for non-primary events (such as bets
on "first blood", "second map winner", etc.). Event/game times are converted to UTC. (If you want to include
all events and keep the original "site time", comment out the code inside "pipelines.py" or disable the
pipeline in "settings.py".)

Keys and description for each returned record:
- 'date' - date of the event/game as a datetime object, converted to UTC (or a best attempt at it);
- 'game' - name of the game (CS:GO, League of Legends, Dota 2, etc.);
- 'player1' - name of the first participant (or team name, e.g. "Fnatic" or "Team Liquid");
- 'player2' - name of the second participant;
- 'odds1' - betting odds on the first player (a float value, e.g. 1.862);
- 'odds2' - betting odds on the second player (a float value).

This script was written in Python 3.6 (for Scrapy 1.5) and tested on a Windows machine. Before running it,
you'll need to install:
- Scrapy (on a Windows machine you'll need the appropriate C++ SDK to run Twisted - check their docs);
- Selenium (with geckodriver on Windows machines);
- the Firefox browser.

After installing all requirements, copy the "Pinnacle" folder to your machine/device. Open "pipelines.py"
and set the variable "TIME_DIFFERENCE" to your own value (if needed).
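If you are unsure which offset to use, one rough way to check (purely illustrative) is to compare the current UTC time against an event time shown on the site:

```python
import datetime

# Hypothetical example: the site lists an event starting right now at 15:15 while
# datetime.utcnow() reads 23:15, so the site clock runs 8 hours behind UTC.
print(datetime.datetime.utcnow())  # compare this with the times shown on the site
TIME_DIFFERENCE = -8               # then set this value in pipelines.py
```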
To run the spider, change your terminal's working directory to the Scrapy project folder and type:
```scrapy crawl pinnacle```
To save the data to a .json file (for example), type:
```scrapy crawl pinnacle -o yourfile.json```
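A line in yourfile.json should then look roughly like this (all field values here are invented for illustration):

```json
{"game": "CS:GO", "date": "2018-02-03 23:15:00", "player1": "Fnatic", "odds1": 1.862, "player2": "Team Liquid", "odds2": 2.04}
```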
--------------------------------------------------------------------------------
/Pinnacle/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for Pinnacle project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Pinnacle'

SPIDER_MODULES = ['Pinnacle.spiders']
NEWSPIDER_MODULE = 'Pinnacle.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Pinnacle (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Pinnacle.middlewares.PinnacleSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Pinnacle.middlewares.PinnacleDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Pinnacle.pipelines.PinnaclePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
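As the README notes, you can skip the UTC conversion and filtering entirely by disabling the pipeline; replacing the ITEM_PIPELINES dict above with an empty one is enough:

```python
# Keep raw, unfiltered items (original "site time" preserved):
ITEM_PIPELINES = {}
```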
--------------------------------------------------------------------------------
/Pinnacle/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class PinnacleSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class PinnacleDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
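Both classes above are the unmodified stubs generated by `scrapy startproject` and are inactive: the middleware dicts in settings.py are commented out. To experiment with them, you would uncomment the corresponding entry, for example:

```python
# In Pinnacle/settings.py:
SPIDER_MIDDLEWARES = {
    'Pinnacle.middlewares.PinnacleSpiderMiddleware': 543,
}
```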
--------------------------------------------------------------------------------
/Pinnacle/spiders/pinnacle.py:
--------------------------------------------------------------------------------
"""
This scrapy spider scrapes betting data about esports events from the website "pinnacle.com".

Take note: this script was created for educational purposes, to demonstrate the usage of Scrapy *Pipelines*,
*LinkExtractors*, *Rules*, *Generic Spiders*, *Items* and *XPath selectors*.

What exactly does this spider do (general algorithm):
1. Gather links to the betting pages for each esports event (using an appropriate set of rules).
2. Follow each extracted link and scrape the esports data.
3. Filter the gathered data in the pipeline.
After all processing has finished, we get information about each upcoming esports event. Events that have
already passed (or are in progress) are excluded, as is betting data for non-primary events (such as bets
on "first blood", "second map winner", etc.). Event/game times are converted to UTC. (If you want to include
all events and keep the original "site time", comment out the code inside "pipelines.py" or disable the
pipeline in "settings.py".)

Keys and description for each returned record:
- 'date' - date of the event/game as a datetime object, converted to UTC (or a best attempt at it);
- 'game' - name of the game (CS:GO, League of Legends, Dota 2, etc.);
- 'player1' - name of the first participant (or team name, e.g. "Fnatic" or "Team Liquid");
- 'player2' - name of the second participant;
- 'odds1' - betting odds on the first player (a float value, e.g. 1.862);
- 'odds2' - betting odds on the second player (a float value).

This script was written in Python 3.6 (for Scrapy 1.5) and tested on a Windows machine. Before running it,
you'll need to install:
- Scrapy (on a Windows machine you'll need the appropriate C++ SDK to run Twisted - check their docs);
- Selenium (with geckodriver on Windows machines);
- the Firefox browser.
After installing all requirements, copy the "Pinnacle" folder to your machine/device. Open "pipelines.py"
and set the variable "TIME_DIFFERENCE" to your own value (if needed).

To run the spider, change your terminal's working directory to the Scrapy project folder and type: scrapy crawl pinnacle
To save the data to a .json file (for example), type: scrapy crawl pinnacle -o yourfile.json
"""

import time

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from scrapy import Selector
from Pinnacle.items import Event

from selenium import webdriver


class Pinnacle(CrawlSpider):
    """
    Spider for extracting links, following them and parsing data from the responses.
    Note: we use the generic scrapy spider "CrawlSpider" here (instead of "scrapy.Spider"),
    together with a set of rules, to extract only the "required" urls.
    """

    name = 'pinnacle'
    allowed_domains = ["www.pinnacle.com"]
    start_urls = ["https://www.pinnacle.com/en/"]
    # Our esports events always have this part in their links: "odds/match/e-sports/"
    rules = (
        Rule(LxmlLinkExtractor(allow="odds/match/e-sports/",
                               allow_domains=allowed_domains,
                               restrict_css="ul li.level-2", unique=True), callback='parse_item'),
    )

    def load_page(self, url, sleeptime):
        """Load a page with selenium and return its source after the page has fully loaded."""
        driver = webdriver.Firefox()
        driver.get(url)
        time.sleep(sleeptime)
        source = Selector(text=driver.page_source)
        driver.quit()  # quit() (rather than close()) also shuts down the geckodriver process
        return source

    def parse_item(self, response):
        sleeptime = 2
        game_name = response.xpath('//header//div[@class="breadcrumbs"]/a[3]/text()').extract_first()
        # For the 'New Markets' category we need to get the game name from another place:
        if game_name == "New Markets":
            # we get something like "eSports CS:GO - GOTV.GG Invitational Odds"
            game_name = response.xpath('//h1[@class="sport-title"]/text()').extract_first()
            # take the 2nd word from that string
            game_name = game_name.split(" ")[1]

        # Getting the dynamically loaded content:
        source = self.load_page(response.url, sleeptime)
        # Find all tables with events on the current page and loop through them:
        events_table = source.xpath('//div[@ng-repeat="date in currentPeriod.dates"]')
        for table in events_table:
            # get all rows in the current table and loop through them (note the relative '.' in the xpath):
            rows = table.xpath('.//table[@class="odds-data"]//tbody')
            date_string = table.xpath('.//div[@class="toolbar"]//span[2]/text()').extract_first()
            for row in rows:
                item = Event()  # a fresh item per row, so yielded items don't share state
                time_string = row.xpath('.//tr[1]//td[@class="game-time ng-scope"]//span/text()').extract_first()
                site_date_string = date_string + time_string

                player1 = row.xpath('.//tr[1]//td[@class="game-name name"]//span/text()').extract_first()
                odds1 = row.xpath('.//tr[1]//td[@class="oddTip game-moneyline"]//span/text()').extract_first()
                try:
                    odds1 = float(odds1.strip())
                except (AttributeError, ValueError):  # missing cell or non-numeric text: keep as scraped
                    pass

                player2 = row.xpath('.//tr[2]//td[@class="game-name name"]//span/text()').extract_first()
                odds2 = row.xpath('.//tr[2]//td[@class="oddTip game-moneyline"]//span/text()').extract_first()
                try:
                    odds2 = float(odds2.strip())
                except (AttributeError, ValueError):
                    pass

                item['game'] = game_name
                item['date'] = site_date_string
                item['player1'] = player1
                item['odds1'] = odds1
                item['player2'] = player2
                item['odds2'] = odds2
                yield item
--------------------------------------------------------------------------------