├── Pinnacle
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── pinnacle.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── .gitignore
├── scrapy.cfg
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/Pinnacle/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
*.iml
geckodriver.log
*.json
__pycache__

--------------------------------------------------------------------------------
/Pinnacle/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/Pinnacle/items.py:
--------------------------------------------------------------------------------
import scrapy


class Event(scrapy.Item):
    """Container for a single esports betting event."""
    game = scrapy.Field()
    date = scrapy.Field()
    player1 = scrapy.Field()
    odds1 = scrapy.Field()
    player2 = scrapy.Field()
    odds2 = scrapy.Field()
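A note on usage: `scrapy.Item` subclasses behave like dicts, which is how the spider and pipeline read and write these fields. A minimal sketch (the values here are invented for illustration):

```python
from Pinnacle.items import Event

item = Event()
item['game'] = 'Dota 2'   # fields are accessed like dict keys
item['odds1'] = 1.862
print(dict(item))         # {'game': 'Dota 2', 'odds1': 1.862}
```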
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = Pinnacle.settings

[deploy]
#url = http://localhost:6800/
project = Pinnacle

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2018, Alexander Shums'kii
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/Pinnacle/pipelines.py:
--------------------------------------------------------------------------------
"""
This pipeline does the following:
- converts each event's date to a UTC datetime
- drops events/games that have already passed
- drops events/games that are not primary (i.e. events where you bet on 'first blood', '1st map winner', etc.)

Set your own value for :var:`TIME_DIFFERENCE`.
"""
import datetime

from scrapy.exceptions import DropItem

# It seems all dates on the site are shown in the GMT-8 zone (at least for me).
# For now the difference between UTC and site time is -8 hours (but this needs monitoring).
# Put your own value here (with sign) if it differs for you.
TIME_DIFFERENCE = -8


# Don't forget to activate this pipeline in settings.py (see ITEM_PIPELINES).
class PinnaclePipeline(object):

    def process_item(self, item, spider):

        # Transform the scraped date string into a UTC datetime.
        current_time_utc = datetime.datetime.utcnow()
        time_string = item['date']  # looks like: "Sat 03/02 15.15"
        _, date_part, time_part = time_string.split(" ")
        day, month = date_part.split("/")
        hour, minute = time_part.split(".")
        # Rough year-rollover heuristic: if we scrape in December and the event
        # month is early in the year, assume the event happens next year.
        if (current_time_utc.month == 12) and (int(month) < 11):
            year = current_time_utc.year + 1
        else:
            year = current_time_utc.year
        site_time = datetime.datetime(year=year, month=int(month), day=int(day),
                                      hour=int(hour), minute=int(minute))
        game_time_utc = site_time - datetime.timedelta(hours=TIME_DIFFERENCE)

        if current_time_utc > game_time_utc:
            raise DropItem("Event already passed or in progress")
        item['date'] = game_time_utc

        # Keep only primary events:
        # non-primary events have brackets '()' in player names, e.g. "Los Angeles Valiant (map 1)".
        if "(" in item['player1']:
            raise DropItem("Not a primary event: {}".format(item))
        if "select matches" in item['player1'].lower():
            raise DropItem("Not an event")

        return item
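To make the date handling above concrete, here is the same conversion as a standalone sketch (illustrative only: the year is hardcoded and the site offset is assumed to be -8, as in the pipeline):

```python
import datetime

TIME_DIFFERENCE = -8  # assumed site offset, as in pipelines.py

time_string = "Sat 03/02 15.15"  # a date string as scraped from the site
_, date_part, time_part = time_string.split(" ")
day, month = date_part.split("/")
hour, minute = time_part.split(".")
site_time = datetime.datetime(2018, int(month), int(day), int(hour), int(minute))
game_time_utc = site_time - datetime.timedelta(hours=TIME_DIFFERENCE)
print(game_time_utc)  # 2018-02-03 23:15:00
```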
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pinnacle-Scraper
Scraping esports betting information from the website www.pinnacle.com using Scrapy and Selenium.

Take note: this script was created for educational purposes, to demonstrate the usage of Scrapy *Pipelines*,
*LinkExtractors*, *Rules*, *Generic Spiders*, *Items* and *XPath selectors*.

What exactly does this spider do (general algorithm):
1. Gather links to the betting pages for each esports event (using an appropriate set of rules).
2. Follow each extracted link and scrape the esports data.
3. Filter the gathered data in the pipeline.

After all processing has finished, we get information about each upcoming esports event. Events that have
already passed (or are in progress) are excluded, as is betting data for non-primary events (such as bets
on "first blood", "second map winner", etc.). Event/game times are converted to UTC. (If you want to include
all events and keep the original "site time", comment out the code inside "pipelines.py" or disable the
pipeline in "settings.py".)

Keys and description for each returned record:
- 'date' - date of the event/game as a datetime object, converted to UTC (or a best attempt at it);
- 'game' - name of the game (CS:GO, League of Legends, Dota 2, etc.);
- 'player1' - name of the first participant (or team name, e.g. "Fnatic" or "Team Liquid");
- 'player2' - name of the second participant;
- 'odds1' - betting odds on the first player (a float value, e.g. 1.862);
- 'odds2' - betting odds on the second player (a float value).

This script was written in Python 3.6 (for Scrapy 1.5) and tested on a Windows machine. Before running it,
you'll need to install:
- Scrapy (on a Windows machine you'll need the appropriate C++ SDK to run Twisted - check their docs);
- Selenium (with geckodriver on Windows machines);
- the Firefox browser.

After installing all requirements, copy the "Pinnacle" folder to your machine/device. Open "pipelines.py"
and set the variable "TIME_DIFFERENCE" to your own value (if needed).
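If you are unsure which offset to use, one rough way to check (purely illustrative) is to compare the current UTC time against an event time shown on the site:

```python
import datetime

# Hypothetical example: the site lists an event starting right now at 15:15 while
# datetime.utcnow() reads 23:15, so the site clock runs 8 hours behind UTC.
print(datetime.datetime.utcnow())  # compare this with the times shown on the site
TIME_DIFFERENCE = -8               # then set this value in pipelines.py
```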
To run the spider, change your terminal's working directory to the Scrapy project folder and type:
```scrapy crawl pinnacle```
To save the data to a .json file (for example), type:
```scrapy crawl pinnacle -o yourfile.json```
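A line in yourfile.json should then look roughly like this (all field values here are invented for illustration):

```json
{"game": "CS:GO", "date": "2018-02-03 23:15:00", "player1": "Fnatic", "odds1": 1.862, "player2": "Team Liquid", "odds2": 2.04}
```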
--------------------------------------------------------------------------------
/Pinnacle/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for Pinnacle project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Pinnacle'

SPIDER_MODULES = ['Pinnacle.spiders']
NEWSPIDER_MODULE = 'Pinnacle.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Pinnacle (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Pinnacle.middlewares.PinnacleSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Pinnacle.middlewares.PinnacleDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Pinnacle.pipelines.PinnaclePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
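As the README notes, you can skip the UTC conversion and filtering entirely by disabling the pipeline; replacing the ITEM_PIPELINES dict above with an empty one is enough:

```python
# Keep raw, unfiltered items (original "site time" preserved):
ITEM_PIPELINES = {}
```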
--------------------------------------------------------------------------------
/Pinnacle/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class PinnacleSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class PinnacleDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
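Both classes above are the unmodified stubs generated by `scrapy startproject` and are inactive: the middleware dicts in settings.py are commented out. To experiment with them, you would uncomment the corresponding entry, for example:

```python
# In Pinnacle/settings.py:
SPIDER_MIDDLEWARES = {
    'Pinnacle.middlewares.PinnacleSpiderMiddleware': 543,
}
```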
--------------------------------------------------------------------------------
/Pinnacle/spiders/pinnacle.py:
--------------------------------------------------------------------------------
"""
This scrapy spider scrapes betting data about esports events from the website "pinnacle.com".

Take note: this script was created for educational purposes, to demonstrate the usage of Scrapy *Pipelines*,
*LinkExtractors*, *Rules*, *Generic Spiders*, *Items* and *XPath selectors*.

What exactly does this spider do (general algorithm):
1. Gather links to the betting pages for each esports event (using an appropriate set of rules).
2. Follow each extracted link and scrape the esports data.
3. Filter the gathered data in the pipeline.
After all processing has finished, we get information about each upcoming esports event. Events that have
already passed (or are in progress) are excluded, as is betting data for non-primary events (such as bets
on "first blood", "second map winner", etc.). Event/game times are converted to UTC. (If you want to include
all events and keep the original "site time", comment out the code inside "pipelines.py" or disable the
pipeline in "settings.py".)

Keys and description for each returned record:
- 'date' - date of the event/game as a datetime object, converted to UTC (or a best attempt at it);
- 'game' - name of the game (CS:GO, League of Legends, Dota 2, etc.);
- 'player1' - name of the first participant (or team name, e.g. "Fnatic" or "Team Liquid");
- 'player2' - name of the second participant;
- 'odds1' - betting odds on the first player (a float value, e.g. 1.862);
- 'odds2' - betting odds on the second player (a float value).

This script was written in Python 3.6 (for Scrapy 1.5) and tested on a Windows machine. Before running it,
you'll need to install:
- Scrapy (on a Windows machine you'll need the appropriate C++ SDK to run Twisted - check their docs);
- Selenium (with geckodriver on Windows machines);
- the Firefox browser.
After installing all requirements, copy the "Pinnacle" folder to your machine/device. Open "pipelines.py"
and set the variable "TIME_DIFFERENCE" to your own value (if needed).

To run the spider, change your terminal's working directory to the Scrapy project folder and type: scrapy crawl pinnacle
To save the data to a .json file (for example), type: scrapy crawl pinnacle -o yourfile.json
"""

import time

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from scrapy import Selector
from Pinnacle.items import Event

from selenium import webdriver


class Pinnacle(CrawlSpider):
    """
    Spider for extracting links, following them and parsing data from the responses.
    Note: we use the generic scrapy spider "CrawlSpider" here (instead of "scrapy.Spider"),
    together with a set of rules, to extract only the "required" urls.
    """

    name = 'pinnacle'
    allowed_domains = ["www.pinnacle.com"]
    start_urls = ["https://www.pinnacle.com/en/"]
    # Our esports events always have this part in their links: "odds/match/e-sports/"
    rules = (
        Rule(LxmlLinkExtractor(allow="odds/match/e-sports/",
                               allow_domains=allowed_domains,
                               restrict_css="ul li.level-2", unique=True), callback='parse_item'),
    )

    def load_page(self, url, sleeptime):
        """Load a page with selenium and return its source after the page has fully loaded."""
        driver = webdriver.Firefox()
        driver.get(url)
        time.sleep(sleeptime)
        source = Selector(text=driver.page_source)
        driver.quit()  # quit() (rather than close()) also shuts down the geckodriver process
        return source

    def parse_item(self, response):
        sleeptime = 2
        game_name = response.xpath('//header//div[@class="breadcrumbs"]/a[3]/text()').extract_first()
        # For the 'New Markets' category we need to get the game name from another place:
        if game_name == "New Markets":
            # we get something like "eSports CS:GO - GOTV.GG Invitational Odds"
            game_name = response.xpath('//h1[@class="sport-title"]/text()').extract_first()
            # take the 2nd word from that string
            game_name = game_name.split(" ")[1]

        # Getting the dynamically loaded content:
        source = self.load_page(response.url, sleeptime)
        # Find all tables with events on the current page and loop through them:
        events_table = source.xpath('//div[@ng-repeat="date in currentPeriod.dates"]')
        for table in events_table:
            # get all rows in the current table and loop through them (note the relative '.' in the xpath):
            rows = table.xpath('.//table[@class="odds-data"]//tbody')
            date_string = table.xpath('.//div[@class="toolbar"]//span[2]/text()').extract_first()
            for row in rows:
                item = Event()  # a fresh item per row, so yielded items don't share state
                time_string = row.xpath('.//tr[1]//td[@class="game-time ng-scope"]//span/text()').extract_first()
                site_date_string = date_string + time_string

                player1 = row.xpath('.//tr[1]//td[@class="game-name name"]//span/text()').extract_first()
                odds1 = row.xpath('.//tr[1]//td[@class="oddTip game-moneyline"]//span/text()').extract_first()
                try:
                    odds1 = float(odds1.strip())
                except (AttributeError, ValueError):  # missing cell or non-numeric text: keep as scraped
                    pass

                player2 = row.xpath('.//tr[2]//td[@class="game-name name"]//span/text()').extract_first()
                odds2 = row.xpath('.//tr[2]//td[@class="oddTip game-moneyline"]//span/text()').extract_first()
                try:
                    odds2 = float(odds2.strip())
                except (AttributeError, ValueError):
                    pass

                item['game'] = game_name
                item['date'] = site_date_string
                item['player1'] = player1
                item['odds1'] = odds1
                item['player2'] = player2
                item['odds2'] = odds2
                yield item
--------------------------------------------------------------------------------