├── hkjc ├── hkjc │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── science.py │ ├── URLDuplicateFilter.py │ ├── items.py │ ├── settings.py │ └── pipelines.py ├── scrapy.cfg └── requirements.txt ├── README.md ├── howstuffworks ├── howstuffworks │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── science.py │ ├── items.py │ ├── URLDuplicateFilter.py │ ├── pipelines.py │ └── settings.py └── scrapy.cfg ├── wallpaperswide ├── wallpaperswide │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── wallpaper.py │ ├── items.py │ ├── pipelines.py │ └── settings.py └── scrapy.cfg ├── .gitignore ├── requirements.txt └── processors └── tagger.py /hkjc/hkjc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scrapers for Frictionle. 2 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | *.pyc 3 | *.pyo 4 | *.log 5 | .DS_Store 6 | *.json 7 | -------------------------------------------------------------------------------- /hkjc/hkjc/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /hkjc/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = hkjc.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = hkjc 12 | -------------------------------------------------------------------------------- /howstuffworks/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = howstuffworks.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = howstuffworks 12 | -------------------------------------------------------------------------------- /wallpaperswide/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = wallpaperswide.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = wallpaperswide 12 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WallpaperItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | image_url = scrapy.Field() 14 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WallpaperswidePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /hkjc/requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.0.5 2 | Twisted==16.1.1 3 | argparse==1.2.1 4 | attrs==15.2.0 5 | cffi==1.6.0 6 | cryptography==1.3.1 7 | cssselect==0.9.1 8 | enum34==1.1.3 9 | html2text==2016.4.2 10 | idna==2.1 11 | ipaddress==1.0.16 12 | lxml==3.6.0 13 | pyOpenSSL==16.0.0 14 | pyasn1==0.1.9 15 | pyasn1-modules==0.0.8 16 | pycparser==2.14 17 | queuelib==1.4.2 18 | service-identity==16.0.0 19 | six==1.10.0 20 | w3lib==1.14.2 21 | wsgiref==0.1.2 22 | zope.interface==4.1.3 23 | pytz 24 | -------------------------------------------------------------------------------- /hkjc/hkjc/URLDuplicateFilter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from scrapy.dupefilters import RFPDupeFilter 4 | 5 | class URLFilter(RFPDupeFilter): 6 | """A dupe 
filter that considers specific ids in the url""" 7 | 8 | def __getid(self, url): 9 | return url 10 | 11 | def request_seen(self, request): 12 | fp = self.__getid(request.url) 13 | if fp in self.fingerprints: 14 | return True 15 | self.fingerprints.add(fp) 16 | if self.file: 17 | self.file.write(fp + os.linesep) 18 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ArticleItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | url = scrapy.Field() 14 | title = scrapy.Field() 15 | desc = scrapy.Field() 16 | excerpt = scrapy.Field() 17 | images = scrapy.Field() 18 | related = scrapy.Field() 19 | 20 | urlb64 = scrapy.Field() 21 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/URLDuplicateFilter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from scrapy.dupefilters import RFPDupeFilter 4 | from scrapy.utils.request import request_fingerprint 5 | 6 | class URLFilter(RFPDupeFilter): 7 | """A dupe filter that considers specific ids in the url""" 8 | 9 | def __getid(self, url): 10 | return url 11 | 12 | def request_seen(self, request): 13 | fp = self.__getid(request.url) 14 | if fp in self.fingerprints: 15 | return True 16 | self.fingerprints.add(fp) 17 | if self.file: 18 | self.file.write(fp + os.linesep) 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | cffi==1.3.0 3 | characteristic==14.3.0 4 | cryptography==1.1 5 | cssselect==0.9.1 6 | decorator==4.0.4 7 | enum34==1.0.4 8 | html2text==2015.11.4 9 | idna==2.0 10 | ipaddress==1.0.15 11 | ipython==4.0.0 12 | ipython-genutils==0.1.0 13 | lxml==3.5.0 14 | path.py==8.1.2 15 | pexpect==4.0.1 16 | pickleshare==0.5 17 | ptyprocess==0.5 18 | pyasn1==0.1.9 19 | pyasn1-modules==0.0.8 20 | pycparser==2.14 21 | pymongo==3.1.1 22 | pyOpenSSL==0.15.1 23 | queuelib==1.4.2 24 | Scrapy==1.0.3 25 | service-identity==14.0.0 26 | simplegeneric==0.8.1 27 | six==1.10.0 28 | traitlets==4.0.0 29 | Twisted==15.4.0 30 | w3lib==1.13.0 31 | wheel==0.24.0 32 | zope.interface==4.1.3 33 | -------------------------------------------------------------------------------- /hkjc/hkjc/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ArticleItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | server_timestamp = scrapy.Field() 14 | client_timestamp = scrapy.Field() 15 | 16 | home_team = scrapy.Field() 17 | home_goal = scrapy.Field() 18 | away_team = scrapy.Field() 19 | away_goal = scrapy.Field() 20 | 21 | post_time = scrapy.Field() 22 | 23 | flag_1 = scrapy.Field() 24 | flag_2 = scrapy.Field() 25 | flag_3 = scrapy.Field() 26 | 27 | home_odds = scrapy.Field() 28 | draw_odds = scrapy.Field() 29 | away_odds = scrapy.Field() 30 | 31 | final = scrapy.Field() 32 
| 33 | html = scrapy.Field() 34 | 35 | 36 | class RawPages(scrapy.Item): 37 | pages = scrapy.Field() 38 | -------------------------------------------------------------------------------- /processors/tagger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from optparse import OptionParser 4 | 5 | import nltk 6 | 7 | def parse(file_path): 8 | """ 9 | Given a file 'file_path', returns list of tags 10 | """ 11 | with open(file_path, 'rb') as f: 12 | contents = json.loads(f.read()) 13 | print extract_tags(contents['desc']) 14 | 15 | 16 | # 17 | # https://en.wikipedia.org/w/api.php?action=query&clshow=!hidden&cllimit=500&prop=categories&titles=Sachin_Tendulkar 18 | # 19 | 20 | def extract_tags(content): 21 | """ 22 | Given string content, returns list of tags 23 | """ 24 | tag, tags = [], [] 25 | pos_tags = ['NN', 'NNP', 'NNS', 'NNPS'] 26 | for pos_element in nltk.pos_tag(content.split()): 27 | if pos_element[1] in pos_tags: 28 | tag.append(pos_element) 29 | else: 30 | if len(tag) > 0: 31 | tags.append(tag) 32 | tag = [] 33 | if len(tag) > 0: 34 | tags.append(tag) 35 | 36 | return tags 37 | 38 | if __name__ == '__main__': 39 | parser = OptionParser() 40 | parser.add_option("-d", "--dir", dest="directory", 41 | help="all files of this directory will be parsed", metavar="DIR") 42 | parser.add_option("-f", "--file", dest="file", 43 | help="file for parsing", metavar="FILE") 44 | 45 | (opts, args) = parser.parse_args() 46 | 47 | if opts.file: 48 | parse(opts.file) 49 | 50 | if opts.directory: 51 | files = os.listdir(opts.directory) 52 | for file_name in files: 53 | file_path = os.path.join(opts.directory, file_name) 54 | parse(file_path) 55 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import time, base64, json, os 8 | import pymongo 9 | from scrapy.exceptions import DropItem 10 | 11 | class DatabaseEntryPipeline(object): 12 | 13 | collection_name = 'map' 14 | 15 | def __init__(self, mongo_uri, mongo_db): 16 | self.mongo_uri = mongo_uri 17 | self.mongo_db = mongo_db 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | return cls( 22 | mongo_uri=crawler.settings.get('MONGO_URI'), 23 | mongo_db=crawler.settings.get('MONGO_DATABASE', 'frictionle') 24 | ) 25 | 26 | def open_spider(self, spider): 27 | self.client = pymongo.MongoClient(self.mongo_uri) 28 | self.db = self.client[self.mongo_db] 29 | 30 | def close_spider(self, spider): 31 | self.client.close() 32 | 33 | def process_item(self, item, spider): 34 | try: 35 | _id = base64.b64encode(item['url']) 36 | result = self.db[self.collection_name].insert({ 37 | '_id': _id, 38 | 'u': item['url'], 39 | 't': item['title'], 40 | 'c': time.time() 41 | }) 42 | item['urlb64'] = _id 43 | except pymongo.errors.DuplicateKeyError: 44 | raise DropItem("Duplicate item found: %s" % item['url']) 45 | else: 46 | return item 47 | 48 | 49 | class FileWriterPipeline(object): 50 | def __init__(self, dump_folder): 51 | self.dump_folder = dump_folder 52 | 53 | @classmethod 54 | def from_crawler(cls, crawler): 55 | return cls( 56 | dump_folder=crawler.settings.get('DUMP_FOLDER'), 57 | ) 58 | 59 | def process_item(self, 
item, spider): 60 | file_path = os.path.join(self.dump_folder, spider.site, spider.name, item['urlb64']) 61 | with open(file_path, 'wb') as data_file: 62 | content = json.dumps(dict(item)) 63 | data_file.write(content) 64 | return item 65 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/spiders/wallpaper.py: -------------------------------------------------------------------------------- 1 | import scrapy, html2text 2 | from wallpaperswide.items import WallpaperItem 3 | 4 | class WallpaperCrawler(scrapy.Spider): 5 | name = "wallpaper" 6 | site = "wallpaperswide.com" 7 | allowed_domains = ["wallpaperswide.com"] 8 | base_url = 'http://wallpaperswide.com' 9 | start_urls = [ 10 | "http://wallpaperswide.com/games-desktop-wallpapers.html" 11 | ] 12 | priorities = [ 13 | '1366x768', 14 | '1280x768', 15 | '1280x720', 16 | '1440x900', 17 | '1600x900', 18 | '1280x1024', 19 | '1280x960', 20 | '960x600', 21 | '800x600' 22 | ] 23 | 24 | def is_wallpaper(self, response): 25 | if response.css('#wallpaper-resolutions'): 26 | return True 27 | return False 28 | 29 | def getatindex(self, a, index=0): 30 | if not a: 31 | return a 32 | return a[index] 33 | 34 | def mergeall(self, a): 35 | return ''.join(a) 36 | 37 | 38 | def parse_wallpaper_page(self, response): 39 | """ 40 | Returns an item 41 | """ 42 | item = WallpaperItem() 43 | image_url = None 44 | for resolution in WallpaperCrawler.priorities: 45 | xp = '//div[@id="wallpaper-resolutions"]/a[text()="%s"]/@href' % resolution 46 | link = response.xpath(xp).extract() 47 | if link: 48 | image_url = WallpaperCrawler.base_url + self.getatindex(link) 49 | break 50 | 51 | if image_url: 52 | item['image_url'] = image_url 53 | yield item 54 | else: 55 | self.logger.warning('No images found for given resolution %s' % response.url) 56 | 57 | 58 | def parse(self, response): 59 | for wallpaper_page_url in response.xpath('//div[@id="content"]//ul[@class="wallpapers"]/li//a/@href').extract(): 60 | wallpaper_page_url = WallpaperCrawler.base_url + wallpaper_page_url 61 | yield scrapy.Request(wallpaper_page_url, callback=self.parse_wallpaper_page) 62 | 63 | next_urls = response.xpath('//*[@id="content"]/div[@class="pagination"]/a[contains(text(),"Next")]/@href').extract() 64 | for next_url in next_urls: 65 | next_url = WallpaperCrawler.base_url + next_url 66 | yield scrapy.Request(next_url, callback=self.parse) 67 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for wallpaperswide project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'wallpaperswide' 13 | 14 | SPIDER_MODULES = ['wallpaperswide.spiders'] 15 | NEWSPIDER_MODULE = 'wallpaperswide.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'wallpaperswide (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'wallpaperswide.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'wallpaperswide.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | #ITEM_PIPELINES = { 65 | # 'wallpaperswide.pipelines.SomePipeline': 300, 66 | #} 67 | 68 | # Enable and configure the AutoThrottle extension (disabled by default) 69 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 70 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 71 | #AUTOTHROTTLE_ENABLED=True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY=5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY=60 76 | # Enable showing throttling stats for every response received: 77 | #AUTOTHROTTLE_DEBUG=False 78 | 79 | # Enable and configure HTTP caching (disabled by default) 80 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 81 | #HTTPCACHE_ENABLED=True 82 | #HTTPCACHE_EXPIRATION_SECS=0 83 | #HTTPCACHE_DIR='httpcache' 84 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 85 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 86 | -------------------------------------------------------------------------------- /hkjc/hkjc/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for howstuffworks project 4 | # 5 | # For simplicity, this file contains 
only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'hkjc' 13 | 14 | SPIDER_MODULES = ['hkjc.spiders'] 15 | NEWSPIDER_MODULE = 'hkjc.spiders' 16 | 17 | RAW_FOLDER = '/home/vagrant/temp/raw' 18 | DECRYPTED_FOLDER = '/home/vagrant/temp/decrypted' 19 | META_FOLDER = '/home/vagrant/temp/meta' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'howstuffworks (+http://www.yourdomain.com)' 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS=32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY=3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 33 | #CONCURRENT_REQUESTS_PER_IP=16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED=False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED=False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'howstuffworks.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'howstuffworks.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'hkjc.pipelines.FileWriterPipeline': 200 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 74 | #AUTOTHROTTLE_ENABLED=True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY=5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY=60 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG=False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED=True 85 | #HTTPCACHE_EXPIRATION_SECS=0 86 | #HTTPCACHE_DIR='httpcache' 87 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 88 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 89 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for howstuffworks project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'howstuffworks' 13 | 14 | SPIDER_MODULES = ['howstuffworks.spiders'] 15 | NEWSPIDER_MODULE = 'howstuffworks.spiders' 16 | 17 | MONGO_URI = 'mongodb://localhost:27017' 18 | MONGO_DATABASE = 'frictionle' 19 | 20 | DUMP_FOLDER = '/Users/arpitbhayani/frictionle/site_data/' 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'howstuffworks (+http://www.yourdomain.com)' 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS=32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY=3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 34 | #CONCURRENT_REQUESTS_PER_IP=16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED=False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED=False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'howstuffworks.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'howstuffworks.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'howstuffworks.pipelines.DatabaseEntryPipeline': 100, 70 | 'howstuffworks.pipelines.FileWriterPipeline': 200, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 76 | #AUTOTHROTTLE_ENABLED=True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY=5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY=60 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG=False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED=True 87 | 
#HTTPCACHE_EXPIRATION_SECS=0 88 | #HTTPCACHE_DIR='httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 90 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /hkjc/hkjc/spiders/science.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import html2text 3 | from hkjc.items import ArticleItem 4 | from urlparse import urlparse 5 | from os.path import splitext, basename 6 | 7 | import pytz 8 | import logging 9 | from datetime import datetime 10 | 11 | class HKJCCrawler(scrapy.Spider): 12 | name = "hkjc" 13 | site = "hkjc" 14 | allowed_domains = ["bet.hkjc.com"] 15 | base_domain = "http://bet.hkjc.com" 16 | start_urls = [ 17 | "http://bet.hkjc.com/football/odds/odds_hha.aspx?ci=en-US", 18 | ] 19 | 20 | def get_raw(self, htmltext): 21 | converter = html2text.HTML2Text() 22 | converter.ignore_links = True 23 | return converter.handle(htmltext).strip() 24 | 25 | def get_image_filename(self, image_url): 26 | if type(image_url) != list or len(image_url) == 0: 27 | return None 28 | 29 | disassembled = urlparse(image_url[0]) 30 | filename, file_ext = splitext(basename(disassembled.path)) 31 | return filename 32 | 33 | def process_team_name(self, name): 34 | return '"%s"' % name.replace(' ', '_') 35 | 36 | def process_rows(self, rows, server_timestamp, html): 37 | items = [] 38 | for row in rows: 39 | 40 | item = ArticleItem() 41 | 42 | temp_rows = row.css('tr::attr(class)').extract() 43 | if len(temp_rows) == 0: 44 | continue 45 | 46 | class_attr = temp_rows[0].lower() 47 | if not ('rhead' in class_attr or 'rchead' in class_attr or 'tdpage' in class_attr): 48 | 49 | # Save raw html 50 | item['html'] = html 51 | 52 | # Server timestamp 53 | dt = server_timestamp.split(' ') 54 | d = dt[0].replace('/', '-') 55 | t = dt[1] 56 | 57 | dtobj = datetime.strptime('%s %s' % (d, t), '%d-%m-%y %H:%M') 58 | dtobj = pytz.timezone("Hongkong").localize(dtobj) 59 | item['server_timestamp'] = dtobj.strftime('%Y-%m-%d %H:%M:%S %Z') 60 | 61 | # Client timestamp 62 | dtobj = pytz.timezone("Hongkong").localize(datetime.now()) 63 | item['client_timestamp'] = dtobj.strftime('%Y-%m-%d %H:%M:%S %Z') 64 | 65 | # Fetch Flags 66 | flags = row.css('td.cflag') 67 | if len(flags) >= 1: 68 | item['flag_1'] = self.get_image_filename(flags[0].css('img::attr(src)').extract()) or 0 69 | if len(flags) >= 2: 70 | item['flag_2'] = self.get_image_filename(flags[1].css('img::attr(src)').extract()) or 0 71 | 72 | # Fetch Venue 73 | flags = row.css('td.cvenue') 74 | if len(flags) >= 1: 75 | item['flag_3'] = self.get_image_filename(flags[0].css('img::attr(src)').extract()) or 0 76 | 77 | # Fetch teams 78 | teams = row.css('td.cteams') 79 | if len(teams) >= 1: 80 | all_teams = self.get_raw(teams[0].css('a')[0].extract()) 81 | 82 | tokens = all_teams.split(' vs ') 83 | 84 | home_team_name = tokens[0].split('[')[0].strip() if '[' in tokens[0] else tokens[0] 85 | away_team_name = tokens[1].split('[')[0].strip() if '[' in tokens[1] else tokens[1] 86 | 87 | item['home_team'] = self.process_team_name(home_team_name) 88 | item['away_team'] = self.process_team_name(away_team_name) 89 | 90 | if len(teams[0].css('a *')) >= 2: 91 | item['home_goal'] = self.get_raw(teams[0].css('a *')[1].extract()) 92 | if len(teams[0].css('a *')) >= 5: 93 | item['away_goal'] = self.get_raw(teams[0].css('a *')[4].extract()) 94 | 95 | # Fetch Odds 96 | odds = row.css('td.codds') 97 | if len(odds) >= 1: 98 | 
item['home_odds'] = self.get_raw(odds[0].extract()) 99 | if len(odds) >= 2: 100 | item['draw_odds'] = self.get_raw(odds[1].extract()) 101 | if len(odds) >= 3: 102 | item['away_odds'] = self.get_raw(odds[2].extract()) 103 | 104 | # Fetch Post Time 105 | eest = row.css('td.cesst') 106 | if len(eest) >= 1: 107 | tstr = self.get_raw(eest[0].extract()) 108 | dtobj = datetime.strptime(tstr, '%d/%m %H:%M') 109 | if dtobj.year == 1900: 110 | dtobj = dtobj.replace(year=datetime.now().year) 111 | dtobj = pytz.timezone("Hongkong").localize(dtobj) 112 | item['post_time'] = dtobj.strftime('%Y-%m-%d %H:%M:%S %Z') 113 | 114 | items.append(item) 115 | 116 | return items 117 | 118 | def parse_article_page(self, response): 119 | """ 120 | Returns an item 121 | """ 122 | articles = [] 123 | odd_tables = response.css('table.tHHA.tOdds') 124 | server_timestamp = self.get_raw(response.css('#server_datetime').extract()[0]) 125 | 126 | for odd_table in odd_tables: 127 | rows = odd_table.css('tr') 128 | articles.extend(self.process_rows(rows, server_timestamp, response.body)) 129 | return articles 130 | 131 | def parse(self, response): 132 | for article in self.parse_article_page(response): 133 | yield article 134 | 135 | page_buttons = response.css('#tblOdds a') 136 | 137 | for page_button in page_buttons: 138 | link_text = self.get_raw(page_button.extract()) 139 | if link_text.lower() == 'next': 140 | next_link = page_button.xpath('@href').extract() 141 | if len(next_link) > 0: 142 | yield scrapy.Request(self.base_domain + next_link[0], callback=self.parse) 143 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/spiders/science.py: -------------------------------------------------------------------------------- 1 | import scrapy, html2text 2 | from howstuffworks.items import ArticleItem 3 | 4 | class ScienceCrawler(scrapy.Spider): 5 | name = "science" 6 | site = "howstuffworks" 7 | allowed_domains = ["science.howstuffworks.com"] 8 | start_urls = [ 9 | "http://science.howstuffworks.com/engineering/civil", 10 | "http://science.howstuffworks.com/materials-science-channel.htm", 11 | "http://science.howstuffworks.com/engineering/structural", 12 | "http://science.howstuffworks.com/devices-channel.htm", 13 | "http://science.howstuffworks.com/robots-channel.htm", 14 | "http://science.howstuffworks.com/environmental/conservation", 15 | "http://science.howstuffworks.com/environmental/energy", 16 | "http://science.howstuffworks.com/environmental/green-science", 17 | "http://science.howstuffworks.com/environmental/earth", 18 | "http://science.howstuffworks.com/environmental/terms", 19 | "http://science.howstuffworks.com/environmental/green-tech", 20 | "http://science.howstuffworks.com/nature/climate-weather", 21 | "http://science.howstuffworks.com/nature/natural-disasters", 22 | "http://science.howstuffworks.com/innovation/big-thinkers", 23 | "http://science.howstuffworks.com/innovation/everyday-innovations", 24 | "http://science.howstuffworks.com/innovation/inventions", 25 | "http://science.howstuffworks.com/innovation/new-inventions", 26 | "http://science.howstuffworks.com/innovation/science-questions", 27 | "http://science.howstuffworks.com/innovation/edible-innovations", 28 | "http://science.howstuffworks.com/innovation/famous-inventors", 29 | "http://science.howstuffworks.com/innovation/nasa-inventions", 30 | "http://science.howstuffworks.com/innovation/repurposed-inventions", 31 | "http://science.howstuffworks.com/innovation/scientific-experiments", 32 | 
"http://science.howstuffworks.com/life/biology-fields", 33 | "http://science.howstuffworks.com/life/cellular-microscopic", 34 | "http://science.howstuffworks.com/life/fungi", 35 | "http://science.howstuffworks.com/life/inside-the-mind", 36 | "http://science.howstuffworks.com/life/botany", 37 | "http://science.howstuffworks.com/life/evolution", 38 | "http://science.howstuffworks.com/life/genetic", 39 | "http://science.howstuffworks.com/military-aircraft-channel.htm", 40 | "http://science.howstuffworks.com/biological-warfare-channel.htm", 41 | "http://science.howstuffworks.com/explosives-channel.htm", 42 | "http://science.howstuffworks.com/future-military-technology.htm", 43 | "http://science.howstuffworks.com/personal-finance-in-military.htm", 44 | "http://science.howstuffworks.com/surveillance-stealth-channel.htm", 45 | "http://science.howstuffworks.com/military/army-careers", 46 | "http://science.howstuffworks.com/military-branches-channel.htm", 47 | "http://science.howstuffworks.com/firearms-channel.htm", 48 | "http://science.howstuffworks.com/naval-technology-channel.htm", 49 | "http://science.howstuffworks.com/soldiers-channel.htm", 50 | "http://science.howstuffworks.com/tanks-fighting-vehicles-channel.htm", 51 | "http://science.howstuffworks.com/acoustics-channel.htm", 52 | "http://science.howstuffworks.com/electricity-channel.htm", 53 | "http://science.howstuffworks.com/history-of-physics-channel.htm", 54 | "http://science.howstuffworks.com/math-concepts", 55 | "http://science.howstuffworks.com/mechanics-channel.htm", 56 | "http://science.howstuffworks.com/optics-channel.htm", 57 | "http://science.howstuffworks.com/chemistry-channel.htm", 58 | "http://science.howstuffworks.com/forensic-science-channel.htm", 59 | "http://science.howstuffworks.com/magnetism-channel.htm", 60 | "http://science.howstuffworks.com/matter-channel.htm", 61 | "http://science.howstuffworks.com/nuclear-science-channel.htm", 62 | "http://science.howstuffworks.com/dictionary/astronomy-terms", 63 | "http://science.howstuffworks.com/dictionary/biology-terms", 64 | "http://science.howstuffworks.com/dictionary/famous-scientists", 65 | "http://science.howstuffworks.com/dictionary/meteorological-terms", 66 | "http://science.howstuffworks.com/dictionary/petrology-terms", 67 | "http://science.howstuffworks.com/dictionary/awards-organizations", 68 | "http://science.howstuffworks.com/dictionary/chemistry-terms", 69 | "http://science.howstuffworks.com/dictionary/geology-terms", 70 | "http://science.howstuffworks.com/dictionary/physics-terms", 71 | "http://science.howstuffworks.com/dictionary/plant-terms", 72 | "http://science.howstuffworks.com/science-vs-myth/afterlife", 73 | "http://science.howstuffworks.com/science-vs-myth/extrasensory-perceptions", 74 | "http://science.howstuffworks.com/science-vs-myth/unexplained-phenomena", 75 | "http://science.howstuffworks.com/science-vs-myth/everyday-myths", 76 | "http://science.howstuffworks.com/science-vs-myth/strange-creatures", 77 | "http://science.howstuffworks.com/science-vs-myth/what-if", 78 | "http://science.howstuffworks.com/space/aliens-ufos", 79 | "http://science.howstuffworks.com/future-space-channel.htm", 80 | "http://science.howstuffworks.com/space-exploration-channel.htm", 81 | "http://science.howstuffworks.com/astronomy-channel.htm", 82 | "http://science.howstuffworks.com/spaceflight-channel.htm", 83 | "http://science.howstuffworks.com/space-transportation-systems-channel.htm", 84 | "http://science.howstuffworks.com/transport/engines-equipment", 85 | 
"http://science.howstuffworks.com/transport/flight" 86 | ] 87 | 88 | def is_article(self, response): 89 | if response.css('#Title'): 90 | return False 91 | return False 92 | 93 | 94 | def getatindex(self, a, index=0): 95 | if not a: 96 | return a 97 | return a[index] 98 | 99 | def mergeall(self, a): 100 | return ''.join(a) 101 | 102 | 103 | def parse_article_page(self, response): 104 | """ 105 | Returns an item 106 | """ 107 | item = ArticleItem() 108 | item['url'] = response.url 109 | item['title'] = self.getatindex(response.xpath('//*[@id="Title"]//h1/text()').extract()) 110 | item['desc'] = self.mergeall(response.xpath('//*[@id="ArticleWell"]//div[@class="content"]/p/text()').extract()) 111 | item['excerpt'] = item['desc'][:800] 112 | item['related'] = response.xpath('//*[@id="RelatedLinks0"]//a/@href').extract() 113 | item['images'] = { 114 | 'inset': response.xpath('//*[@id="ArticleWell"]//img/@src').extract(), 115 | 'fb': self.getatindex(response.xpath('//*[@property="og:image"]/@content').extract()) 116 | } 117 | 118 | if len(item['desc']) < 1000: 119 | self.logger.warning('Description is less than 1000 chars. Might be fishy %s' % item['url']) 120 | yield item 121 | 122 | 123 | def parse(self, response): 124 | for article_url in response.css('#ContentLibrary a[class="img"]').xpath('@href').extract(): 125 | yield scrapy.Request(article_url, callback=self.parse_article_page) 126 | 127 | next_buttons = response.xpath('//*[@id="ContentLibrary"]//img[@src="http://s.hswstatic.com/en-us/skins/hsw/arrow-right-3x5-2.png"]') 128 | for next_button in next_buttons: 129 | url = self.getatindex(next_button.xpath('../@href').extract()) 130 | yield scrapy.Request(url, callback=self.parse) 131 | -------------------------------------------------------------------------------- /hkjc/hkjc/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | import pickle 9 | import logging 10 | from datetime import datetime 11 | 12 | 13 | class FileWriterPipeline(object): 14 | def __init__(self, raw_folder, decrypted_folder, meta_folder): 15 | self.raw_folder = raw_folder 16 | self.decrypted_folder = decrypted_folder 17 | self.meta_folder = meta_folder 18 | self.match_names = {} 19 | self.global_count = 0 20 | self.global_count_str = 0 21 | 22 | self.global_count_file = os.path.join(self.meta_folder, 'count.txt') 23 | parent_dir = os.path.dirname(self.global_count_file) 24 | if not os.path.isdir(parent_dir): 25 | os.makedirs(parent_dir) 26 | 27 | if not os.path.isfile(self.global_count_file): 28 | with open(self.global_count_file, 'w') as f: 29 | f.write('0004000000') 30 | 31 | self.global_count_str = '0' 32 | with open(self.global_count_file, 'r') as f: 33 | self.global_count = int(f.read().strip()) 34 | self.global_count_str = str(self.global_count).zfill(10) 35 | self.global_count += 1 36 | 37 | def close_spider(self, spider): 38 | last_matches_file = os.path.join(self.meta_folder, 'matches.pkl') 39 | 40 | if not os.path.isfile(last_matches_file): 41 | with open(last_matches_file, 'w') as f: 42 | pickle.dump(self.match_names, f) 43 | return 44 | 45 | with open(last_matches_file, 'r') as f: 46 | temp = pickle.load(f) 47 | for name in temp: 48 | if name not in self.match_names: 49 | # Mark match with name as final 50 | file_path = temp[name] 51 | 
file_data = [] 52 | with open(file_path, 'r') as t: 53 | file_data = t.readlines() 54 | data = file_data[-1].strip() 55 | tokens = data.split(',') 56 | tokens[-1] = '1\n' 57 | 58 | file_data = file_data[:-1] + [','.join(tokens)] 59 | 60 | with open(file_path, 'w') as t: 61 | for line in file_data: 62 | t.write(line) 63 | 64 | with open(last_matches_file, 'w') as f: 65 | pickle.dump(self.match_names, f) 66 | 67 | @classmethod 68 | def from_crawler(cls, crawler): 69 | return cls( 70 | raw_folder=crawler.settings.get('RAW_FOLDER'), 71 | decrypted_folder=crawler.settings.get('DECRYPTED_FOLDER'), 72 | meta_folder=crawler.settings.get('META_FOLDER') 73 | ) 74 | 75 | def format_item(self, item, is_final): 76 | 77 | if 'server_timestamp' not in item: 78 | item['server_timestamp'] = '' 79 | 80 | if 'client_timestamp' not in item: 81 | item['client_timestamp'] = '' 82 | 83 | if 'home_team' not in item: 84 | item['home_team'] = '' 85 | 86 | if 'home_goal' not in item: 87 | item['home_goal'] = '' 88 | 89 | if 'away_team' not in item: 90 | item['away_team'] = '' 91 | 92 | if 'away_goal' not in item: 93 | item['away_goal'] = '' 94 | 95 | if 'post_time' not in item: 96 | item['post_time'] = '' 97 | 98 | if 'flag_1' not in item: 99 | item['flag_1'] = 0 100 | 101 | if 'flag_2' not in item: 102 | item['flag_2'] = 0 103 | 104 | if 'flag_3' not in item: 105 | item['flag_3'] = 0 106 | 107 | if 'home_odds' not in item: 108 | item['home_odds'] = '' 109 | 110 | if 'draw_odds' not in item: 111 | item['draw_odds'] = '' 112 | 113 | if 'away_odds' not in item: 114 | item['away_odds'] = '' 115 | 116 | item['final'] = '1' if is_final else '0' 117 | 118 | return '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s' % ( 119 | item['server_timestamp'], 120 | item['client_timestamp'], 121 | item['home_team'], 122 | item['home_goal'], 123 | item['away_team'], 124 | item['away_goal'], 125 | item['post_time'], 126 | item['flag_1'], 127 | item['flag_2'], 128 | item['flag_3'], 129 | item['home_odds'], 130 | item['draw_odds'], 131 | item['away_odds'], 132 | item['final'] 133 | ) 134 | 135 | def is_valid_item(self, item): 136 | """Checks if given item is valid (has legitimate entries) 137 | Ex: Not a header row, not with irregular data 138 | """ 139 | if 'home_odds' not in item and 'draw_odds' not in item and\ 140 | 'away_odds' not in item and 'post_time' not in item: 141 | return False 142 | return True 143 | 144 | def are_odds_same(self, str1, str2): 145 | odds1 = ','.join(str1.split(',')[-4:-1]) 146 | odds2 = ','.join(str2.split(',')[-4:-1]) 147 | return odds1 == odds2 148 | 149 | def process_item(self, item, spider): 150 | if not self.is_valid_item(item): 151 | logging.warning("Skipping item: " + str(item)) 152 | return item 153 | 154 | # Get file path for decrypted data 155 | # Format Handicap_HAD/decrypted/2015/04/14/Shakhtar_Dontsk-Braga/Handicap_HAD 156 | if item.get('post_time') is None: 157 | return item 158 | 159 | dtobj = datetime.strptime(item.get('post_time'), '%Y-%m-%d %H:%M:%S HKT') 160 | 161 | # match_name = Shakhtar_Dontsk-Braga 162 | if item.get('home_team') is None or item.get('away_team') is None: 163 | return item 164 | match_name = "%s-%s" % (item.get('home_team').strip('"'), 165 | item.get('away_team').strip('"')) 166 | 167 | 168 | decrypted_filepath = 'Handicap_HAD/decrypted/%s/%s/%s/%s/Handicap_HAD' % (dtobj.year, dtobj.month, dtobj.day, match_name) 169 | 170 | # Check if file needs to be changed 171 | file_path = os.path.join(self.decrypted_folder, decrypted_filepath) 172 | 173 | save_item = False 174 | 175 | if 
os.path.isfile(file_path): 176 | # Check if any odds changed 177 | with open(file_path, 'r') as data_file: 178 | last_data = data_file.readlines()[-1].strip() 179 | item_str = self.format_item(item, False) 180 | 181 | if not self.are_odds_same(last_data, item_str): 182 | save_item = True 183 | else: 184 | save_item = True 185 | 186 | if save_item: 187 | self.match_names[match_name] = file_path 188 | 189 | parent_dir = os.path.dirname(file_path) 190 | if not os.path.isdir(parent_dir): 191 | os.makedirs(parent_dir) 192 | 193 | # Saving item in file 194 | with open(file_path, 'a') as data_file: 195 | item_str = self.format_item(item, False) 196 | data_file.write(item_str) 197 | data_file.write('\n') 198 | 199 | # Saving raw file 200 | dtobj = datetime.now() 201 | raw_file = 'soccer/raw/%s/%s/%s/%s_%s-%s-%s_Handicap_Had' % (dtobj.year, dtobj.month, dtobj.day, self.global_count_str, dtobj.hour, dtobj.minute, dtobj.second) 202 | 203 | raw_file_path = os.path.join(self.raw_folder, raw_file) 204 | 205 | parent_dir = os.path.dirname(raw_file_path) 206 | if not os.path.isdir(parent_dir): 207 | os.makedirs(parent_dir) 208 | 209 | with open(raw_file_path, 'w') as data_file: 210 | data_file.write(item.get('html')) 211 | 212 | with open(self.global_count_file, 'w') as f: 213 | f.write(self.global_count_str) 214 | 215 | return item 216 | --------------------------------------------------------------------------------
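
Usage note: the three crawlers are standard Scrapy projects, so each one is started from its own project directory with the scrapy CLI (scrapy crawl hkjc, scrapy crawl science, scrapy crawl wallpaper), and processors/tagger.py is then pointed at the JSON dumps written by the howstuffworks FileWriterPipeline through its -f/-d options. Below is a minimal sketch of calling extract_tags directly; the sample sentence, the sys.path tweak and the nltk resource name are illustrative assumptions, not part of the repository.

    import sys
    sys.path.append('processors')  # tagger.py is a plain script; make it importable from the repo root

    import nltk
    from tagger import extract_tags

    # nltk.pos_tag needs a POS-tagger model; the resource name below matches recent
    # nltk releases and may differ for older pinned versions (assumption).
    nltk.download('averaged_perceptron_tagger')

    # extract_tags keeps runs of consecutive nouns (NN/NNP/NNS/NNPS) together, so the
    # result is a list of noun groups, each a list of (word, POS-tag) pairs.
    groups = extract_tags("Sachin Tendulkar scored a century at Eden Gardens")
    print(groups)
    # roughly: [[('Sachin', 'NNP'), ('Tendulkar', 'NNP')], [('century', 'NN')], [('Eden', 'NNP'), ('Gardens', 'NNP')]]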