├── hkjc ├── hkjc │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── science.py │ ├── URLDuplicateFilter.py │ ├── items.py │ ├── settings.py │ └── pipelines.py ├── scrapy.cfg └── requirements.txt ├── README.md ├── howstuffworks ├── howstuffworks │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── science.py │ ├── items.py │ ├── URLDuplicateFilter.py │ ├── pipelines.py │ └── settings.py └── scrapy.cfg ├── wallpaperswide ├── wallpaperswide │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── wallpaper.py │ ├── items.py │ ├── pipelines.py │ └── settings.py └── scrapy.cfg ├── .gitignore ├── requirements.txt └── processors └── tagger.py /hkjc/hkjc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scrapers for Frictionle. 2 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | *.pyc 3 | *.pyo 4 | *.log 5 | .DS_Store 6 | *.json 7 | -------------------------------------------------------------------------------- /hkjc/hkjc/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /hkjc/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = hkjc.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = hkjc 12 | -------------------------------------------------------------------------------- /howstuffworks/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = howstuffworks.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = howstuffworks 12 | -------------------------------------------------------------------------------- /wallpaperswide/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = wallpaperswide.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = wallpaperswide 12 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WallpaperItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | image_url = scrapy.Field() 14 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class WallpaperswidePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /hkjc/requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.0.5 2 | Twisted==16.1.1 3 | argparse==1.2.1 4 | attrs==15.2.0 5 | cffi==1.6.0 6 | cryptography==1.3.1 7 | cssselect==0.9.1 8 | enum34==1.1.3 9 | html2text==2016.4.2 10 | idna==2.1 11 | ipaddress==1.0.16 12 | lxml==3.6.0 13 | pyOpenSSL==16.0.0 14 | pyasn1==0.1.9 15 | pyasn1-modules==0.0.8 16 | pycparser==2.14 17 | queuelib==1.4.2 18 | service-identity==16.0.0 19 | six==1.10.0 20 | w3lib==1.14.2 21 | wsgiref==0.1.2 22 | zope.interface==4.1.3 23 | pytz 24 | -------------------------------------------------------------------------------- /hkjc/hkjc/URLDuplicateFilter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from scrapy.dupefilters import RFPDupeFilter 4 | 5 | class URLFilter(RFPDupeFilter): 6 | """A dupe 
filter that considers specific ids in the url""" 7 | 8 | def __getid(self, url): 9 | return url 10 | 11 | def request_seen(self, request): 12 | fp = self.__getid(request.url) 13 | if fp in self.fingerprints: 14 | return True 15 | self.fingerprints.add(fp) 16 | if self.file: 17 | self.file.write(fp + os.linesep) 18 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ArticleItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | url = scrapy.Field() 14 | title = scrapy.Field() 15 | desc = scrapy.Field() 16 | excerpt = scrapy.Field() 17 | images = scrapy.Field() 18 | related = scrapy.Field() 19 | 20 | urlb64 = scrapy.Field() 21 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/URLDuplicateFilter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from scrapy.dupefilters import RFPDupeFilter 4 | from scrapy.utils.request import request_fingerprint 5 | 6 | class URLFilter(RFPDupeFilter): 7 | """A dupe filter that considers specific ids in the url""" 8 | 9 | def __getid(self, url): 10 | return url 11 | 12 | def request_seen(self, request): 13 | fp = self.__getid(request.url) 14 | if fp in self.fingerprints: 15 | return True 16 | self.fingerprints.add(fp) 17 | if self.file: 18 | self.file.write(fp + os.linesep) 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | cffi==1.3.0 3 | characteristic==14.3.0 4 | cryptography==1.1 5 | cssselect==0.9.1 6 | decorator==4.0.4 7 | enum34==1.0.4 8 | html2text==2015.11.4 9 | idna==2.0 10 | ipaddress==1.0.15 11 | ipython==4.0.0 12 | ipython-genutils==0.1.0 13 | lxml==3.5.0 14 | path.py==8.1.2 15 | pexpect==4.0.1 16 | pickleshare==0.5 17 | ptyprocess==0.5 18 | pyasn1==0.1.9 19 | pyasn1-modules==0.0.8 20 | pycparser==2.14 21 | pymongo==3.1.1 22 | pyOpenSSL==0.15.1 23 | queuelib==1.4.2 24 | Scrapy==1.0.3 25 | service-identity==14.0.0 26 | simplegeneric==0.8.1 27 | six==1.10.0 28 | traitlets==4.0.0 29 | Twisted==15.4.0 30 | w3lib==1.13.0 31 | wheel==0.24.0 32 | zope.interface==4.1.3 33 | -------------------------------------------------------------------------------- /hkjc/hkjc/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ArticleItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | server_timestamp = scrapy.Field() 14 | client_timestamp = scrapy.Field() 15 | 16 | home_team = scrapy.Field() 17 | home_goal = scrapy.Field() 18 | away_team = scrapy.Field() 19 | away_goal = scrapy.Field() 20 | 21 | post_time = scrapy.Field() 22 | 23 | flag_1 = scrapy.Field() 24 | flag_2 = scrapy.Field() 25 | flag_3 = scrapy.Field() 26 | 27 | home_odds = scrapy.Field() 28 | draw_odds = scrapy.Field() 29 | away_odds = scrapy.Field() 30 | 31 | final = scrapy.Field() 32 
| 33 | html = scrapy.Field() 34 | 35 | 36 | class RawPages(scrapy.Item): 37 | pages = scrapy.Field() 38 | -------------------------------------------------------------------------------- /processors/tagger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from optparse import OptionParser 4 | 5 | import nltk 6 | 7 | def parse(file_path): 8 | """ 9 | Given a file 'file_path', returns list of tags 10 | """ 11 | with open(file_path, 'rb') as f: 12 | contents = json.loads(f.read()) 13 | print extract_tags(contents['desc']) 14 | 15 | 16 | # 17 | # https://en.wikipedia.org/w/api.php?action=query&clshow=!hidden&cllimit=500&prop=categories&titles=Sachin_Tendulkar 18 | # 19 | 20 | def extract_tags(content): 21 | """ 22 | Given string content, returns list of tags 23 | """ 24 | tag, tags = [], [] 25 | pos_tags = ['NN', 'NNP', 'NNS', 'NNPS'] 26 | for pos_element in nltk.pos_tag(content.split()): 27 | if pos_element[1] in pos_tags: 28 | tag.append(pos_element) 29 | else: 30 | if len(tag) > 0: 31 | tags.append(tag) 32 | tag = [] 33 | if len(tag) > 0: 34 | tags.append(tag) 35 | 36 | return tags 37 | 38 | if __name__ == '__main__': 39 | parser = OptionParser() 40 | parser.add_option("-d", "--dir", dest="directory", 41 | help="all files of this directory will be parsed", metavar="DIR") 42 | parser.add_option("-f", "--file", dest="file", 43 | help="file for parsing", metavar="FILE") 44 | 45 | (opts, args) = parser.parse_args() 46 | 47 | if opts.file: 48 | parse(opts.file) 49 | 50 | if opts.directory: 51 | files = os.listdir(opts.directory) 52 | for file_name in files: 53 | file_path = os.path.join(opts.directory, file_name) 54 | parse(file_path) 55 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import time, base64, json, os 8 | import pymongo 9 | from scrapy.exceptions import DropItem 10 | 11 | class DatabaseEntryPipeline(object): 12 | 13 | collection_name = 'map' 14 | 15 | def __init__(self, mongo_uri, mongo_db): 16 | self.mongo_uri = mongo_uri 17 | self.mongo_db = mongo_db 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | return cls( 22 | mongo_uri=crawler.settings.get('MONGO_URI'), 23 | mongo_db=crawler.settings.get('MONGO_DATABASE', 'frictionle') 24 | ) 25 | 26 | def open_spider(self, spider): 27 | self.client = pymongo.MongoClient(self.mongo_uri) 28 | self.db = self.client[self.mongo_db] 29 | 30 | def close_spider(self, spider): 31 | self.client.close() 32 | 33 | def process_item(self, item, spider): 34 | try: 35 | _id = base64.b64encode(item['url']) 36 | result = self.db[self.collection_name].insert({ 37 | '_id': _id, 38 | 'u': item['url'], 39 | 't': item['title'], 40 | 'c': time.time() 41 | }) 42 | item['urlb64'] = _id 43 | except pymongo.errors.DuplicateKeyError: 44 | raise DropItem("Duplicate item found: %s" % item['url']) 45 | else: 46 | return item 47 | 48 | 49 | class FileWriterPipeline(object): 50 | def __init__(self, dump_folder): 51 | self.dump_folder = dump_folder 52 | 53 | @classmethod 54 | def from_crawler(cls, crawler): 55 | return cls( 56 | dump_folder=crawler.settings.get('DUMP_FOLDER'), 57 | ) 58 | 59 | def process_item(self, 
item, spider): 60 | file_path = os.path.join(self.dump_folder, spider.site, spider.name, item['urlb64']) 61 | with open(file_path, 'wb') as data_file: 62 | content = json.dumps(dict(item)) 63 | data_file.write(content) 64 | return item 65 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/spiders/wallpaper.py: -------------------------------------------------------------------------------- 1 | import scrapy, html2text 2 | from wallpaperswide.items import WallpaperItem 3 | 4 | class WallpaperCrawler(scrapy.Spider): 5 | name = "wallpaper" 6 | site = "wallpaperswide.com" 7 | allowed_domains = ["wallpaperswide.com"] 8 | base_url = 'http://wallpaperswide.com' 9 | start_urls = [ 10 | "http://wallpaperswide.com/games-desktop-wallpapers.html" 11 | ] 12 | priorities = [ 13 | '1366x768', 14 | '1280x768', 15 | '1280x720', 16 | '1440x900', 17 | '1600x900', 18 | '1280x1024', 19 | '1280x960', 20 | '960x600', 21 | '800x600' 22 | ] 23 | 24 | def is_wallpaper(self, response): 25 | if response.css('#wallpaper-resolutions'): 26 | return True 27 | return False 28 | 29 | def getatindex(self, a, index=0): 30 | if not a: 31 | return a 32 | return a[index] 33 | 34 | def mergeall(self, a): 35 | return ''.join(a) 36 | 37 | 38 | def parse_wallpaper_page(self, response): 39 | """ 40 | Returns an item 41 | """ 42 | item = WallpaperItem() 43 | image_url = None 44 | for resolution in WallpaperCrawler.priorities: 45 | xp = '//div[@id="wallpaper-resolutions"]/a[text()="%s"]/@href' % resolution 46 | link = response.xpath(xp).extract() 47 | if link: 48 | image_url = WallpaperCrawler.base_url + self.getatindex(link) 49 | break 50 | 51 | if image_url: 52 | item['image_url'] = image_url 53 | yield item 54 | else: 55 | self.logger.warning('No images found for given resolution %s' % response.url) 56 | 57 | 58 | def parse(self, response): 59 | for wallpaper_page_url in response.xpath('//div[@id="content"]//ul[@class="wallpapers"]/li//a/@href').extract(): 60 | wallpaper_page_url = WallpaperCrawler.base_url + wallpaper_page_url 61 | yield scrapy.Request(wallpaper_page_url, callback=self.parse_wallpaper_page) 62 | 63 | next_urls = response.xpath('//*[@id="content"]/div[@class="pagination"]/a[contains(text(),"Next")]/@href').extract() 64 | for next_url in next_urls: 65 | next_url = WallpaperCrawler.base_url + next_url 66 | yield scrapy.Request(next_url, callback=self.parse) 67 | -------------------------------------------------------------------------------- /wallpaperswide/wallpaperswide/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for wallpaperswide project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'wallpaperswide' 13 | 14 | SPIDER_MODULES = ['wallpaperswide.spiders'] 15 | NEWSPIDER_MODULE = 'wallpaperswide.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'wallpaperswide (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'wallpaperswide.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'wallpaperswide.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | #ITEM_PIPELINES = { 65 | # 'wallpaperswide.pipelines.SomePipeline': 300, 66 | #} 67 | 68 | # Enable and configure the AutoThrottle extension (disabled by default) 69 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 70 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 71 | #AUTOTHROTTLE_ENABLED=True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY=5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY=60 76 | # Enable showing throttling stats for every response received: 77 | #AUTOTHROTTLE_DEBUG=False 78 | 79 | # Enable and configure HTTP caching (disabled by default) 80 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 81 | #HTTPCACHE_ENABLED=True 82 | #HTTPCACHE_EXPIRATION_SECS=0 83 | #HTTPCACHE_DIR='httpcache' 84 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 85 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 86 | -------------------------------------------------------------------------------- /hkjc/hkjc/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for howstuffworks project 4 | # 5 | # For simplicity, this file contains 
only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'hkjc' 13 | 14 | SPIDER_MODULES = ['hkjc.spiders'] 15 | NEWSPIDER_MODULE = 'hkjc.spiders' 16 | 17 | RAW_FOLDER = '/home/vagrant/temp/raw' 18 | DECRYPTED_FOLDER = '/home/vagrant/temp/decrypted' 19 | META_FOLDER = '/home/vagrant/temp/meta' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'howstuffworks (+http://www.yourdomain.com)' 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS=32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY=3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 33 | #CONCURRENT_REQUESTS_PER_IP=16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED=False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED=False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'howstuffworks.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'howstuffworks.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'hkjc.pipelines.FileWriterPipeline': 200 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 74 | #AUTOTHROTTLE_ENABLED=True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY=5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY=60 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG=False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED=True 85 | #HTTPCACHE_EXPIRATION_SECS=0 86 | #HTTPCACHE_DIR='httpcache' 87 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 88 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 89 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for howstuffworks project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'howstuffworks' 13 | 14 | SPIDER_MODULES = ['howstuffworks.spiders'] 15 | NEWSPIDER_MODULE = 'howstuffworks.spiders' 16 | 17 | MONGO_URI = 'mongodb://localhost:27017' 18 | MONGO_DATABASE = 'frictionle' 19 | 20 | DUMP_FOLDER = '/Users/arpitbhayani/frictionle/site_data/' 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | #USER_AGENT = 'howstuffworks (+http://www.yourdomain.com)' 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS=32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY=3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 34 | #CONCURRENT_REQUESTS_PER_IP=16 35 | 36 | # Disable cookies (enabled by default) 37 | #COOKIES_ENABLED=False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED=False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'howstuffworks.middlewares.MyCustomSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'howstuffworks.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'howstuffworks.pipelines.DatabaseEntryPipeline': 100, 70 | 'howstuffworks.pipelines.FileWriterPipeline': 200, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 76 | #AUTOTHROTTLE_ENABLED=True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY=5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY=60 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG=False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED=True 87 | 
#HTTPCACHE_EXPIRATION_SECS=0 88 | #HTTPCACHE_DIR='httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 90 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /hkjc/hkjc/spiders/science.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import html2text 3 | from hkjc.items import ArticleItem 4 | from urlparse import urlparse 5 | from os.path import splitext, basename 6 | 7 | import pytz 8 | import logging 9 | from datetime import datetime 10 | 11 | class HKJCCrawler(scrapy.Spider): 12 | name = "hkjc" 13 | site = "hkjc" 14 | allowed_domains = ["bet.hkjc.com"] 15 | base_domain = "http://bet.hkjc.com" 16 | start_urls = [ 17 | "http://bet.hkjc.com/football/odds/odds_hha.aspx?ci=en-US", 18 | ] 19 | 20 | def get_raw(self, htmltext): 21 | converter = html2text.HTML2Text() 22 | converter.ignore_links = True 23 | return converter.handle(htmltext).strip() 24 | 25 | def get_image_filename(self, image_url): 26 | if type(image_url) != list or len(image_url) == 0: 27 | return None 28 | 29 | disassembled = urlparse(image_url[0]) 30 | filename, file_ext = splitext(basename(disassembled.path)) 31 | return filename 32 | 33 | def process_team_name(self, name): 34 | return '"%s"' % name.replace(' ', '_') 35 | 36 | def process_rows(self, rows, server_timestamp, html): 37 | items = [] 38 | for row in rows: 39 | 40 | item = ArticleItem() 41 | 42 | temp_rows = row.css('tr::attr(class)').extract() 43 | if len(temp_rows) == 0: 44 | continue 45 | 46 | class_attr = temp_rows[0].lower() 47 | if not ('rhead' in class_attr or 'rchead' in class_attr or 'tdpage' in class_attr): 48 | 49 | # Save raw html 50 | item['html'] = html 51 | 52 | # Server timestamp 53 | dt = server_timestamp.split(' ') 54 | d = dt[0].replace('/', '-') 55 | t = dt[1] 56 | 57 | dtobj = datetime.strptime('%s %s' % (d, t), '%d-%m-%y %H:%M') 58 | dtobj = pytz.timezone("Hongkong").localize(dtobj) 59 | item['server_timestamp'] = dtobj.strftime('%Y-%m-%d %H:%M:%S %Z') 60 | 61 | # Client timestamp 62 | dtobj = pytz.timezone("Hongkong").localize(datetime.now()) 63 | item['client_timestamp'] = dtobj.strftime('%Y-%m-%d %H:%M:%S %Z') 64 | 65 | # Fetch Flags 66 | flags = row.css('td.cflag') 67 | if len(flags) >= 1: 68 | item['flag_1'] = self.get_image_filename(flags[0].css('img::attr(src)').extract()) or 0 69 | if len(flags) >= 2: 70 | item['flag_2'] = self.get_image_filename(flags[1].css('img::attr(src)').extract()) or 0 71 | 72 | # Fetch Venue 73 | flags = row.css('td.cvenue') 74 | if len(flags) >= 1: 75 | item['flag_3'] = self.get_image_filename(flags[0].css('img::attr(src)').extract()) or 0 76 | 77 | # Fetch teams 78 | teams = row.css('td.cteams') 79 | if len(teams) >= 1: 80 | all_teams = self.get_raw(teams[0].css('a')[0].extract()) 81 | 82 | tokens = all_teams.split(' vs ') 83 | 84 | home_team_name = tokens[0].split('[')[0].strip() if '[' in tokens[0] else tokens[0] 85 | away_team_name = tokens[1].split('[')[0].strip() if '[' in tokens[1] else tokens[1] 86 | 87 | item['home_team'] = self.process_team_name(home_team_name) 88 | item['away_team'] = self.process_team_name(away_team_name) 89 | 90 | if len(teams[0].css('a *')) >= 2: 91 | item['home_goal'] = self.get_raw(teams[0].css('a *')[1].extract()) 92 | if len(teams[0].css('a *')) >= 5: 93 | item['away_goal'] = self.get_raw(teams[0].css('a *')[4].extract()) 94 | 95 | # Fetch Odds 96 | odds = row.css('td.codds') 97 | if len(odds) >= 1: 98 | 
item['home_odds'] = self.get_raw(odds[0].extract()) 99 | if len(odds) >= 2: 100 | item['draw_odds'] = self.get_raw(odds[1].extract()) 101 | if len(odds) >= 3: 102 | item['away_odds'] = self.get_raw(odds[2].extract()) 103 | 104 | # Fetch Post Time 105 | eest = row.css('td.cesst') 106 | if len(eest) >= 1: 107 | tstr = self.get_raw(eest[0].extract()) 108 | dtobj = datetime.strptime(tstr, '%d/%m %H:%M') 109 | if dtobj.year == 1900: 110 | dtobj = dtobj.replace(year=datetime.now().year) 111 | dtobj = pytz.timezone("Hongkong").localize(dtobj) 112 | item['post_time'] = dtobj.strftime('%Y-%m-%d %H:%M:%S %Z') 113 | 114 | items.append(item) 115 | 116 | return items 117 | 118 | def parse_article_page(self, response): 119 | """ 120 | Returns an item 121 | """ 122 | articles = [] 123 | odd_tables = response.css('table.tHHA.tOdds') 124 | server_timestamp = self.get_raw(response.css('#server_datetime').extract()[0]) 125 | 126 | for odd_table in odd_tables: 127 | rows = odd_table.css('tr') 128 | articles.extend(self.process_rows(rows, server_timestamp, response.body)) 129 | return articles 130 | 131 | def parse(self, response): 132 | for article in self.parse_article_page(response): 133 | yield article 134 | 135 | page_buttons = response.css('#tblOdds a') 136 | 137 | for page_button in page_buttons: 138 | link_text = self.get_raw(page_button.extract()) 139 | if link_text.lower() == 'next': 140 | next_link = page_button.xpath('@href').extract() 141 | if len(next_link) > 0: 142 | yield scrapy.Request(self.base_domain + next_link[0], callback=self.parse) 143 | -------------------------------------------------------------------------------- /howstuffworks/howstuffworks/spiders/science.py: -------------------------------------------------------------------------------- 1 | import scrapy, html2text 2 | from howstuffworks.items import ArticleItem 3 | 4 | class ScienceCrawler(scrapy.Spider): 5 | name = "science" 6 | site = "howstuffworks" 7 | allowed_domains = ["science.howstuffworks.com"] 8 | start_urls = [ 9 | "http://science.howstuffworks.com/engineering/civil", 10 | "http://science.howstuffworks.com/materials-science-channel.htm", 11 | "http://science.howstuffworks.com/engineering/structural", 12 | "http://science.howstuffworks.com/devices-channel.htm", 13 | "http://science.howstuffworks.com/robots-channel.htm", 14 | "http://science.howstuffworks.com/environmental/conservation", 15 | "http://science.howstuffworks.com/environmental/energy", 16 | "http://science.howstuffworks.com/environmental/green-science", 17 | "http://science.howstuffworks.com/environmental/earth", 18 | "http://science.howstuffworks.com/environmental/terms", 19 | "http://science.howstuffworks.com/environmental/green-tech", 20 | "http://science.howstuffworks.com/nature/climate-weather", 21 | "http://science.howstuffworks.com/nature/natural-disasters", 22 | "http://science.howstuffworks.com/innovation/big-thinkers", 23 | "http://science.howstuffworks.com/innovation/everyday-innovations", 24 | "http://science.howstuffworks.com/innovation/inventions", 25 | "http://science.howstuffworks.com/innovation/new-inventions", 26 | "http://science.howstuffworks.com/innovation/science-questions", 27 | "http://science.howstuffworks.com/innovation/edible-innovations", 28 | "http://science.howstuffworks.com/innovation/famous-inventors", 29 | "http://science.howstuffworks.com/innovation/nasa-inventions", 30 | "http://science.howstuffworks.com/innovation/repurposed-inventions", 31 | "http://science.howstuffworks.com/innovation/scientific-experiments", 32 | 
"http://science.howstuffworks.com/life/biology-fields", 33 | "http://science.howstuffworks.com/life/cellular-microscopic", 34 | "http://science.howstuffworks.com/life/fungi", 35 | "http://science.howstuffworks.com/life/inside-the-mind", 36 | "http://science.howstuffworks.com/life/botany", 37 | "http://science.howstuffworks.com/life/evolution", 38 | "http://science.howstuffworks.com/life/genetic", 39 | "http://science.howstuffworks.com/military-aircraft-channel.htm", 40 | "http://science.howstuffworks.com/biological-warfare-channel.htm", 41 | "http://science.howstuffworks.com/explosives-channel.htm", 42 | "http://science.howstuffworks.com/future-military-technology.htm", 43 | "http://science.howstuffworks.com/personal-finance-in-military.htm", 44 | "http://science.howstuffworks.com/surveillance-stealth-channel.htm", 45 | "http://science.howstuffworks.com/military/army-careers", 46 | "http://science.howstuffworks.com/military-branches-channel.htm", 47 | "http://science.howstuffworks.com/firearms-channel.htm", 48 | "http://science.howstuffworks.com/naval-technology-channel.htm", 49 | "http://science.howstuffworks.com/soldiers-channel.htm", 50 | "http://science.howstuffworks.com/tanks-fighting-vehicles-channel.htm", 51 | "http://science.howstuffworks.com/acoustics-channel.htm", 52 | "http://science.howstuffworks.com/electricity-channel.htm", 53 | "http://science.howstuffworks.com/history-of-physics-channel.htm", 54 | "http://science.howstuffworks.com/math-concepts", 55 | "http://science.howstuffworks.com/mechanics-channel.htm", 56 | "http://science.howstuffworks.com/optics-channel.htm", 57 | "http://science.howstuffworks.com/chemistry-channel.htm", 58 | "http://science.howstuffworks.com/forensic-science-channel.htm", 59 | "http://science.howstuffworks.com/magnetism-channel.htm", 60 | "http://science.howstuffworks.com/matter-channel.htm", 61 | "http://science.howstuffworks.com/nuclear-science-channel.htm", 62 | "http://science.howstuffworks.com/dictionary/astronomy-terms", 63 | "http://science.howstuffworks.com/dictionary/biology-terms", 64 | "http://science.howstuffworks.com/dictionary/famous-scientists", 65 | "http://science.howstuffworks.com/dictionary/meteorological-terms", 66 | "http://science.howstuffworks.com/dictionary/petrology-terms", 67 | "http://science.howstuffworks.com/dictionary/awards-organizations", 68 | "http://science.howstuffworks.com/dictionary/chemistry-terms", 69 | "http://science.howstuffworks.com/dictionary/geology-terms", 70 | "http://science.howstuffworks.com/dictionary/physics-terms", 71 | "http://science.howstuffworks.com/dictionary/plant-terms", 72 | "http://science.howstuffworks.com/science-vs-myth/afterlife", 73 | "http://science.howstuffworks.com/science-vs-myth/extrasensory-perceptions", 74 | "http://science.howstuffworks.com/science-vs-myth/unexplained-phenomena", 75 | "http://science.howstuffworks.com/science-vs-myth/everyday-myths", 76 | "http://science.howstuffworks.com/science-vs-myth/strange-creatures", 77 | "http://science.howstuffworks.com/science-vs-myth/what-if", 78 | "http://science.howstuffworks.com/space/aliens-ufos", 79 | "http://science.howstuffworks.com/future-space-channel.htm", 80 | "http://science.howstuffworks.com/space-exploration-channel.htm", 81 | "http://science.howstuffworks.com/astronomy-channel.htm", 82 | "http://science.howstuffworks.com/spaceflight-channel.htm", 83 | "http://science.howstuffworks.com/space-transportation-systems-channel.htm", 84 | "http://science.howstuffworks.com/transport/engines-equipment", 85 | 
"http://science.howstuffworks.com/transport/flight" 86 | ] 87 | 88 | def is_article(self, response): 89 | if response.css('#Title'): 90 | return False 91 | return False 92 | 93 | 94 | def getatindex(self, a, index=0): 95 | if not a: 96 | return a 97 | return a[index] 98 | 99 | def mergeall(self, a): 100 | return ''.join(a) 101 | 102 | 103 | def parse_article_page(self, response): 104 | """ 105 | Returns an item 106 | """ 107 | item = ArticleItem() 108 | item['url'] = response.url 109 | item['title'] = self.getatindex(response.xpath('//*[@id="Title"]//h1/text()').extract()) 110 | item['desc'] = self.mergeall(response.xpath('//*[@id="ArticleWell"]//div[@class="content"]/p/text()').extract()) 111 | item['excerpt'] = item['desc'][:800] 112 | item['related'] = response.xpath('//*[@id="RelatedLinks0"]//a/@href').extract() 113 | item['images'] = { 114 | 'inset': response.xpath('//*[@id="ArticleWell"]//img/@src').extract(), 115 | 'fb': self.getatindex(response.xpath('//*[@property="og:image"]/@content').extract()) 116 | } 117 | 118 | if len(item['desc']) < 1000: 119 | self.logger.warning('Description is less than 1000 chars. Might be fishy %s' % item['url']) 120 | yield item 121 | 122 | 123 | def parse(self, response): 124 | for article_url in response.css('#ContentLibrary a[class="img"]').xpath('@href').extract(): 125 | yield scrapy.Request(article_url, callback=self.parse_article_page) 126 | 127 | next_buttons = response.xpath('//*[@id="ContentLibrary"]//img[@src="http://s.hswstatic.com/en-us/skins/hsw/arrow-right-3x5-2.png"]') 128 | for next_button in next_buttons: 129 | url = self.getatindex(next_button.xpath('../@href').extract()) 130 | yield scrapy.Request(url, callback=self.parse) 131 | -------------------------------------------------------------------------------- /hkjc/hkjc/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import os 8 | import pickle 9 | import logging 10 | from datetime import datetime 11 | 12 | 13 | class FileWriterPipeline(object): 14 | def __init__(self, raw_folder, decrypted_folder, meta_folder): 15 | self.raw_folder = raw_folder 16 | self.decrypted_folder = decrypted_folder 17 | self.meta_folder = meta_folder 18 | self.match_names = {} 19 | self.global_count = 0 20 | self.global_count_str = 0 21 | 22 | self.global_count_file = os.path.join(self.meta_folder, 'count.txt') 23 | parent_dir = os.path.dirname(self.global_count_file) 24 | if not os.path.isdir(parent_dir): 25 | os.makedirs(parent_dir) 26 | 27 | if not os.path.isfile(self.global_count_file): 28 | with open(self.global_count_file, 'w') as f: 29 | f.write('0004000000') 30 | 31 | self.global_count_str = '0' 32 | with open(self.global_count_file, 'r') as f: 33 | self.global_count = int(f.read().strip()) 34 | self.global_count_str = str(self.global_count).zfill(10) 35 | self.global_count += 1 36 | 37 | def close_spider(self, spider): 38 | last_matches_file = os.path.join(self.meta_folder, 'matches.pkl') 39 | 40 | if not os.path.isfile(last_matches_file): 41 | with open(last_matches_file, 'w') as f: 42 | pickle.dump(self.match_names, f) 43 | return 44 | 45 | with open(last_matches_file, 'r') as f: 46 | temp = pickle.load(f) 47 | for name in temp: 48 | if name not in self.match_names: 49 | # Mark match with name as final 50 | file_path = temp[name] 51 | 
file_data = [] 52 | with open(file_path, 'r') as t: 53 | file_data = t.readlines() 54 | data = file_data[-1].strip() 55 | tokens = data.split(',') 56 | tokens[-1] = '1\n' 57 | 58 | file_data = file_data[:-1] + [','.join(tokens)] 59 | 60 | with open(file_path, 'w') as t: 61 | for line in file_data: 62 | t.write(line) 63 | 64 | with open(last_matches_file, 'w') as f: 65 | pickle.dump(self.match_names, f) 66 | 67 | @classmethod 68 | def from_crawler(cls, crawler): 69 | return cls( 70 | raw_folder=crawler.settings.get('RAW_FOLDER'), 71 | decrypted_folder=crawler.settings.get('DECRYPTED_FOLDER'), 72 | meta_folder=crawler.settings.get('META_FOLDER') 73 | ) 74 | 75 | def format_item(self, item, is_final): 76 | 77 | if 'server_timestamp' not in item: 78 | item['server_timestamp'] = '' 79 | 80 | if 'client_timestamp' not in item: 81 | item['client_timestamp'] = '' 82 | 83 | if 'home_team' not in item: 84 | item['home_team'] = '' 85 | 86 | if 'home_goal' not in item: 87 | item['home_goal'] = '' 88 | 89 | if 'away_team' not in item: 90 | item['away_team'] = '' 91 | 92 | if 'away_goal' not in item: 93 | item['away_goal'] = '' 94 | 95 | if 'post_time' not in item: 96 | item['post_time'] = '' 97 | 98 | if 'flag_1' not in item: 99 | item['flag_1'] = 0 100 | 101 | if 'flag_2' not in item: 102 | item['flag_2'] = 0 103 | 104 | if 'flag_3' not in item: 105 | item['flag_3'] = 0 106 | 107 | if 'home_odds' not in item: 108 | item['home_odds'] = '' 109 | 110 | if 'draw_odds' not in item: 111 | item['draw_odds'] = '' 112 | 113 | if 'away_odds' not in item: 114 | item['away_odds'] = '' 115 | 116 | item['final'] = '1' if is_final else '0' 117 | 118 | return '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s' % ( 119 | item['server_timestamp'], 120 | item['client_timestamp'], 121 | item['home_team'], 122 | item['home_goal'], 123 | item['away_team'], 124 | item['away_goal'], 125 | item['post_time'], 126 | item['flag_1'], 127 | item['flag_2'], 128 | item['flag_3'], 129 | item['home_odds'], 130 | item['draw_odds'], 131 | item['away_odds'], 132 | item['final'] 133 | ) 134 | 135 | def is_valid_item(self, item): 136 | """Checks if given item is valid (has legitimate entries) 137 | Ex: Not a header row, not with irregular data 138 | """ 139 | if 'home_odds' not in item and 'draw_odds' not in item and\ 140 | 'away_odds' not in item and 'post_time' not in item: 141 | return False 142 | return True 143 | 144 | def are_odds_same(self, str1, str2): 145 | odds1 = ','.join(str1.split(',')[-4:-1]) 146 | odds2 = ','.join(str2.split(',')[-4:-1]) 147 | return odds1 == odds2 148 | 149 | def process_item(self, item, spider): 150 | if not self.is_valid_item(item): 151 | logging.warning("Skipping item: " + str(item)) 152 | return item 153 | 154 | # Get file path for decrypted data 155 | # Format Handicap_HAD/decrypted/2015/04/14/Shakhtar_Dontsk-Braga/Handicap_HAD 156 | if item.get('post_time') is None: 157 | return item 158 | 159 | dtobj = datetime.strptime(item.get('post_time'), '%Y-%m-%d %H:%M:%S HKT') 160 | 161 | # match_name = Shakhtar_Dontsk-Braga 162 | if item.get('home_team') is None or item.get('away_team') is None: 163 | return item 164 | match_name = "%s-%s" % (item.get('home_team').strip('"'), 165 | item.get('away_team').strip('"')) 166 | 167 | 168 | decrypted_filepath = 'Handicap_HAD/decrypted/%s/%s/%s/%s/Handicap_HAD' % (dtobj.year, dtobj.month, dtobj.day, match_name) 169 | 170 | # Check if file needs to be changed 171 | file_path = os.path.join(self.decrypted_folder, decrypted_filepath) 172 | 173 | save_item = False 174 | 175 | if 
os.path.isfile(file_path): 176 | # Check if any odds changed 177 | with open(file_path, 'r') as data_file: 178 | last_data = data_file.readlines()[-1].strip() 179 | item_str = self.format_item(item, False) 180 | 181 | if not self.are_odds_same(last_data, item_str): 182 | save_item = True 183 | else: 184 | save_item = True 185 | 186 | if save_item: 187 | self.match_names[match_name] = file_path 188 | 189 | parent_dir = os.path.dirname(file_path) 190 | if not os.path.isdir(parent_dir): 191 | os.makedirs(parent_dir) 192 | 193 | # Saving item in file 194 | with open(file_path, 'a') as data_file: 195 | item_str = self.format_item(item, False) 196 | data_file.write(item_str) 197 | data_file.write('\n') 198 | 199 | # Saving raw file 200 | dtobj = datetime.now() 201 | raw_file = 'soccer/raw/%s/%s/%s/%s_%s-%s-%s_Handicap_Had' % (dtobj.year, dtobj.month, dtobj.day, self.global_count_str, dtobj.hour, dtobj.minute, dtobj.second) 202 | 203 | raw_file_path = os.path.join(self.raw_folder, raw_file) 204 | 205 | parent_dir = os.path.dirname(raw_file_path) 206 | if not os.path.isdir(parent_dir): 207 | os.makedirs(parent_dir) 208 | 209 | with open(raw_file_path, 'w') as data_file: 210 | data_file.write(item.get('html')) 211 | 212 | with open(self.global_count_file, 'w') as f: 213 | f.write(self.global_count_str) 214 | 215 | return item 216 | --------------------------------------------------------------------------------
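
Usage note: the three crawlers are standard Scrapy projects, so each one is started from its own project directory with the scrapy CLI (scrapy crawl hkjc, scrapy crawl science, scrapy crawl wallpaper), and processors/tagger.py is then pointed at the JSON dumps written by the howstuffworks FileWriterPipeline through its -f/-d options. Below is a minimal sketch of calling extract_tags directly; the sample sentence, the sys.path tweak and the nltk resource name are illustrative assumptions, not part of the repository.

    import sys
    sys.path.append('processors')  # tagger.py is a plain script; make it importable from the repo root

    import nltk
    from tagger import extract_tags

    # nltk.pos_tag needs a POS-tagger model; the resource name below matches recent
    # nltk releases and may differ for older pinned versions (assumption).
    nltk.download('averaged_perceptron_tagger')

    # extract_tags keeps runs of consecutive nouns (NN/NNP/NNS/NNPS) together, so the
    # result is a list of noun groups, each a list of (word, POS-tag) pairs.
    groups = extract_tags("Sachin Tendulkar scored a century at Eden Gardens")
    print(groups)
    # roughly: [[('Sachin', 'NNP'), ('Tendulkar', 'NNP')], [('century', 'NN')], [('Eden', 'NNP'), ('Gardens', 'NNP')]]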