├── README.md
├── juno_crawler
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-36.pyc
    │   ├── items.cpython-36.pyc
    │   └── settings.cpython-36.pyc
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
    │   ├── __init__.py
    │   ├── __pycache__
    │       ├── __init__.cpython-36.pyc
    │       └── juno.cpython-36.pyc
    │   └── juno.py
├── scrapy.cfg
└── setup.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Juno Download Crawler
 2 | 
 3 | 
 4 | Crawls [Juno Download](http://www.junodownload.com "Juno Download") and collects data on the entire back catalogue of music singles.
 5 | 
 6 | Fields collected:
 7 | - Artist
 8 | - Title
 9 | - Record label
10 | - Catalog number
11 | - Release date
12 | - Music genre
13 | - Individual track names
14 | - MP3 sample URLs
15 | 
16 | Example output:
17 | 
18 | ```json
19 | [
20 |   {
21 |     "_type": "JunoCrawlerItem",
22 |     "catalog_number": "SB 215-0",
23 |     "title": "Tell Me",
24 |     "release_date": "10 Sep 08",
25 |     "artist": "CLEAR VIEW feat JESSICA",
26 |     "label": "Songbird Holland",
27 |     "tracks": [
28 |       [
29 |         "Tell Me - (6:43)",
30 |         "http://www.junodownload.com/MP3/SF1354749-02-01-01.mp3"
31 |       ],
32 |       [
33 |         "Tell Me (Max Graham remix) - (8:49)",
34 |         "http://www.junodownload.com/MP3/SF1354749-02-01-02.mp3"
35 |       ]
36 |     ],
37 |     "genre": "Progressive House"
38 |   }
39 | ]
40 | ```
41 | 


--------------------------------------------------------------------------------
/juno_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattmurray/juno_crawler/108b041c2d0154ff0fe76111d85b9ae80d9e257a/juno_crawler/__init__.py


--------------------------------------------------------------------------------
/juno_crawler/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattmurray/juno_crawler/108b041c2d0154ff0fe76111d85b9ae80d9e257a/juno_crawler/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/juno_crawler/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattmurray/juno_crawler/108b041c2d0154ff0fe76111d85b9ae80d9e257a/juno_crawler/__pycache__/items.cpython-36.pyc


--------------------------------------------------------------------------------
/juno_crawler/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattmurray/juno_crawler/108b041c2d0154ff0fe76111d85b9ae80d9e257a/juno_crawler/__pycache__/settings.cpython-36.pyc


--------------------------------------------------------------------------------
/juno_crawler/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
class JunoCrawlerItem(scrapy.Item):
    """Record describing a single release scraped from a listing page.

    The declaration order of the fields is kept as-is so that any exporter
    relying on field order keeps producing the same column layout.
    """

    # Text fields taken from the release header.
    artist = scrapy.Field()
    title = scrapy.Field()
    label = scrapy.Field()

    # Sequence of (track name, sample URL) pairs.
    tracks = scrapy.Field()

    # Release metadata.
    catalog_number = scrapy.Field()
    release_date = scrapy.Field()
    genre = scrapy.Field()
18 | 


--------------------------------------------------------------------------------
/juno_crawler/middlewares.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your spider middleware
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
 7 | 
 8 | from scrapy import signals
 9 | 
10 | 
class JunoCrawlerSpiderMiddleware(object):
    """Pass-through spider middleware for the juno_crawler project.

    Every hook forwards what it receives without modifying it. Not all
    methods need to be defined: if a method is missing, Scrapy acts as if
    the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Args:
            crawler: object exposing ``signals.connect``.

        Returns:
            The new middleware instance.
        """
        # This method is used by Scrapy to create the middleware instance.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # NOTE: the hooks below are called on the instance, so they must accept
    # ``self``; without it every call fails with a TypeError because of the
    # extra positional argument.

    def process_spider_input(self, response, spider):
        """Handle a response on its way into the spider.

        Called for each response that goes through the spider middleware
        and into the spider. Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result`` unchanged.

        Called with the results returned from the spider after it has
        processed the response. Must return an iterable of Request, dict
        or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Ignore the exception and return None.

        Called when a spider or a ``process_spider_input()`` method (from
        another spider middleware) raises an exception. Should return
        either None or an iterable of Response, dict or Item objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged.

        Called with the start requests of the spider; works similarly to
        ``process_spider_output()`` except that it doesn't have a response
        associated. Must return only requests (not items).
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        # Lazy %-style arguments: formatting only happens if the record is emitted.
        spider.logger.info('Spider opened: %s', spider.name)
57 | 


--------------------------------------------------------------------------------
/juno_crawler/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class JunoCrawlerPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/juno_crawler/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for juno_crawler project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
BOT_NAME = 'juno_crawler'

# The same package is used for spider lookup and for newly created spiders,
# so the list is derived from the single module path.
NEWSPIDER_MODULE = 'juno_crawler.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'juno_crawler (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'juno_crawler.middlewares.JunoCrawlerSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'juno_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'juno_crawler.pipelines.JunoCrawlerPipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 


--------------------------------------------------------------------------------
/juno_crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/juno_crawler/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattmurray/juno_crawler/108b041c2d0154ff0fe76111d85b9ae80d9e257a/juno_crawler/spiders/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/juno_crawler/spiders/__pycache__/juno.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mattmurray/juno_crawler/108b041c2d0154ff0fe76111d85b9ae80d9e257a/juno_crawler/spiders/__pycache__/juno.cpython-36.pyc


--------------------------------------------------------------------------------
/juno_crawler/spiders/juno.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # import scrapy
 3 | from scrapy import Spider
 4 | from scrapy.http import Request
 5 | from juno_crawler.items import JunoCrawlerItem
 6 | 
 7 | class JunoSpider(Spider):
 8 |     name = "juno"
 9 |     allowed_domains = ["junodownload.com"]
10 |     start_urls = ['http://www.junodownload.com/all/back-cat/releases/?music_product_type=single']
11 | 
12 |     def parse(self, response):
13 |         next_page_url = response.xpath('//a[span[contains(@class, "glyphicon-arrow-right")]]/@href').extract_first()
14 | 
15 |         item = JunoCrawlerItem()
16 | 
17 |         releases = response.xpath('.//div[@class="productlist_widget_container"]')
18 |         for release in releases:
19 |             artist_text = release.xpath(
20 |                 './/div[@class="productlist_widget_product_artists"]/span[@class="jq_highlight pwrtext"]/descendant-or-self::*/text()')
21 |             artist_list = []
22 |             for artist in artist_text:
23 |                 artist_list.append(artist.extract())
24 | 
25 |             artist = ''.join(artist_list)
26 |             item['artist'] = artist
27 | 
28 |             title = release.xpath(
29 |                 './/div[@class="productlist_widget_product_title"]/span[@class="jq_highlight pwrtext"]/a/text()').extract_first()
30 |             item['title'] = title
31 | 
32 |             label = release.xpath(
33 |                 './/div[@class="productlist_widget_product_label"]/span[@class="jq_highlight pwrtext"]/a/text()').extract_first()
34 |             item['label'] = label
35 | 
36 |             track_div = release.xpath('.//div[@class="productlist_widget_tracklist_left"]')
37 |             for tracks in track_div:
38 |                 track_urls = tracks.xpath(
39 |                     './/div[@class="productlist_widget_tracklist_row"]/a[@data-ua_action="play"]/@href').extract() # updated
40 |                 track_name_list = tracks.xpath(
41 |                     './/div[@class="productlist_widget_tracklist_row_text"]/span[@class="jq_highlight"]/text()').extract()
42 |                 track_names = []
43 |                 for tracks in track_name_list:
44 |                     track = tracks.replace('\t','')
45 |                     if len(track) >0:
46 |                         track_names.append(track)
47 | 
48 |             tracks = list(zip(track_names, track_urls))
49 |             item['tracks'] = tracks
50 | 
51 |             catalog_number = release.xpath(
52 |                 './/div[@class="productlist_widget_product_preview_buy"]/text()').extract_first()
53 |             catalog_number = catalog_number.strip()
54 |             item['catalog_number'] = catalog_number
55 | 
56 |             release_date = release.xpath(
57 |                 './/div[@class="productlist_widget_product_preview_buy"]/span/text()').extract_first()
58 |             item['release_date'] = release_date
59 | 
60 |             genre = release.xpath(
61 |                 './/div[@class="productlist_widget_product_preview_buy"]/span/following-sibling::span/text()').extract_first()
62 |             genre = genre.strip()
63 |             item['genre'] = genre
64 | 
65 |             yield item
66 | 
67 |         yield Request(next_page_url)
68 | 


--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = juno_crawler.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = juno_crawler
12 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # Automatically created by: shub deploy
 2 | 
 3 | from setuptools import setup, find_packages
 4 | 
 5 | setup(
 6 |     name         = 'project',
 7 |     version      = '1.0',
 8 |     packages     = find_packages(),
 9 |     entry_points = {'scrapy': ['settings = juno_crawler.settings']},
10 | )
11 | 


--------------------------------------------------------------------------------