├── .gitignore
├── README.md
├── javLibraryCrawl
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── pipelines.py.old
    ├── requirements.txt
    ├── result
    ├── settings.py
    └── spiders
    │   ├── __init__.py
    │   ├── actor.py
    │   ├── best_rated.py
    │   ├── most_wanted.py
    │   ├── new_entries.py
    │   └── new_releases.py
├── scrapy.cfg
└── src
    └── images
        └── empty


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | # image files
60 | src/images/
61 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # JavLibraryCrawler
 2 | 
 3 | This project allows you to scrape all movies from the javLibrary.
 4 | It  crawl the following items:
 5 | * Title
 6 | * Designation
 7 | * URL to the library website
 8 | * list of category
 9 | * Release Date
10 | * Duration
11 | * Actor
12 | * Cover image URL
13 | * Cover image hash value
14 | 
15 | It will also download cover image in local and generate the corresponded thum, you can configurate the image setting in [settings](https://github.com/hukewei/JavLibraryCrawler/blob/master/javLibraryCrawl/settings.py).
16 | 
17 | The tutorial for the image settings can be found [here](http://doc.scrapy.org/en/latest/topics/images.html).
18 | 
19 | ##Install
20 | 
21 | * Install pip [from here](https://pip.pypa.io/en/latest/installing.html).
22 | 
23 | * Install scrapy [from here](http://doc.scrapy.org/en/latest/intro/install.html).
24 | 
25 | * Install dependencies:
26 | ```
27 | pip install -r requirements.txt
28 | ```
29 | 
30 | ##Run
31 | This project contains two type of crawlers:
32 | 
33 | * Best rated movies (best_rated_spider)
34 | 
35 | * ALL movies (actor_spider)
36 | 
37 | To start the crawlers, please run : 
38 | Crawl only best rated movies (500 movies) :
39 | ```
40 | scrapy crawl best_rated_spider
41 | ```
42 | or crawl all movies in the library(> 150000 movies, the somme of all cover images is around 16 GB ).
43 | ```
44 | scrapy crawl actor_spider
45 | ```
46 | 
47 | ##Credit
48 | This project uses the [scrapy](https://github.com/scrapy/scrapy) to build the crawlers.
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/javLibraryCrawl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hukewei/JavLibraryCrawler/7ef12431c3df76db8c053411bb0a4b0fa3da188f/javLibraryCrawl/__init__.py


--------------------------------------------------------------------------------
/javLibraryCrawl/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class JavlibrarycrawlItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     title = scrapy.Field() #
14 |     designation = scrapy.Field() # ABP-108
15 |     url = scrapy.Field() #javliiqq6e
16 |     category = scrapy.Field() # categories
17 |     release_date = scrapy.Field() # 2015-04-24
18 |     duration = scrapy.Field() # 120
19 |     image_urls = scrapy.Field() #
20 |     images = scrapy.Field()
21 |     actor = scrapy.Field()
22 | 


--------------------------------------------------------------------------------
/javLibraryCrawl/pipelines.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Define your item pipelines here
  4 | #
  5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
  6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
  7 | import pymongo
  8 | import json
  9 | import sys
 10 | import jpush as jpush
 11 | 
 12 | from scrapy.contrib.pipeline.images import ImagesPipeline
 13 | from scrapy.exceptions import DropItem
 14 | from scrapy.http import Request
 15 | from scrapy.conf import settings
 16 | from scrapy import log
 17 | from conf import app_key, master_secret
 18 | 
 19 |             
 20 | class JavlibrarycrawlPipeline(ImagesPipeline):
 21 |     def process_item(self, item, spider):
 22 |         return item
 23 | 
 24 |     def get_media_requests(self, item, info):
 25 |         for image_url in item['image_urls']:
 26 |             yield Request(image_url)
 27 | 
 28 | class MongoDBPipeline(object):
 29 |     def __init__(self):
 30 |        reload(sys)
 31 |        sys.setdefaultencoding("utf-8")
 32 |        self._jpush = jpush.JPush(app_key, master_secret) 
 33 |         
 34 |     def process_item(self, item, spider):
 35 |         connection = pymongo.MongoClient(
 36 |             settings['MONGODB_SERVER'],
 37 |             settings['MONGODB_PORT']
 38 |         )
 39 |         db = connection[settings['MONGODB_DB']]
 40 |         if spider.name == 'actor_spider':
 41 |             self.collection = db[settings['MONGODB_COLLECTION']]
 42 |         elif spider.name == 'best_rated_spider':
 43 |             self.collection = db[settings['MONGODB_COLLECTION_BEST_RATED']]
 44 |         elif spider.name == 'most_wanted_spider':
 45 |             self.collection = db[settings['MONGODB_COLLECTION_MOST_WANTED']]
 46 |         elif spider.name == 'new_releases_spider':
 47 |             self.collection = db[settings['MONGODB_COLLECTION_NEW_RELEASES']]
 48 |         elif spider.name == 'new_entries_spider':
 49 |             self.collection = db[settings['MONGODB_COLLECTION_NEW_ENTRIES']]
 50 |         else:
 51 |             self.collection = db[settings['MONGODB_COLLECTION']]
 52 |         valid = True
 53 |         for data in item:
 54 |             if not data:
 55 |                 valid = False
 56 |                 raise DropItem("Missing {0}!".format(data))
 57 |         title = item['title']
 58 |         actors = item['actor']
 59 |         if self.is_already_in_db(title):
 60 |             valid = False
 61 |         if valid:
 62 |             _id = self.collection.insert(dict(item))
 63 |             self.send_notification(_id, title, actors)
 64 |             log.msg("Question added to MongoDB database!",
 65 |                     level=log.DEBUG, spider=spider)
 66 |         return item
 67 | 
 68 |     def send_notification(self, _id, title, actors):
 69 | 	for actor in actors:
 70 |             client_ids = self.get_actor_subscribers(actor)
 71 |             if client_ids:
 72 |                 push = self._jpush.create_push()
 73 |                 push.audience = jpush.audience(
 74 |                     jpush.registration_id(*client_ids)
 75 |                 )
 76 | 	        log.msg("client ids = " + json.dumps(client_ids))
 77 | 	        message = jpush.android(alert=u'新片通知 : 您关注的艺人 ' + actor.encode('utf-8') + u' 有新片 %s ，点击查看详情。' %(title.encode('utf-8')), extras={'VideoID':str(_id)})
 78 |                 push.notification = jpush.notification(alert=u'新片通知 : 您关注的艺人发布了新片，点击查看详情。', android=message)
 79 | 	        log.msg("Sending push notification for %s and %s" %(title, actor))
 80 |                 push.platform = jpush.all_
 81 |                 push.send()
 82 | 
 83 |     def is_already_in_db(self, title):
 84 |         'check if the title is already in the current db or not'
 85 |         return self.collection.find( { 'title': { '$exists': True, '$in': [title] } } ).count() > 0
 86 | 
 87 |     def get_actor_subscribers(self, actor):
 88 |         'iterate the membersPreference collection, find client_id where favorite_actors contains actor'
 89 |         connection = pymongo.MongoClient(
 90 |             settings['MONGODB_SERVER'],
 91 |             settings['MONGODB_PORT']
 92 |         )
 93 |         db = connection[settings['MONGODB_DB']]
 94 |         self.collection = db['membersPreference']
 95 |         result = []
 96 |         cursor = self.collection.find({'notified_actors' : actor}, { 'clientID': 1, '_id':0 })
 97 |         for record in cursor:
 98 |             result.append(record.get('clientID'))
 99 | 	result.append('02068f6a423')
100 | 	return result
101 | 
102 | 


--------------------------------------------------------------------------------
/javLibraryCrawl/pipelines.py.old:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | import pymongo
 8 | 
 9 | from scrapy.contrib.pipeline.images import ImagesPipeline
10 | from scrapy.exceptions import DropItem
11 | from scrapy.http import Request
12 | from scrapy.conf import settings
13 | from scrapy import log
14 | 
15 | 
16 | 
17 | class JavlibrarycrawlPipeline(ImagesPipeline):
18 |     def process_item(self, item, spider):
19 |         return item
20 | 
21 |     def get_media_requests(self, item, info):
22 |     	for image_url in item['image_urls']:
23 |     		yield Request(image_url)
24 | 
25 | 
26 | 
27 | class MongoDBPipeline(object):
28 |     def process_item(self, item, spider):
29 |         connection = pymongo.MongoClient(
30 |             settings['MONGODB_SERVER'],
31 |             settings['MONGODB_PORT']
32 |         )
33 |         db = connection[settings['MONGODB_DB']]
34 |         if spider.name == 'actor_spider':
35 |             self.collection = db[settings['MONGODB_COLLECTION']]
36 |         elif spider.name == 'best_rated_spider':
37 |             self.collection = db[settings['MONGODB_COLLECTION_BEST_RATED']]
38 |         elif spider.name == 'most_wanted_spider':
39 |             self.collection = db[settings['MONGODB_COLLECTION_MOST_WANTED']]
40 |         elif spider.name == 'new_releases_spider':
41 |             self.collection = db[settings['MONGODB_COLLECTION_NEW_RELEASES']]
42 |         elif spider.name == 'new_entries_spider':
43 |             self.collection = db[settings['MONGODB_COLLECTION_NEW_ENTRIES']]
44 |         else:
45 |             self.collection = db[settings['MONGODB_COLLECTION']]
46 |         valid = True
47 |         for data in item:
48 |             if not data:
49 |                 valid = False
50 |                 raise DropItem("Missing {0}!".format(data))
51 |         if valid:
52 |             self.collection.insert(dict(item))
53 |             log.msg("Question added to MongoDB database!",
54 |                     level=log.DEBUG, spider=spider)
55 |         return item
56 | 


--------------------------------------------------------------------------------
/javLibraryCrawl/requirements.txt:
--------------------------------------------------------------------------------
1 | image>=1.3.9
2 | pymongo>=3.0.1
3 | requests>=2.7.0
4 | jpush>=3.0.1
5 | 


--------------------------------------------------------------------------------
/javLibraryCrawl/result:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hukewei/JavLibraryCrawler/7ef12431c3df76db8c053411bb0a4b0fa3da188f/javLibraryCrawl/result


--------------------------------------------------------------------------------
/javLibraryCrawl/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for javLibraryCrawl project
 4 | #
 5 | # For simplicity, this file contains only the most important settings by
 6 | # default. All the other settings are documented here:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #
10 | 
11 | BOT_NAME = 'javLibraryCrawl'
12 | 
13 | SPIDER_MODULES = ['javLibraryCrawl.spiders']
14 | NEWSPIDER_MODULE = 'javLibraryCrawl.spiders'
15 | ITEM_PIPELINES = {'javLibraryCrawl.pipelines.MongoDBPipeline':5,}
16 | 
17 | IMAGES_STORE = 'src/images'
18 | 
19 | IMAGES_THUMBS = {
20 |     'small': (50, 50),
21 | }
22 | 
23 | MONGODB_SERVER = "localhost"
24 | MONGODB_PORT = 27017
25 | MONGODB_DB = "javLibrary"
26 | MONGODB_COLLECTION = "videos"
27 | MONGODB_COLLECTION_ALL = "videos"
28 | MONGODB_COLLECTION_BEST_RATED = "best_rated"
29 | MONGODB_COLLECTION_MOST_WANTED = "most_wanted"
30 | MONGODB_COLLECTION_NEW_RELEASES = "new_releases"
31 | MONGODB_COLLECTION_NEW_ENTRIES = "new_entries"
32 | 
33 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
34 | #USER_AGENT = 'javLibraryCrawl (+http://www.yourdomain.com)'
35 | 


--------------------------------------------------------------------------------
/javLibraryCrawl/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/javLibraryCrawl/spiders/actor.py:
--------------------------------------------------------------------------------
 1 | from scrapy.contrib.spiders import CrawlSpider, Rule
 2 | from scrapy.selector import Selector
 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 4 | from scrapy import log
 5 | 
 6 | from javLibraryCrawl.items import JavlibrarycrawlItem
 7 | 
 8 | 
 9 | class BestRatedSpider(CrawlSpider):
10 |     name = "actor_spider"
11 |     allowed_domains = ["javlibrary.com"]
12 |     start_urls = [
13 |         "http://www.javlibrary.com/cn/star_list.php",
14 |     ]
15 |     rules = (
16 |         # Extract links matching 'category.php'
17 |         # and follow links from them (since no callback means follow=True by default).
18 |         Rule(SgmlLinkExtractor(allow=('vl_star\.php', ))),
19 |         # Extract links matching 'item.php' and parse them with the spider's method parse_item
20 |         Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True),
21 |     )
22 | 
23 |     def parse_video(self, response):
24 |         """
25 |         The lines below is a spider contract. For more info see:
26 |         http://www.javlibrary.com/cn/star_list.php
27 |         @url http://www.javlibrary.com/cn/star_list.php
28 |         @scrapes name
29 |         """
30 |         video = Selector(response)
31 |         items = []
32 | 
33 |         item = JavlibrarycrawlItem()
34 |         item['url'] = response.request.url
35 |         item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract()
36 |         item['title'] = video.xpath("//h3/a/text()").extract()[0]
37 |         item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0]
38 |         item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract()
39 |         item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract()
40 |         item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract()
41 |         item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract()
42 |         items.append(item)
43 | 
44 |         return items


--------------------------------------------------------------------------------
/javLibraryCrawl/spiders/best_rated.py:
--------------------------------------------------------------------------------
 1 | from scrapy.contrib.spiders import CrawlSpider, Rule
 2 | from scrapy.selector import Selector
 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 4 | from scrapy import log
 5 | 
 6 | from javLibraryCrawl.items import JavlibrarycrawlItem
 7 | 
 8 | 
 9 | class BestRatedSpider(CrawlSpider):
10 |     name = "best_rated_spider"
11 |     allowed_domains = ["javlibrary.com"]
12 |     start_urls = [
13 |         "http://www.javlibrary.com/cn/vl_bestrated.php?&mode=&page=25",
14 |     ]
15 |     rules = (
16 |         # Extract links matching 'category.php'
17 |         # and follow links from them (since no callback means follow=True by default).
18 |         Rule(SgmlLinkExtractor(allow=('vl_bestrated\.php', ))),
19 |         # Extract links matching 'item.php' and parse them with the spider's method parse_item
20 |         Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True),
21 |     )
22 | 
23 |     def parse_video(self, response):
24 |         """
25 |         The lines below is a spider contract. For more info see:
26 |         http://www.javlibrary.com/cn/vl_bestrated.php
27 |         @url http://www.javlibrary.com/cn/vl_bestrated.php
28 |         @scrapes name
29 |         """
30 |         video = Selector(response)
31 |         items = []
32 | 
33 |         item = JavlibrarycrawlItem()
34 |         item['url'] = response.request.url
35 |         item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract()
36 |         item['title'] = video.xpath("//h3/a/text()").extract()[0]
37 |         item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0]
38 |         item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract()
39 |         item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract()
40 |         item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract()
41 |         item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract()
42 |         items.append(item)
43 | 
44 |         return items


--------------------------------------------------------------------------------
/javLibraryCrawl/spiders/most_wanted.py:
--------------------------------------------------------------------------------
 1 | from scrapy.contrib.spiders import CrawlSpider, Rule
 2 | from scrapy.selector import Selector
 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 4 | from scrapy import log
 5 | 
 6 | from javLibraryCrawl.items import JavlibrarycrawlItem
 7 | 
 8 | 
 9 | class BestRatedSpider(CrawlSpider):
10 |     name = "most_wanted_spider"
11 |     allowed_domains = ["javlibrary.com"]
12 |     start_urls = [
13 |         "http://www.javlibrary.com/cn/vl_mostwanted.php?&mode=&page=25",
14 |     ]
15 |     rules = (
16 |         # Extract links matching 'category.php'
17 |         # and follow links from them (since no callback means follow=True by default).
18 |         Rule(SgmlLinkExtractor(allow=('vl_mostwanted\.php', ))),
19 |         # Extract links matching 'item.php' and parse them with the spider's method parse_item
20 |         Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True),
21 |     )
22 | 
23 |     def parse_video(self, response):
24 |         """
25 |         The lines below is a spider contract. For more info see:
26 |         http://www.javlibrary.com/cn/vl_mostwanted.php
27 |         @url http://www.javlibrary.com/cn/vl_mostwanted.php
28 |         @scrapes name
29 |         """
30 |         video = Selector(response)
31 |         items = []
32 | 
33 |         item = JavlibrarycrawlItem()
34 |         item['url'] = response.request.url
35 |         item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract()
36 |         item['title'] = video.xpath("//h3/a/text()").extract()[0]
37 |         item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0]
38 |         item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract()
39 |         item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract()
40 |         item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract()
41 |         item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract()
42 |         items.append(item)
43 | 
44 |         return items


--------------------------------------------------------------------------------
/javLibraryCrawl/spiders/new_entries.py:
--------------------------------------------------------------------------------
 1 | from scrapy.contrib.spiders import CrawlSpider, Rule
 2 | from scrapy.selector import Selector
 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 4 | from scrapy import log
 5 | 
 6 | from javLibraryCrawl.items import JavlibrarycrawlItem
 7 | 
 8 | 
 9 | class BestRatedSpider(CrawlSpider):
10 |     name = "new_entries_spider"
11 |     allowed_domains = ["javlibrary.com"]
12 |     start_urls = [
13 |         "http://www.javlibrary.com/cn/vl_newentries.php?&mode=&page=25",
14 |     ]
15 |     rules = (
16 |         # Extract links matching 'category.php'
17 |         # and follow links from them (since no callback means follow=True by default).
18 |         Rule(SgmlLinkExtractor(allow=('vl_newentries\.php', ))),
19 |         # Extract links matching 'item.php' and parse them with the spider's method parse_item
20 |         Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True),
21 |     )
22 | 
23 |     def parse_video(self, response):
24 |         """
25 |         The lines below is a spider contract. For more info see:
26 |         http://www.javlibrary.com/cn/vl_newentries.php
27 |         @url http://www.javlibrary.com/cn/vl_newentries.php
28 |         @scrapes name
29 |         """
30 |         video = Selector(response)
31 |         items = []
32 | 
33 |         item = JavlibrarycrawlItem()
34 |         item['url'] = response.request.url
35 |         item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract()
36 |         item['title'] = video.xpath("//h3/a/text()").extract()[0]
37 |         item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0]
38 |         item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract()
39 |         item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract()
40 |         item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract()
41 |         item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract()
42 |         items.append(item)
43 | 
44 |         return items


--------------------------------------------------------------------------------
/javLibraryCrawl/spiders/new_releases.py:
--------------------------------------------------------------------------------
 1 | from scrapy.contrib.spiders import CrawlSpider, Rule
 2 | from scrapy.selector import Selector
 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 4 | from scrapy import log
 5 | 
 6 | from javLibraryCrawl.items import JavlibrarycrawlItem
 7 | 
 8 | 
 9 | class BestRatedSpider(CrawlSpider):
10 |     name = "new_releases_spider"
11 |     allowed_domains = ["javlibrary.com"]
12 |     start_urls = [
13 |         "http://www.javlibrary.com/cn/vl_newrelease.php",
14 |     ]
15 |     rules = (
16 |         # Extract links matching 'category.php'
17 |         # and follow links from them (since no callback means follow=True by default).
18 |         Rule(SgmlLinkExtractor(allow=('vl_newrelease\.php', ))),
19 |         # Extract links matching 'item.php' and parse them with the spider's method parse_item
20 |         Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True),
21 |     )
22 | 
23 |     def parse_video(self, response):
24 |         """
25 |         The lines below is a spider contract. For more info see:
26 |         http://www.javlibrary.com/cn/vl_newrelease.php
27 |         @url http://www.javlibrary.com/cn/vl_newrelease.php
28 |         @scrapes name
29 |         """
30 |         video = Selector(response)
31 |         items = []
32 | 
33 |         item = JavlibrarycrawlItem()
34 |         item['url'] = response.request.url
35 |         item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract()
36 |         item['title'] = video.xpath("//h3/a/text()").extract()[0]
37 |         item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0]
38 |         item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract()
39 |         item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract()
40 |         item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract()
41 |         item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract()
42 |         items.append(item)
43 | 
44 |         return items
45 | 


--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = javLibraryCrawl.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = javLibraryCrawl
12 | 


--------------------------------------------------------------------------------
/src/images/empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hukewei/JavLibraryCrawler/7ef12431c3df76db8c053411bb0a4b0fa3da188f/src/images/empty


--------------------------------------------------------------------------------