├── .gitignore ├── README.md ├── javLibraryCrawl ├── __init__.py ├── items.py ├── pipelines.py ├── pipelines.py.old ├── requirements.txt ├── result ├── settings.py └── spiders │ ├── __init__.py │ ├── actor.py │ ├── best_rated.py │ ├── most_wanted.py │ ├── new_entries.py │ └── new_releases.py ├── scrapy.cfg └── src └── images └── empty /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # image files 60 | src/images/ 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JavLibraryCrawler 2 | 3 | This project allows you to scrape all movies from the javLibrary. 4 | It crawl the following items: 5 | * Title 6 | * Designation 7 | * URL to the library website 8 | * list of category 9 | * Release Date 10 | * Duration 11 | * Actor 12 | * Cover image URL 13 | * Cover image hash value 14 | 15 | It will also download cover image in local and generate the corresponded thum, you can configurate the image setting in [settings](https://github.com/hukewei/JavLibraryCrawler/blob/master/javLibraryCrawl/settings.py). 16 | 17 | The tutorial for the image settings can be found [here](http://doc.scrapy.org/en/latest/topics/images.html). 18 | 19 | ##Install 20 | 21 | * Install pip [from here](https://pip.pypa.io/en/latest/installing.html). 22 | 23 | * Install scrapy [from here](http://doc.scrapy.org/en/latest/intro/install.html). 24 | 25 | * Install dependencies: 26 | ``` 27 | pip install -r requirements.txt 28 | ``` 29 | 30 | ##Run 31 | This project contains two type of crawlers: 32 | 33 | * Best rated movies (best_rated_spider) 34 | 35 | * ALL movies (actor_spider) 36 | 37 | To start the crawlers, please run : 38 | Crawl only best rated movies (500 movies) : 39 | ``` 40 | scrapy crawl best_rated_spider 41 | ``` 42 | or crawl all movies in the library(> 150000 movies, the somme of all cover images is around 16 GB ). 43 | ``` 44 | scrapy crawl actor_spider 45 | ``` 46 | 47 | ##Credit 48 | This project uses the [scrapy](https://github.com/scrapy/scrapy) to build the crawlers. 49 | 50 | 51 | -------------------------------------------------------------------------------- /javLibraryCrawl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hukewei/JavLibraryCrawler/7ef12431c3df76db8c053411bb0a4b0fa3da188f/javLibraryCrawl/__init__.py -------------------------------------------------------------------------------- /javLibraryCrawl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class JavlibrarycrawlItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | title = scrapy.Field() # 14 | designation = scrapy.Field() # ABP-108 15 | url = scrapy.Field() #javliiqq6e 16 | category = scrapy.Field() # categories 17 | release_date = scrapy.Field() # 2015-04-24 18 | duration = scrapy.Field() # 120 19 | image_urls = scrapy.Field() # 20 | images = scrapy.Field() 21 | actor = scrapy.Field() 22 | -------------------------------------------------------------------------------- /javLibraryCrawl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | import json 9 | import sys 10 | import jpush as jpush 11 | 12 | from scrapy.contrib.pipeline.images import ImagesPipeline 13 | from scrapy.exceptions import DropItem 14 | from scrapy.http import Request 15 | from scrapy.conf import settings 16 | from scrapy import log 17 | from conf import app_key, master_secret 18 | 19 | 20 | class JavlibrarycrawlPipeline(ImagesPipeline): 21 | def process_item(self, item, spider): 22 | return item 23 | 24 | def get_media_requests(self, item, info): 25 | for image_url in item['image_urls']: 26 | yield Request(image_url) 27 | 28 | class MongoDBPipeline(object): 29 | def __init__(self): 30 | reload(sys) 31 | sys.setdefaultencoding("utf-8") 32 | self._jpush = jpush.JPush(app_key, master_secret) 33 | 34 | def process_item(self, item, spider): 35 | connection = pymongo.MongoClient( 36 | settings['MONGODB_SERVER'], 37 | settings['MONGODB_PORT'] 38 | ) 39 | db = connection[settings['MONGODB_DB']] 40 | if spider.name == 'actor_spider': 41 | self.collection = db[settings['MONGODB_COLLECTION']] 42 | elif spider.name == 'best_rated_spider': 43 | self.collection = db[settings['MONGODB_COLLECTION_BEST_RATED']] 44 | elif spider.name == 'most_wanted_spider': 45 | self.collection = db[settings['MONGODB_COLLECTION_MOST_WANTED']] 46 | elif spider.name == 'new_releases_spider': 47 | self.collection = db[settings['MONGODB_COLLECTION_NEW_RELEASES']] 48 | elif spider.name == 'new_entries_spider': 49 | self.collection = db[settings['MONGODB_COLLECTION_NEW_ENTRIES']] 50 | else: 51 | self.collection = db[settings['MONGODB_COLLECTION']] 52 | valid = True 53 | for data in item: 54 | if not data: 55 | valid = False 56 | raise DropItem("Missing {0}!".format(data)) 57 | title = item['title'] 58 | actors = item['actor'] 59 | if self.is_already_in_db(title): 60 | valid = False 61 | if valid: 62 | _id = self.collection.insert(dict(item)) 63 | self.send_notification(_id, title, actors) 64 | log.msg("Question added to MongoDB database!", 65 | level=log.DEBUG, spider=spider) 66 | return item 67 | 68 | def send_notification(self, _id, title, actors): 69 | for actor in actors: 70 | client_ids = self.get_actor_subscribers(actor) 71 | if client_ids: 72 | push = self._jpush.create_push() 73 | push.audience = jpush.audience( 74 | jpush.registration_id(*client_ids) 75 | ) 76 | log.msg("client ids = " + json.dumps(client_ids)) 77 | message = jpush.android(alert=u'新片通知 : 您关注的艺人 ' + actor.encode('utf-8') + u' 有新片 %s ,点击查看详情。' %(title.encode('utf-8')), extras={'VideoID':str(_id)}) 78 | push.notification = jpush.notification(alert=u'新片通知 : 您关注的艺人发布了新片,点击查看详情。', android=message) 79 | log.msg("Sending push notification for %s and %s" %(title, actor)) 80 | push.platform = jpush.all_ 81 | push.send() 82 | 83 | def is_already_in_db(self, title): 84 | 'check if the title is already in the current db or not' 85 | return self.collection.find( { 'title': { '$exists': True, '$in': [title] } } ).count() > 0 86 | 87 | def get_actor_subscribers(self, actor): 88 | 'iterate the membersPreference collection, find client_id where favorite_actors contains actor' 89 | connection = pymongo.MongoClient( 90 | settings['MONGODB_SERVER'], 91 | settings['MONGODB_PORT'] 92 | ) 93 | db = connection[settings['MONGODB_DB']] 94 | self.collection = db['membersPreference'] 95 | result = [] 96 | cursor = self.collection.find({'notified_actors' : actor}, { 'clientID': 1, '_id':0 }) 97 | for record in cursor: 98 | result.append(record.get('clientID')) 99 | result.append('02068f6a423') 100 | return result 101 | 102 | -------------------------------------------------------------------------------- /javLibraryCrawl/pipelines.py.old: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | from scrapy.contrib.pipeline.images import ImagesPipeline 10 | from scrapy.exceptions import DropItem 11 | from scrapy.http import Request 12 | from scrapy.conf import settings 13 | from scrapy import log 14 | 15 | 16 | 17 | class JavlibrarycrawlPipeline(ImagesPipeline): 18 | def process_item(self, item, spider): 19 | return item 20 | 21 | def get_media_requests(self, item, info): 22 | for image_url in item['image_urls']: 23 | yield Request(image_url) 24 | 25 | 26 | 27 | class MongoDBPipeline(object): 28 | def process_item(self, item, spider): 29 | connection = pymongo.MongoClient( 30 | settings['MONGODB_SERVER'], 31 | settings['MONGODB_PORT'] 32 | ) 33 | db = connection[settings['MONGODB_DB']] 34 | if spider.name == 'actor_spider': 35 | self.collection = db[settings['MONGODB_COLLECTION']] 36 | elif spider.name == 'best_rated_spider': 37 | self.collection = db[settings['MONGODB_COLLECTION_BEST_RATED']] 38 | elif spider.name == 'most_wanted_spider': 39 | self.collection = db[settings['MONGODB_COLLECTION_MOST_WANTED']] 40 | elif spider.name == 'new_releases_spider': 41 | self.collection = db[settings['MONGODB_COLLECTION_NEW_RELEASES']] 42 | elif spider.name == 'new_entries_spider': 43 | self.collection = db[settings['MONGODB_COLLECTION_NEW_ENTRIES']] 44 | else: 45 | self.collection = db[settings['MONGODB_COLLECTION']] 46 | valid = True 47 | for data in item: 48 | if not data: 49 | valid = False 50 | raise DropItem("Missing {0}!".format(data)) 51 | if valid: 52 | self.collection.insert(dict(item)) 53 | log.msg("Question added to MongoDB database!", 54 | level=log.DEBUG, spider=spider) 55 | return item 56 | -------------------------------------------------------------------------------- /javLibraryCrawl/requirements.txt: -------------------------------------------------------------------------------- 1 | image>=1.3.9 2 | pymongo>=3.0.1 3 | requests>=2.7.0 4 | jpush>=3.0.1 5 | -------------------------------------------------------------------------------- /javLibraryCrawl/result: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hukewei/JavLibraryCrawler/7ef12431c3df76db8c053411bb0a4b0fa3da188f/javLibraryCrawl/result -------------------------------------------------------------------------------- /javLibraryCrawl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for javLibraryCrawl project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'javLibraryCrawl' 12 | 13 | SPIDER_MODULES = ['javLibraryCrawl.spiders'] 14 | NEWSPIDER_MODULE = 'javLibraryCrawl.spiders' 15 | ITEM_PIPELINES = {'javLibraryCrawl.pipelines.MongoDBPipeline':5,} 16 | 17 | IMAGES_STORE = 'src/images' 18 | 19 | IMAGES_THUMBS = { 20 | 'small': (50, 50), 21 | } 22 | 23 | MONGODB_SERVER = "localhost" 24 | MONGODB_PORT = 27017 25 | MONGODB_DB = "javLibrary" 26 | MONGODB_COLLECTION = "videos" 27 | MONGODB_COLLECTION_ALL = "videos" 28 | MONGODB_COLLECTION_BEST_RATED = "best_rated" 29 | MONGODB_COLLECTION_MOST_WANTED = "most_wanted" 30 | MONGODB_COLLECTION_NEW_RELEASES = "new_releases" 31 | MONGODB_COLLECTION_NEW_ENTRIES = "new_entries" 32 | 33 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 34 | #USER_AGENT = 'javLibraryCrawl (+http://www.yourdomain.com)' 35 | -------------------------------------------------------------------------------- /javLibraryCrawl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /javLibraryCrawl/spiders/actor.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import CrawlSpider, Rule 2 | from scrapy.selector import Selector 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 4 | from scrapy import log 5 | 6 | from javLibraryCrawl.items import JavlibrarycrawlItem 7 | 8 | 9 | class BestRatedSpider(CrawlSpider): 10 | name = "actor_spider" 11 | allowed_domains = ["javlibrary.com"] 12 | start_urls = [ 13 | "http://www.javlibrary.com/cn/star_list.php", 14 | ] 15 | rules = ( 16 | # Extract links matching 'category.php' 17 | # and follow links from them (since no callback means follow=True by default). 18 | Rule(SgmlLinkExtractor(allow=('vl_star\.php', ))), 19 | # Extract links matching 'item.php' and parse them with the spider's method parse_item 20 | Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True), 21 | ) 22 | 23 | def parse_video(self, response): 24 | """ 25 | The lines below is a spider contract. For more info see: 26 | http://www.javlibrary.com/cn/star_list.php 27 | @url http://www.javlibrary.com/cn/star_list.php 28 | @scrapes name 29 | """ 30 | video = Selector(response) 31 | items = [] 32 | 33 | item = JavlibrarycrawlItem() 34 | item['url'] = response.request.url 35 | item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract() 36 | item['title'] = video.xpath("//h3/a/text()").extract()[0] 37 | item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0] 38 | item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract() 39 | item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract() 40 | item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract() 41 | item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract() 42 | items.append(item) 43 | 44 | return items -------------------------------------------------------------------------------- /javLibraryCrawl/spiders/best_rated.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import CrawlSpider, Rule 2 | from scrapy.selector import Selector 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 4 | from scrapy import log 5 | 6 | from javLibraryCrawl.items import JavlibrarycrawlItem 7 | 8 | 9 | class BestRatedSpider(CrawlSpider): 10 | name = "best_rated_spider" 11 | allowed_domains = ["javlibrary.com"] 12 | start_urls = [ 13 | "http://www.javlibrary.com/cn/vl_bestrated.php?&mode=&page=25", 14 | ] 15 | rules = ( 16 | # Extract links matching 'category.php' 17 | # and follow links from them (since no callback means follow=True by default). 18 | Rule(SgmlLinkExtractor(allow=('vl_bestrated\.php', ))), 19 | # Extract links matching 'item.php' and parse them with the spider's method parse_item 20 | Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True), 21 | ) 22 | 23 | def parse_video(self, response): 24 | """ 25 | The lines below is a spider contract. For more info see: 26 | http://www.javlibrary.com/cn/vl_bestrated.php 27 | @url http://www.javlibrary.com/cn/vl_bestrated.php 28 | @scrapes name 29 | """ 30 | video = Selector(response) 31 | items = [] 32 | 33 | item = JavlibrarycrawlItem() 34 | item['url'] = response.request.url 35 | item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract() 36 | item['title'] = video.xpath("//h3/a/text()").extract()[0] 37 | item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0] 38 | item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract() 39 | item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract() 40 | item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract() 41 | item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract() 42 | items.append(item) 43 | 44 | return items -------------------------------------------------------------------------------- /javLibraryCrawl/spiders/most_wanted.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import CrawlSpider, Rule 2 | from scrapy.selector import Selector 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 4 | from scrapy import log 5 | 6 | from javLibraryCrawl.items import JavlibrarycrawlItem 7 | 8 | 9 | class BestRatedSpider(CrawlSpider): 10 | name = "most_wanted_spider" 11 | allowed_domains = ["javlibrary.com"] 12 | start_urls = [ 13 | "http://www.javlibrary.com/cn/vl_mostwanted.php?&mode=&page=25", 14 | ] 15 | rules = ( 16 | # Extract links matching 'category.php' 17 | # and follow links from them (since no callback means follow=True by default). 18 | Rule(SgmlLinkExtractor(allow=('vl_mostwanted\.php', ))), 19 | # Extract links matching 'item.php' and parse them with the spider's method parse_item 20 | Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True), 21 | ) 22 | 23 | def parse_video(self, response): 24 | """ 25 | The lines below is a spider contract. For more info see: 26 | http://www.javlibrary.com/cn/vl_mostwanted.php 27 | @url http://www.javlibrary.com/cn/vl_mostwanted.php 28 | @scrapes name 29 | """ 30 | video = Selector(response) 31 | items = [] 32 | 33 | item = JavlibrarycrawlItem() 34 | item['url'] = response.request.url 35 | item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract() 36 | item['title'] = video.xpath("//h3/a/text()").extract()[0] 37 | item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0] 38 | item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract() 39 | item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract() 40 | item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract() 41 | item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract() 42 | items.append(item) 43 | 44 | return items -------------------------------------------------------------------------------- /javLibraryCrawl/spiders/new_entries.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import CrawlSpider, Rule 2 | from scrapy.selector import Selector 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 4 | from scrapy import log 5 | 6 | from javLibraryCrawl.items import JavlibrarycrawlItem 7 | 8 | 9 | class BestRatedSpider(CrawlSpider): 10 | name = "new_entries_spider" 11 | allowed_domains = ["javlibrary.com"] 12 | start_urls = [ 13 | "http://www.javlibrary.com/cn/vl_newentries.php?&mode=&page=25", 14 | ] 15 | rules = ( 16 | # Extract links matching 'category.php' 17 | # and follow links from them (since no callback means follow=True by default). 18 | Rule(SgmlLinkExtractor(allow=('vl_newentries\.php', ))), 19 | # Extract links matching 'item.php' and parse them with the spider's method parse_item 20 | Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True), 21 | ) 22 | 23 | def parse_video(self, response): 24 | """ 25 | The lines below is a spider contract. For more info see: 26 | http://www.javlibrary.com/cn/vl_newentries.php 27 | @url http://www.javlibrary.com/cn/vl_newentries.php 28 | @scrapes name 29 | """ 30 | video = Selector(response) 31 | items = [] 32 | 33 | item = JavlibrarycrawlItem() 34 | item['url'] = response.request.url 35 | item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract() 36 | item['title'] = video.xpath("//h3/a/text()").extract()[0] 37 | item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0] 38 | item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract() 39 | item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract() 40 | item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract() 41 | item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract() 42 | items.append(item) 43 | 44 | return items -------------------------------------------------------------------------------- /javLibraryCrawl/spiders/new_releases.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import CrawlSpider, Rule 2 | from scrapy.selector import Selector 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 4 | from scrapy import log 5 | 6 | from javLibraryCrawl.items import JavlibrarycrawlItem 7 | 8 | 9 | class BestRatedSpider(CrawlSpider): 10 | name = "new_releases_spider" 11 | allowed_domains = ["javlibrary.com"] 12 | start_urls = [ 13 | "http://www.javlibrary.com/cn/vl_newrelease.php", 14 | ] 15 | rules = ( 16 | # Extract links matching 'category.php' 17 | # and follow links from them (since no callback means follow=True by default). 18 | Rule(SgmlLinkExtractor(allow=('vl_newrelease\.php', ))), 19 | # Extract links matching 'item.php' and parse them with the spider's method parse_item 20 | Rule(SgmlLinkExtractor(allow=(r'/\?v=jav.*',)), callback='parse_video', follow=True), 21 | ) 22 | 23 | def parse_video(self, response): 24 | """ 25 | The lines below is a spider contract. For more info see: 26 | http://www.javlibrary.com/cn/vl_newrelease.php 27 | @url http://www.javlibrary.com/cn/vl_newrelease.php 28 | @scrapes name 29 | """ 30 | video = Selector(response) 31 | items = [] 32 | 33 | item = JavlibrarycrawlItem() 34 | item['url'] = response.request.url 35 | item['image_urls'] = video.xpath("//*[@id='video_jacket_img']/@src").extract() 36 | item['title'] = video.xpath("//h3/a/text()").extract()[0] 37 | item['designation'] = video.xpath('//*[@id="video_id"]/table/tr/td[2]/text()').extract()[0] 38 | item['category'] = video.xpath('//*[@class="genre"]/a/text()').extract() 39 | item['actor'] = video.xpath('//*[@class="star"]/a/text()').extract() 40 | item['duration'] = video.xpath('//*[@id="video_length"]/table/tr/td[2]/span/text()').extract() 41 | item['release_date'] = video.xpath('//*[@id="video_date"]/table/tr/td[2]/text()').extract() 42 | items.append(item) 43 | 44 | return items 45 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = javLibraryCrawl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = javLibraryCrawl 12 | -------------------------------------------------------------------------------- /src/images/empty: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hukewei/JavLibraryCrawler/7ef12431c3df76db8c053411bb0a4b0fa3da188f/src/images/empty --------------------------------------------------------------------------------