├── crawl_service
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── websosanh.py
│   │   └── lazada.py
│   ├── pipelines.py
│   ├── items.py
│   ├── run.py
│   ├── middlewares.py
│   └── settings.py
├── scrapy.cfg
├── requirements.txt
└── README.md

/crawl_service/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/crawl_service/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = crawl_service.settings

[deploy]
#url = http://localhost:6800/
project = crawl_service
--------------------------------------------------------------------------------
/crawl_service/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class CrawlServicePipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/crawl_service/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CrawlServiceItem(scrapy.Item):
    pass
    #name = scrapy.Field()
    #image = scrapy.Field()
    #price = scrapy.Field()
    #short_description = scrapy.Field()
    #full_description = scrapy.Field()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
asn1crypto==0.24.0
attrs==18.2.0
Automat==0.7.0
certifi==2018.10.15
cffi==1.11.5
chardet==3.0.4
constantly==15.1.0
cryptography==2.3.1
cssselect==1.0.3
enum34==1.1.6
functools32==3.2.3.post2
hyperlink==18.0.0
idna==2.7
incremental==17.5.0
ipaddress==1.0.22
lxml==4.2.5
parsel==1.5.1
pyasn1==0.4.4
pyasn1-modules==0.2.2
pycparser==2.19
PyDispatcher==2.0.5
PyHamcrest==1.9.0
pyOpenSSL==18.0.0
queuelib==1.5.0
Scrapy==1.5.1
scrapy-splash==0.7.2
service-identity==17.0.0
six==1.11.0
Twisted==18.9.0
urllib3==1.24
w3lib==1.19.0
zope.interface==4.6.0
--------------------------------------------------------------------------------
/crawl_service/run.py:
--------------------------------------------------------------------------------
import logging

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from twisted.internet.error import ReactorNotRestartable

from spiders.websosanh import WebsosanhSpider
from spiders.lazada import LazadaSpider

configure_logging(install_root_handler=False)
logging.basicConfig(
    filename='crawl.log',
    format='%(levelname)s: %(message)s',
    level=logging.WARNING)

if __name__ == '__main__':

    process = CrawlerProcess(get_project_settings())
    process.crawl(WebsosanhSpider)
    process.crawl(LazadaSpider)

    try:
        process.start()
    except ReactorNotRestartable:
        pass
    finally:
        process.stop()
--------------------------------------------------------------------------------
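run.py above starts both spiders inside a single CrawlerProcess and sends log output to crawl.log. If only one spider is needed, or the scraped items should go straight to a file, a variant along these lines works with the same CrawlerProcess API — the feed settings and output filename here are illustrative, not part of the project:

```
# Sketch of a run.py variant (run from the crawl_service/ directory, like
# run.py): crawl only the Websosanh spider and export the yielded items to
# a JSON-lines file. FEED_FORMAT/FEED_URI are the feed-export settings of
# Scrapy 1.5; "websosanh_items.jl" is just an example path.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from spiders.websosanh import WebsosanhSpider

if __name__ == '__main__':
    settings = get_project_settings()
    settings.set('FEED_FORMAT', 'jsonlines')
    settings.set('FEED_URI', 'websosanh_items.jl')

    process = CrawlerProcess(settings)
    process.crawl(WebsosanhSpider)
    process.start()  # blocks until the crawl finishes
```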
/README.md:
--------------------------------------------------------------------------------
# Crawler-JS
Uses scrapy-splash combined with a Lua script to crawl websites that rely on JavaScript (websosanh).
```
├── crawl_service
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── run.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── websosanh.py
│       └── lazada.py
├── requirements.txt
└── scrapy.cfg
```

- Install Splash

Install Docker, then run
```
$ sudo docker pull scrapinghub/splash
```
and
```
$ sudo docker run -p 8050:8050 scrapinghub/splash
```
- Install the other required libraries (a virtualenv is recommended)
```
pip install -r requirements.txt
```
- Run the script
```
python run.py
```
or
```
scrapy crawl wss
scrapy crawl lazada
```
--------------------------------------------------------------------------------
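Both spiders below yield `name`, `price`, and `image` fields, and CrawlServicePipeline in pipelines.py currently passes items through untouched. If the raw price strings ever need normalising, a pipeline along these lines could be registered in ITEM_PIPELINES — the assumed "1.990.000 đ"-style price format is an illustration, not something verified against the live sites:

```
# -*- coding: utf-8 -*-
# Hypothetical pipeline sketch: keep only the digits of the scraped price
# string and store the result as an integer. Assumes prices look like
# "1.990.000 đ"; adapt the parsing if the sites format prices differently.
import re


class PriceCleaningPipeline(object):
    def process_item(self, item, spider):
        raw = item.get('price')
        if raw:
            digits = re.sub(r'[^\d]', '', raw)
            item['price'] = int(digits) if digits else None
        return item
```

It would be enabled by adding 'crawl_service.pipelines.PriceCleaningPipeline' to the ITEM_PIPELINES dict in settings.py, alongside or in place of CrawlServicePipeline.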
/crawl_service/spiders/websosanh.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest


class CrawlServiceItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    image = scrapy.Field()


class WebsosanhSpider(scrapy.Spider):
    name = "wss"
    allowed_domains = ["websosanh.vn"]

    start_urls = ["https://websosanh.vn/socola/cat-2053.htm"]

    script = """
    function main(splash)
        local url = splash.args.url
        assert(splash:go(url))
        assert(splash:wait(0.5))
        assert(splash:runjs("$('.next')[0].click();"))
        return {
            html = splash:html(),
            url = splash:url(),
        }
    end
    """

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, endpoint="render.html", callback=self.parse)

    def parse(self, response):
        for data in response.xpath("//li[@class='item ']"):
            item = CrawlServiceItem()
            item["name"] = data.xpath("./h3/a/text()").extract_first()
            if item["name"] is None:
                item["name"] = data.xpath("./h2/a/text()").extract_first()
            item["price"] = data.xpath("./div[2]/text()").extract_first()
            item["image"] = data.xpath("./div[1]/a/img[1]/@data-src").extract_first()
            yield item

        yield SplashRequest(
            url=response.url,
            callback=self.parse,
            meta={
                "splash": {"endpoint": "execute", "args": {"lua_source": self.script}}
            },
        )
--------------------------------------------------------------------------------
/crawl_service/spiders/lazada.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest


lua_script = """
function main(splash)
    local num_scrolls = 10
    local scroll_delay = 1.0
    local scroll_to = splash:jsfunc("window.scrollTo")
    local get_body_height = splash:jsfunc(
        "function() {return document.body.scrollHeight;}"
    )
    assert(splash:go(splash.args.url))
    splash:wait(splash.args.wait)
    for _ = 1, num_scrolls do
        scroll_to(0, get_body_height())
        splash:wait(scroll_delay)
    end
    return splash:html()
end
"""


class CrawlServiceItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    image = scrapy.Field()


class LazadaSpider(scrapy.Spider):

    name = "lazada"
    allowed_domains = ['lazada.vn']
    start_urls = ["https://www.lazada.vn/dien-thoai-di-dong"]  # + str(i)
    # for i in range(1, 103)]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse,
                                endpoint="execute",
                                args={'wait': 2, 'lua_source': lua_script})

    def parse(self, response):
        for data in response.xpath('//div[@data-qa-locator="product-item"]'):
            item = CrawlServiceItem()
            item["name"] = data.xpath(
                './div[37]/div/div/div[2]/div[2]/a/text()').extract_first()
            item["price"] = data.xpath(
                './div[38]/div/div/div[2]/div[3]/span/text()').extract_first()
            item["image"] = data.xpath(
                './div[38]/div/div/div[1]/div[1]/a/img/@src').extract_first()
            yield item
--------------------------------------------------------------------------------
/crawl_service/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class CrawlServiceSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CrawlServiceDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
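Both classes above are the untouched `scrapy startproject` boilerplate, and settings.py (below) never enables them. If they are ever fleshed out, they have to be merged into the middleware dicts that scrapy-splash already occupies. A sketch, with the priorities for the crawl_service entries taken from the commented-out defaults in settings.py:

```
# Illustrative only: enabling the boilerplate middlewares in settings.py
# alongside the scrapy-splash entries already configured there.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    'crawl_service.middlewares.CrawlServiceSpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    'crawl_service.middlewares.CrawlServiceDownloaderMiddleware': 543,
}
```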
/crawl_service/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for crawl_service project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'crawl_service'

SPIDER_MODULES = ['crawl_service.spiders']
NEWSPIDER_MODULE = 'crawl_service.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'crawl_service (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


SPLASH_URL = 'http://127.0.0.1:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
COOKIES_ENABLED = True
SPLASH_COOKIES_DEBUG = False
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
}


# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'crawl_service.middlewares.CrawlServiceSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'crawl_service.middlewares.CrawlServiceDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'crawl_service.pipelines.CrawlServicePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------