├── crawl_service
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── websosanh.py
│   │   └── lazada.py
│   ├── pipelines.py
│   ├── items.py
│   ├── run.py
│   ├── middlewares.py
│   └── settings.py
├── scrapy.cfg
├── requirements.txt
└── README.md

/crawl_service/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/crawl_service/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = crawl_service.settings

[deploy]
#url = http://localhost:6800/
project = crawl_service
--------------------------------------------------------------------------------
/crawl_service/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class CrawlServicePipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/crawl_service/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CrawlServiceItem(scrapy.Item):
    pass
    #name = scrapy.Field()
    #image = scrapy.Field()
    #price = scrapy.Field()
    #short_description = scrapy.Field()
    #full_description = scrapy.Field()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
asn1crypto==0.24.0
attrs==18.2.0
Automat==0.7.0
certifi==2018.10.15
cffi==1.11.5
chardet==3.0.4
constantly==15.1.0
cryptography==2.3.1
cssselect==1.0.3
enum34==1.1.6
functools32==3.2.3.post2
hyperlink==18.0.0
idna==2.7
incremental==17.5.0
ipaddress==1.0.22
lxml==4.2.5
parsel==1.5.1
pyasn1==0.4.4
pyasn1-modules==0.2.2
pycparser==2.19
PyDispatcher==2.0.5
PyHamcrest==1.9.0
pyOpenSSL==18.0.0
queuelib==1.5.0
Scrapy==1.5.1
scrapy-splash==0.7.2
service-identity==17.0.0
six==1.11.0
Twisted==18.9.0
urllib3==1.24
w3lib==1.19.0
zope.interface==4.6.0
--------------------------------------------------------------------------------
/crawl_service/run.py:
--------------------------------------------------------------------------------
import logging

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from twisted.internet.error import ReactorNotRestartable

from spiders.websosanh import WebsosanhSpider
from spiders.lazada import LazadaSpider

configure_logging(install_root_handler=False)
logging.basicConfig(
    filename='crawl.log',
    format='%(levelname)s: %(message)s',
    level=logging.WARNING)

if __name__ == '__main__':

    process = CrawlerProcess(get_project_settings())
    process.crawl(WebsosanhSpider)
    process.crawl(LazadaSpider)

    try:
        process.start()
    except ReactorNotRestartable:
        pass
    finally:
        process.stop()
--------------------------------------------------------------------------------
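run.py above starts both spiders inside a single CrawlerProcess and sends log output to crawl.log. If only one spider is needed, or the scraped items should go straight to a file, a variant along these lines works with the same CrawlerProcess API — the feed settings and output filename here are illustrative, not part of the project:

```
# Sketch of a run.py variant (run from the crawl_service/ directory, like
# run.py): crawl only the Websosanh spider and export the yielded items to
# a JSON-lines file. FEED_FORMAT/FEED_URI are the feed-export settings of
# Scrapy 1.5; "websosanh_items.jl" is just an example path.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from spiders.websosanh import WebsosanhSpider

if __name__ == '__main__':
    settings = get_project_settings()
    settings.set('FEED_FORMAT', 'jsonlines')
    settings.set('FEED_URI', 'websosanh_items.jl')

    process = CrawlerProcess(settings)
    process.crawl(WebsosanhSpider)
    process.start()  # blocks until the crawl finishes
```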
/README.md:
--------------------------------------------------------------------------------
# Crawler-JS
Uses scrapy-splash combined with a Lua script to crawl websites that rely on JavaScript (websosanh).
```
├── crawl_service
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── run.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── websosanh.py
│       └── lazada.py
├── requirements.txt
└── scrapy.cfg
```

- Install Splash

Install Docker, then run
```
$ sudo docker pull scrapinghub/splash
```
and
```
$ sudo docker run -p 8050:8050 scrapinghub/splash
```
- Install the other required libraries (a virtualenv is recommended)
```
pip install -r requirements.txt
```
- Run the script
```
python run.py
```
or
```
scrapy crawl wss
scrapy crawl lazada
```
--------------------------------------------------------------------------------
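Both spiders below yield `name`, `price`, and `image` fields, and CrawlServicePipeline in pipelines.py currently passes items through untouched. If the raw price strings ever need normalising, a pipeline along these lines could be registered in ITEM_PIPELINES — the assumed "1.990.000 đ"-style price format is an illustration, not something verified against the live sites:

```
# -*- coding: utf-8 -*-
# Hypothetical pipeline sketch: keep only the digits of the scraped price
# string and store the result as an integer. Assumes prices look like
# "1.990.000 đ"; adapt the parsing if the sites format prices differently.
import re


class PriceCleaningPipeline(object):
    def process_item(self, item, spider):
        raw = item.get('price')
        if raw:
            digits = re.sub(r'[^\d]', '', raw)
            item['price'] = int(digits) if digits else None
        return item
```

It would be enabled by adding 'crawl_service.pipelines.PriceCleaningPipeline' to the ITEM_PIPELINES dict in settings.py, alongside or in place of CrawlServicePipeline.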
/crawl_service/spiders/websosanh.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest


class CrawlServiceItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    image = scrapy.Field()


class WebsosanhSpider(scrapy.Spider):
    name = "wss"
    allowed_domains = ["websosanh.vn"]

    start_urls = ["https://websosanh.vn/socola/cat-2053.htm"]

    script = """
    function main(splash)
        local url = splash.args.url
        assert(splash:go(url))
        assert(splash:wait(0.5))
        assert(splash:runjs("$('.next')[0].click();"))
        return {
            html = splash:html(),
            url = splash:url(),
        }
    end
    """

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, endpoint="render.html", callback=self.parse)

    def parse(self, response):
        for data in response.xpath("//li[@class='item ']"):
            item = CrawlServiceItem()
            item["name"] = data.xpath("./h3/a/text()").extract_first()
            if item["name"] is None:
                item["name"] = data.xpath("./h2/a/text()").extract_first()
            item["price"] = data.xpath("./div[2]/text()").extract_first()
            item["image"] = data.xpath("./div[1]/a/img[1]/@data-src").extract_first()
            yield item

        yield SplashRequest(
            url=response.url,
            callback=self.parse,
            meta={
                "splash": {"endpoint": "execute", "args": {"lua_source": self.script}}
            },
        )
--------------------------------------------------------------------------------
/crawl_service/spiders/lazada.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest


lua_script = """
function main(splash)
    local num_scrolls = 10
    local scroll_delay = 1.0
    local scroll_to = splash:jsfunc("window.scrollTo")
    local get_body_height = splash:jsfunc(
        "function() {return document.body.scrollHeight;}"
    )
    assert(splash:go(splash.args.url))
    splash:wait(splash.args.wait)
    for _ = 1, num_scrolls do
        scroll_to(0, get_body_height())
        splash:wait(scroll_delay)
    end
    return splash:html()
end
"""


class CrawlServiceItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    image = scrapy.Field()


class LazadaSpider(scrapy.Spider):

    name = "lazada"
    allowed_domains = ['lazada.vn']
    start_urls = ["https://www.lazada.vn/dien-thoai-di-dong"]  # + str(i)
    # for i in range(1, 103)]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse,
                                endpoint="execute",
                                args={'wait': 2, 'lua_source': lua_script})

    def parse(self, response):
        for data in response.xpath('//div[@data-qa-locator="product-item"]'):
            item = CrawlServiceItem()
            item["name"] = data.xpath(
                './div[37]/div/div/div[2]/div[2]/a/text()').extract_first()
            item["price"] = data.xpath(
                './div[38]/div/div/div[2]/div[3]/span/text()').extract_first()
            item["image"] = data.xpath(
                './div[38]/div/div/div[1]/div[1]/a/img/@src').extract_first()
            yield item
--------------------------------------------------------------------------------
/crawl_service/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class CrawlServiceSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CrawlServiceDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
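Both classes above are the untouched `scrapy startproject` boilerplate, and settings.py (below) never enables them. If they are ever fleshed out, they have to be merged into the middleware dicts that scrapy-splash already occupies. A sketch, with the priorities for the crawl_service entries taken from the commented-out defaults in settings.py:

```
# Illustrative only: enabling the boilerplate middlewares in settings.py
# alongside the scrapy-splash entries already configured there.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    'crawl_service.middlewares.CrawlServiceSpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    'crawl_service.middlewares.CrawlServiceDownloaderMiddleware': 543,
}
```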
/crawl_service/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for crawl_service project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'crawl_service'

SPIDER_MODULES = ['crawl_service.spiders']
NEWSPIDER_MODULE = 'crawl_service.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'crawl_service (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


SPLASH_URL = 'http://127.0.0.1:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
COOKIES_ENABLED = True
SPLASH_COOKIES_DEBUG = False
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
}


# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'crawl_service.middlewares.CrawlServiceSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'crawl_service.middlewares.CrawlServiceDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'crawl_service.pipelines.CrawlServicePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------