├── youtubeCrawler
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── youtubecrawler.py
│   ├── pipelines.py
│   ├── items.py
│   └── settings.py
├── youtubecrawler.sh
└── scrapy.cfg

/youtubeCrawler/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/youtubeCrawler/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/youtubecrawler.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Quick shell alternative to the Scrapy spider: fetch a watch page and pull
# the view, like and dislike counts out of the classic YouTube page markup.
curl --silent https://www.youtube.com/watch?v=SYdGDH1KrnM \
    | egrep "watch-view|yt-uix-button-content" \
    | tr -s " " "\n" \
    | egrep "watch-view|yt-uix-button-content" \
    | tr -s ">|<" "\n" \
    | grep "^[0-9]" \
    | tr -s "\n" " " \
    | cut -d " " -f1,2,4 \
    | tr -s " " "\n"
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = youtubeCrawler.settings

[deploy]
#url = http://localhost:6800/
project = youtubeCrawler
--------------------------------------------------------------------------------
/youtubeCrawler/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class YoutubecrawlerPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/youtubeCrawler/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YoutubecrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/youtubeCrawler/spiders/youtubecrawler.py:
--------------------------------------------------------------------------------
# Spider (not CrawlSpider) is the right base class here: the spider only
# scrapes a fixed start URL, and overriding parse() on a CrawlSpider would
# break its rule-based link following. The deprecated scrapy.contrib import
# path is replaced by scrapy.spiders.
from scrapy.spiders import Spider


class YoutubeCrawler(Spider):

    name = "youtube"
    allowed_domains = ["youtube.com"]
    start_urls = [
        "https://www.youtube.com/watch?v=bAxlsajbcUg"
    ]

    def parse(self, response):
        # These XPaths target the classic YouTube watch-page markup: the
        # view-counter div and the like/dislike button renderer spans.
        # extract_first() returns None instead of raising IndexError when
        # an element is missing.
        visits = response.xpath(
            "//div[@class='watch-view-count']/text()").extract_first()

        likes = response.xpath(
            "//span[contains(@class, 'like-button-renderer')]"
            "/span[position()=1]/button/span/text()").extract_first()

        dislikes = response.xpath(
            "//span[contains(@class, 'like-button-renderer')]"
            "/span[position()=3]/button/span/text()").extract_first()

        print("visits: %s" % visits)
        print("likes: %s" % likes)
        print("dislikes: %s" % dislikes)
--------------------------------------------------------------------------------
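The spider above only prints the counts to stdout, while items.py and
pipelines.py are still the empty startproject stubs. As a sketch of how the
counts could flow through Scrapy's item machinery instead (the VideoStatsItem
name and its fields are illustrative assumptions, not part of the original
project), items.py could define:

    import scrapy


    class VideoStatsItem(scrapy.Item):
        # one field per counter scraped from the watch page (assumed names)
        visits = scrapy.Field()
        likes = scrapy.Field()
        dislikes = scrapy.Field()

The spider's parse() would then end with
yield VideoStatsItem(visits=visits, likes=likes, dislikes=dislikes) instead of
the print calls, and running scrapy crawl youtube -o stats.json from the
directory containing scrapy.cfg would dump the items to a file.
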
"visits: %s" % visits 24 | print "likes: %s" % likes 25 | print "dislikes: %s" % dislikes 26 | -------------------------------------------------------------------------------- /youtubeCrawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for youtubeCrawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'youtubeCrawler' 13 | 14 | SPIDER_MODULES = ['youtubeCrawler.spiders'] 15 | NEWSPIDER_MODULE = 'youtubeCrawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'youtubeCrawler (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'youtubeCrawler.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'youtubeCrawler.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | #ITEM_PIPELINES = { 65 | # 'youtubeCrawler.pipelines.SomePipeline': 300, 66 | #} 67 | 68 | # Enable and configure the AutoThrottle extension (disabled by default) 69 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 70 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 71 | #AUTOTHROTTLE_ENABLED=True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY=5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY=60 76 | # Enable showing throttling stats for every response received: 77 | #AUTOTHROTTLE_DEBUG=False 78 | 79 | # Enable and configure HTTP caching (disabled by default) 80 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 81 | #HTTPCACHE_ENABLED=True 82 | 
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
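The YoutubecrawlerPipeline in pipelines.py above just passes items through.
As a minimal sketch of a pipeline that persists scraped items, assuming the
spider yields items as suggested earlier (the JsonWriterPipeline class name
and the videostats.jl filename are assumptions, not part of the original
project):

    import json


    class JsonWriterPipeline(object):
        # open_spider/close_spider are standard Scrapy pipeline hooks,
        # called once when the spider starts and finishes.
        def open_spider(self, spider):
            self.file = open('videostats.jl', 'w')

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            # one JSON object per line, one line per scraped item
            self.file.write(json.dumps(dict(item)) + "\n")
            return item

To activate it, the ITEM_PIPELINES block in settings.py would be uncommented
and pointed at 'youtubeCrawler.pipelines.JsonWriterPipeline' with a priority
such as 300.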