├── EDBSpider
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── Download.py
│       ├── Spider.py
│       └── __init__.py
├── README.md
├── img
│   ├── Platform.png
│   ├── Trend.png
│   ├── Type.png
│   └── Year.png
├── main.py
└── scrapy.cfg

/EDBSpider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/EDBSpider/__init__.py
--------------------------------------------------------------------------------
/EDBSpider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EdbspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    id = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    type = scrapy.Field()
    platform = scrapy.Field()
    # category = scrapy.Field()


class DownloadItem(scrapy.Item):
    files = scrapy.Field()
    file_urls = scrapy.Field()
--------------------------------------------------------------------------------
/EDBSpider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class EdbspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class EdbspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/EDBSpider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.files import FilesPipeline
from urlparse import urlparse  # Python 2 module; this project targets Python 2
from os.path import basename


class FileDownloadPipeline(FilesPipeline):
    # Store each downloaded exploit under its original file name instead of
    # the default hashed path used by FilesPipeline.
    def file_path(self, request, response=None, info=None):
        path = urlparse(request.url).path
        return basename(path)


class EdbspiderPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
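Portability note (not part of the project source): the pipeline above imports `urlparse` and uses the older `file_path()` signature, so it targets Python 2 and an early Scrapy release. A minimal sketch of the same idea on Python 3 with a recent Scrapy might look like the following; the keyword-only `item` argument is passed by Scrapy 2.4+ and is simply ignored here.

```
from os.path import basename
from urllib.parse import urlparse

from scrapy.pipelines.files import FilesPipeline


class FileDownloadPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # Keep each download under its original name, e.g.
        # ".../download/12345" -> "12345", instead of the default hashed path.
        return basename(urlparse(request.url).path)
```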
/EDBSpider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for EDBSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'EDBSpider'

SPIDER_MODULES = ['EDBSpider.spiders']
NEWSPIDER_MODULE = 'EDBSpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'

# FEED_URI = 'EDB.csv'
# FEED_FORMAT = 'CSV'

FEED_EXPORT_FIELDS = ["id", "title", "author", "date", "type", "platform"]

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'EDBSpider.middlewares.EdbspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'EDBSpider.middlewares.EdbspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'EDBSpider.pipelines.FileDownloadPipeline': 300,
    # 'EDBSpider.pipelines.EdbspiderPipeline': 300,
}

FILES_STORE = 'Download'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/EDBSpider/spiders/Download.py:
--------------------------------------------------------------------------------
import scrapy
from EDBSpider.items import DownloadItem
import HTMLParser  # Python 2 module (html / html.parser on Python 3)


class Download(scrapy.Spider):
    name = 'Download'
    # allowed_domains = ['exploit-db.com']
    start_urls = [
        'https://www.exploit-db.com/webapps/',
        'https://www.exploit-db.com/remote/',
        'https://www.exploit-db.com/local/',
        'https://www.exploit-db.com/dos/'
    ]

    def parse(self, response):
        # print response.url
        selector = scrapy.Selector(response)
        rows = selector.xpath('//table[@class="exploit_list bootstrap-wrapper"]/tbody/tr')
        for piece in rows:
            # Build one item per table row so earlier rows are not overwritten
            # while FilesPipeline is still downloading them.
            item = DownloadItem()
            dlink = piece.xpath('td[@class="dlink"]/a/@href').extract()  # download link
            item['file_urls'] = dlink
            item['files'] = dlink[0].split("/")[4]
            yield item

        next_page = selector.xpath('//div[@class="pagination"]').re(r'href="(.*?)">next')
        if next_page:
            html_parser = HTMLParser.HTMLParser()
            url = html_parser.unescape(next_page[0])
            yield scrapy.http.Request(url, callback=self.parse)
--------------------------------------------------------------------------------
/EDBSpider/spiders/Spider.py:
--------------------------------------------------------------------------------
import scrapy
from EDBSpider.items import EdbspiderItem
import HTMLParser  # Python 2 module (html / html.parser on Python 3)


class EDBSpider(scrapy.Spider):
    name = 'EDBSpider'
    # allowed_domains = ['exploit-db.com']
    start_urls = [
        'https://old.exploit-db.com/webapps/',
        'https://old.exploit-db.com/remote/',
        'https://old.exploit-db.com/local/',
        'https://old.exploit-db.com/dos/'
    ]

    def parse(self, response):
        # print response.url
        selector = scrapy.Selector(response)
        rows = selector.xpath('//table[@class="exploit_list bootstrap-wrapper"]/tbody/tr')
        for piece in rows:
            # Build one item per table row so earlier rows are not overwritten.
            item = EdbspiderItem()
            item['date'] = piece.xpath('td[@class="date"]/text()').re(r'(\d+-\d+-\d+)')  # Date
            item['id'] = piece.xpath('td[@class="description"]/a/@href').re(r'/(\d+)/')  # EDB-ID
            item['title'] = piece.xpath('td[@class="description"]/a/@title').extract()  # Title
            item['platform'] = piece.xpath('td[@class="platform"]/a/@title').extract()  # Platform
            item['author'] = piece.xpath('td[@class="author"]/a/@title').extract()  # Author
            item['type'] = response.url.split("/")[3]  # Type (webapps/remote/local/dos)
            # item['category'] = piece.xpath('td[@class="description"]/a/@title').extract()[0].split(' - ')[-1]
            yield item

        next_page = selector.xpath('//div[@class="pagination"]').re(r'href="(.*?)">next')
        if next_page:
            html_parser = HTMLParser.HTMLParser()
            url = html_parser.unescape(next_page[0])
            yield scrapy.http.Request(url, callback=self.parse)
--------------------------------------------------------------------------------
/EDBSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
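Portability note (not part of the project source): both spiders use Python 2's `HTMLParser` module only to unescape the href of the 'next' pagination link before requesting it. On Python 3 the same step can be written with `html.unescape`, and `response.follow` also resolves relative links; a sketch of just the pagination part, assuming the rest of `parse()` stays as above:

```
import html

import scrapy


class EDBSpider(scrapy.Spider):
    name = 'EDBSpider'

    def parse(self, response):
        # ... extract and yield items from the current page as in Spider.py ...
        next_page = response.xpath('//div[@class="pagination"]').re(r'href="(.*?)">next')
        if next_page:
            # html.unescape replaces HTMLParser.HTMLParser().unescape;
            # response.follow handles relative URLs against the current page.
            yield response.follow(html.unescape(next_page[0]), callback=self.parse)
```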
/README.md:
--------------------------------------------------------------------------------
# Exploit-DB-Spider

A Scrapy-based crawler for the Exploit Database
![Trend](/img/Trend.png)

## Background

This is a web crawler, based on Scrapy, that collects exploit (EXP) information from the Exploit Database (a CVE-compliant archive of public exploits and corresponding vulnerable software).

What can you do with this web crawler?
* Crawl all the exploit metadata on the Exploit Database and store it locally;
* Run data analysis on the crawled exploit information (a small sketch is included at the end of this listing);
* Download all the exploit scripts hosted on the Exploit Database.

## Install

```
$ git clone https://github.com/Dy1aNT/Exploit-DB-Spider.git
$ cd Exploit-DB-Spider
```

Install dependencies

```
$ pip install -r requirements.txt
```

## Usage

```
$ python main.py
```

## Result

![Year](/img/Year.png)
![Type](/img/Type.png)
![Platform](/img/Platform.png)
--------------------------------------------------------------------------------
/img/Platform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/img/Platform.png
--------------------------------------------------------------------------------
/img/Trend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/img/Trend.png
--------------------------------------------------------------------------------
/img/Type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/img/Type.png
--------------------------------------------------------------------------------
/img/Year.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/img/Year.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from scrapy import cmdline


# Crawl exploit metadata and export it to EDB.csv; swap the comments below
# to download the exploit files with the Download spider instead.
cmdline.execute("scrapy crawl EDBSpider -o EDB.csv".split())
#cmdline.execute("scrapy crawl Download".split())
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = EDBSpider.settings

[deploy]
#url = http://localhost:6800/
project = EDBSpider
--------------------------------------------------------------------------------
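Analysis sketch (not part of the project source): the README mentions analysing the crawled data, and `main.py` exports it to `EDB.csv` with the column order set by `FEED_EXPORT_FIELDS`. Assuming pandas and matplotlib are installed and `EDB.csv` is present in the working directory, a per-year count like the one behind the Year/Trend charts could be produced roughly as follows:

```
import pandas as pd
import matplotlib.pyplot as plt

# Columns follow FEED_EXPORT_FIELDS in settings.py:
# id, title, author, date, type, platform
df = pd.read_csv('EDB.csv')

# Dates are scraped as YYYY-MM-DD strings; count exploits per year.
df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
per_year = df.groupby('year').size()

per_year.plot(kind='bar', title='Exploits per year')
plt.tight_layout()
plt.show()
```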