├── EDBSpider
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── Download.py
│       ├── Spider.py
│       └── __init__.py
├── README.md
├── img
│   ├── Platform.png
│   ├── Trend.png
│   ├── Type.png
│   └── Year.png
├── main.py
└── scrapy.cfg

/EDBSpider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/EDBSpider/__init__.py
--------------------------------------------------------------------------------
/EDBSpider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EdbspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    id = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    type = scrapy.Field()
    platform = scrapy.Field()
    # category = scrapy.Field()


class DownloadItem(scrapy.Item):
    files = scrapy.Field()
    file_urls = scrapy.Field()
--------------------------------------------------------------------------------
/EDBSpider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class EdbspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class EdbspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/EDBSpider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.files import FilesPipeline
from urlparse import urlparse  # Python 2 module; this project targets Python 2
from os.path import basename


class FileDownloadPipeline(FilesPipeline):
    # Store each downloaded exploit under its original file name instead of
    # the default hashed path used by FilesPipeline.
    def file_path(self, request, response=None, info=None):
        path = urlparse(request.url).path
        return basename(path)


class EdbspiderPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
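Portability note (not part of the project source): the pipeline above imports `urlparse` and uses the older `file_path()` signature, so it targets Python 2 and an early Scrapy release. A minimal sketch of the same idea on Python 3 with a recent Scrapy might look like the following; the keyword-only `item` argument is passed by Scrapy 2.4+ and is simply ignored here.

```
from os.path import basename
from urllib.parse import urlparse

from scrapy.pipelines.files import FilesPipeline


class FileDownloadPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # Keep each download under its original name, e.g.
        # ".../download/12345" -> "12345", instead of the default hashed path.
        return basename(urlparse(request.url).path)
```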
/EDBSpider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for EDBSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'EDBSpider'

SPIDER_MODULES = ['EDBSpider.spiders']
NEWSPIDER_MODULE = 'EDBSpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'

# FEED_URI = 'EDB.csv'
# FEED_FORMAT = 'CSV'

FEED_EXPORT_FIELDS = ["id", "title", "author", "date", "type", "platform"]

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'EDBSpider.middlewares.EdbspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'EDBSpider.middlewares.EdbspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'EDBSpider.pipelines.FileDownloadPipeline': 300,
    # 'EDBSpider.pipelines.EdbspiderPipeline': 300,
}

FILES_STORE = 'Download'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/EDBSpider/spiders/Download.py:
--------------------------------------------------------------------------------
import scrapy
from EDBSpider.items import DownloadItem
import HTMLParser  # Python 2 module (html / html.parser on Python 3)


class Download(scrapy.Spider):
    name = 'Download'
    # allowed_domains = ['exploit-db.com']
    start_urls = [
        'https://www.exploit-db.com/webapps/',
        'https://www.exploit-db.com/remote/',
        'https://www.exploit-db.com/local/',
        'https://www.exploit-db.com/dos/'
    ]

    def parse(self, response):
        # print response.url
        selector = scrapy.Selector(response)
        rows = selector.xpath('//table[@class="exploit_list bootstrap-wrapper"]/tbody/tr')
        for piece in rows:
            # Build one item per table row so earlier rows are not overwritten
            # while FilesPipeline is still downloading them.
            item = DownloadItem()
            dlink = piece.xpath('td[@class="dlink"]/a/@href').extract()  # download link
            item['file_urls'] = dlink
            item['files'] = dlink[0].split("/")[4]
            yield item

        next_page = selector.xpath('//div[@class="pagination"]').re(r'href="(.*?)">next')
        if next_page:
            html_parser = HTMLParser.HTMLParser()
            url = html_parser.unescape(next_page[0])
            yield scrapy.http.Request(url, callback=self.parse)
--------------------------------------------------------------------------------
/EDBSpider/spiders/Spider.py:
--------------------------------------------------------------------------------
import scrapy
from EDBSpider.items import EdbspiderItem
import HTMLParser  # Python 2 module (html / html.parser on Python 3)


class EDBSpider(scrapy.Spider):
    name = 'EDBSpider'
    # allowed_domains = ['exploit-db.com']
    start_urls = [
        'https://old.exploit-db.com/webapps/',
        'https://old.exploit-db.com/remote/',
        'https://old.exploit-db.com/local/',
        'https://old.exploit-db.com/dos/'
    ]

    def parse(self, response):
        # print response.url
        selector = scrapy.Selector(response)
        rows = selector.xpath('//table[@class="exploit_list bootstrap-wrapper"]/tbody/tr')
        for piece in rows:
            # Build one item per table row so earlier rows are not overwritten.
            item = EdbspiderItem()
            item['date'] = piece.xpath('td[@class="date"]/text()').re(r'(\d+-\d+-\d+)')  # Date
            item['id'] = piece.xpath('td[@class="description"]/a/@href').re(r'/(\d+)/')  # EDB-ID
            item['title'] = piece.xpath('td[@class="description"]/a/@title').extract()  # Title
            item['platform'] = piece.xpath('td[@class="platform"]/a/@title').extract()  # Platform
            item['author'] = piece.xpath('td[@class="author"]/a/@title').extract()  # Author
            item['type'] = response.url.split("/")[3]  # Type (webapps/remote/local/dos)
            # item['category'] = piece.xpath('td[@class="description"]/a/@title').extract()[0].split(' - ')[-1]
            yield item

        next_page = selector.xpath('//div[@class="pagination"]').re(r'href="(.*?)">next')
        if next_page:
            html_parser = HTMLParser.HTMLParser()
            url = html_parser.unescape(next_page[0])
            yield scrapy.http.Request(url, callback=self.parse)
--------------------------------------------------------------------------------
/EDBSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
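Portability note (not part of the project source): both spiders use Python 2's `HTMLParser` module only to unescape the href of the 'next' pagination link before requesting it. On Python 3 the same step can be written with `html.unescape`, and `response.follow` also resolves relative links; a sketch of just the pagination part, assuming the rest of `parse()` stays as above:

```
import html

import scrapy


class EDBSpider(scrapy.Spider):
    name = 'EDBSpider'

    def parse(self, response):
        # ... extract and yield items from the current page as in Spider.py ...
        next_page = response.xpath('//div[@class="pagination"]').re(r'href="(.*?)">next')
        if next_page:
            # html.unescape replaces HTMLParser.HTMLParser().unescape;
            # response.follow handles relative URLs against the current page.
            yield response.follow(html.unescape(next_page[0]), callback=self.parse)
```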
/README.md:
--------------------------------------------------------------------------------
# Exploit-DB-Spider

A Scrapy-based crawler for the Exploit Database
![Trend](/img/Trend.png)

## Background

This is a web crawler, based on Scrapy, that collects exploit (EXP) information from the Exploit Database (a CVE-compliant archive of public exploits and corresponding vulnerable software).

What can you do with this web crawler?
* Crawl all the exploit metadata on the Exploit Database and store it locally;
* Run data analysis on the crawled exploit information (a small sketch is included at the end of this listing);
* Download all the exploit scripts hosted on the Exploit Database.

## Install

```
$ git clone https://github.com/Dy1aNT/Exploit-DB-Spider.git
$ cd Exploit-DB-Spider
```

Install dependencies

```
$ pip install -r requirements.txt
```

## Usage

```
$ python main.py
```

## Result

![Year](/img/Year.png)
![Type](/img/Type.png)
![Platform](/img/Platform.png)
--------------------------------------------------------------------------------
/img/Platform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/img/Platform.png
--------------------------------------------------------------------------------
/img/Trend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/img/Trend.png
--------------------------------------------------------------------------------
/img/Type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/img/Type.png
--------------------------------------------------------------------------------
/img/Year.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Dy1anT/Exploit-DB-Spider/9a332e6a4c1b03b2e81dd442e58666820810cf5b/img/Year.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from scrapy import cmdline


# Crawl exploit metadata and export it to EDB.csv; swap the comments below
# to download the exploit files with the Download spider instead.
cmdline.execute("scrapy crawl EDBSpider -o EDB.csv".split())
#cmdline.execute("scrapy crawl Download".split())
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = EDBSpider.settings

[deploy]
#url = http://localhost:6800/
project = EDBSpider
--------------------------------------------------------------------------------
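Analysis sketch (not part of the project source): the README mentions analysing the crawled data, and `main.py` exports it to `EDB.csv` with the column order set by `FEED_EXPORT_FIELDS`. Assuming pandas and matplotlib are installed and `EDB.csv` is present in the working directory, a per-year count like the one behind the Year/Trend charts could be produced roughly as follows:

```
import pandas as pd
import matplotlib.pyplot as plt

# Columns follow FEED_EXPORT_FIELDS in settings.py:
# id, title, author, date, type, platform
df = pd.read_csv('EDB.csv')

# Dates are scraped as YYYY-MM-DD strings; count exploits per year.
df['year'] = pd.to_datetime(df['date'], errors='coerce').dt.year
per_year = df.groupby('year').size()

per_year.plot(kind='bar', title='Exploits per year')
plt.tight_layout()
plt.show()
```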