├── readme.txt
├── scrapy.cfg
└── sebug
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── sebugvul.py

/readme.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hongxs/scrapy-sebug/b4f0ce167ea0b19f2ca9fb93e07800cdf50c11eb/readme.txt
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = sebug.settings

[deploy]
#url = http://localhost:6800/
project = sebug
--------------------------------------------------------------------------------
/sebug/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hongxs/scrapy-sebug/b4f0ce167ea0b19f2ca9fb93e07800cdf50c11eb/sebug/__init__.py
--------------------------------------------------------------------------------
/sebug/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.item import Field


class SebugItem(scrapy.Item):
    # One field per value written by the pipeline's INSERT statement.
    ssv = Field()          # numeric SSV id of the advisory
    appdir = Field()       # affected application, may be empty
    title = Field()        # advisory title
    content = Field()      # advisory body text
    publishdate = Field()  # publication date, YYYY-MM-DD
--------------------------------------------------------------------------------
/sebug/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from twisted.enterprise import adbapi
from twisted.python import log
import MySQLdb
import MySQLdb.cursors


class SebugPipeline(object):
    def __init__(self):
        # Twisted's adbapi runs the blocking MySQLdb calls in a thread pool,
        # so inserts do not block the crawler's reactor.
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='sebug',
            user='root',
            passwd='xxx',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=False
        )

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._conditional_insert, item)
        # Without an errback, a failed INSERT would be dropped silently.
        d.addErrback(log.err)
        return item

    def _conditional_insert(self, tx, item):
        tx.execute('insert into vulninfo values (%s, %s, %s, %s, %s)',
                   (item['ssv'], item['appdir'], item['title'],
                    item['content'], item['publishdate']))
--------------------------------------------------------------------------------
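The pipeline inserts straight into a vulninfo table, but the repository ships no schema. Below is a minimal one-off sketch of a matching table; the column names and types are assumptions inferred from the SebugItem fields and the five positional values in _conditional_insert, not something taken from the project:

# create_table.py -- hypothetical helper, not part of the repository.
import MySQLdb

conn = MySQLdb.connect(db='sebug', user='root', passwd='xxx', charset='utf8')
cur = conn.cursor()
# Five columns, in the same order as the pipeline's INSERT statement.
cur.execute("""
    CREATE TABLE IF NOT EXISTS vulninfo (
        ssv         VARCHAR(16),   -- SSV id extracted by the spider
        appdir      VARCHAR(255),  -- affected application, may be ''
        title       VARCHAR(255),  -- advisory title
        content     TEXT,          -- advisory body
        publishdate VARCHAR(10)    -- 'YYYY-MM-DD' exactly as scraped
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()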
/sebug/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for sebug project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'sebug'

SPIDER_MODULES = ['sebug.spiders']
NEWSPIDER_MODULE = 'sebug.spiders'

# Dict form (Scrapy >= 0.20); the value is the pipeline's run order.
ITEM_PIPELINES = {'sebug.pipelines.SebugPipeline': 300}

# Throttle requests to be gentle on the target site.
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = False
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'sebug (+http://www.yourdomain.com)'
--------------------------------------------------------------------------------
/sebug/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/sebug/spiders/sebugvul.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from sebug.items import SebugItem


class SebugvulSpider(CrawlSpider):
    name = "sebugvul"
    allowed_domains = ["sebug.net"]
    start_urls = (
        'http://sebug.net/vuldb/vulnerabilities?start=1',
    )

    rules = [
        # Advisory detail pages: scrape them into items.
        Rule(SgmlLinkExtractor(allow=('/vuldb/ssvid-(\d{1,6})$',)),
             callback='parse_vul'),
        # Paginated listing pages: follow them to discover more advisories.
        Rule(SgmlLinkExtractor(allow=('/vuldb/vulnerabilities\?start=(\d{1,5})$',)),
             follow=True),
    ]

    def parse_vul(self, response):
        hxs = HtmlXPathSelector(response)
        item = SebugItem()
        item['title'] = hxs.select('//h2[@class="article_title"]/text()').extract()[0]
        item['ssv'] = hxs.select('//div[@class="vuln"]/a/text()').re('\d{1,6}')[0]
        # The same anchor holds the application name, when there is one.
        appdirtemp = hxs.select('//div[@class="vuln"]/a/text()').re('.+\D$')
        item['appdir'] = appdirtemp[0] if appdirtemp else ""
        item['publishdate'] = hxs.select('//div[@class="vuln"]/text()').re('\d{4}-\d{1,2}-\d{1,2}')[0]
        item['content'] = hxs.select('//div[@class="article_exp"]/pre/text()').extract()[0]
        return item
--------------------------------------------------------------------------------
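To try the crawl (assuming a Scrapy version that still ships the scrapy.contrib modules, plus the MySQL-python bindings): create the sebug database and a vulninfo table such as the sketch above, put the real MySQL password in pipelines.py, then run `scrapy crawl sebugvul` from the directory containing scrapy.cfg. With DOWNLOAD_DELAY = 2 and randomized delays the crawl is deliberately slow, so expect it to take a while over the full vulnerability listing.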