├── readme.txt
├── scrapy.cfg
└── sebug
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── sebugvul.py

/readme.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hongxs/scrapy-sebug/b4f0ce167ea0b19f2ca9fb93e07800cdf50c11eb/readme.txt
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = sebug.settings

[deploy]
#url = http://localhost:6800/
project = sebug
--------------------------------------------------------------------------------
/sebug/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hongxs/scrapy-sebug/b4f0ce167ea0b19f2ca9fb93e07800cdf50c11eb/sebug/__init__.py
--------------------------------------------------------------------------------
/sebug/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.item import Field


class SebugItem(scrapy.Item):
    # One field per value written by the pipeline's INSERT statement.
    ssv = Field()          # numeric SSV id of the advisory
    appdir = Field()       # affected application, may be empty
    title = Field()        # advisory title
    content = Field()      # advisory body text
    publishdate = Field()  # publication date, YYYY-MM-DD
--------------------------------------------------------------------------------
/sebug/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from twisted.enterprise import adbapi
from twisted.python import log
import MySQLdb
import MySQLdb.cursors


class SebugPipeline(object):
    def __init__(self):
        # Twisted's adbapi runs the blocking MySQLdb calls in a thread pool,
        # so inserts do not block the crawler's reactor.
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='sebug',
            user='root',
            passwd='xxx',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=False
        )

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._conditional_insert, item)
        # Without an errback, a failed INSERT would be dropped silently.
        d.addErrback(log.err)
        return item

    def _conditional_insert(self, tx, item):
        tx.execute('insert into vulninfo values (%s, %s, %s, %s, %s)',
                   (item['ssv'], item['appdir'], item['title'],
                    item['content'], item['publishdate']))
--------------------------------------------------------------------------------
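The pipeline inserts straight into a vulninfo table, but the repository ships no schema. Below is a minimal one-off sketch of a matching table; the column names and types are assumptions inferred from the SebugItem fields and the five positional values in _conditional_insert, not something taken from the project:

# create_table.py -- hypothetical helper, not part of the repository.
import MySQLdb

conn = MySQLdb.connect(db='sebug', user='root', passwd='xxx', charset='utf8')
cur = conn.cursor()
# Five columns, in the same order as the pipeline's INSERT statement.
cur.execute("""
    CREATE TABLE IF NOT EXISTS vulninfo (
        ssv         VARCHAR(16),   -- SSV id extracted by the spider
        appdir      VARCHAR(255),  -- affected application, may be ''
        title       VARCHAR(255),  -- advisory title
        content     TEXT,          -- advisory body
        publishdate VARCHAR(10)    -- 'YYYY-MM-DD' exactly as scraped
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()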
/sebug/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for sebug project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'sebug'

SPIDER_MODULES = ['sebug.spiders']
NEWSPIDER_MODULE = 'sebug.spiders'

# Dict form (Scrapy >= 0.20); the value is the pipeline's run order.
ITEM_PIPELINES = {'sebug.pipelines.SebugPipeline': 300}

# Throttle requests to be gentle on the target site.
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = False
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'sebug (+http://www.yourdomain.com)'
--------------------------------------------------------------------------------
/sebug/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/sebug/spiders/sebugvul.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from sebug.items import SebugItem


class SebugvulSpider(CrawlSpider):
    name = "sebugvul"
    allowed_domains = ["sebug.net"]
    start_urls = (
        'http://sebug.net/vuldb/vulnerabilities?start=1',
    )

    rules = [
        # Advisory detail pages: scrape them into items.
        Rule(SgmlLinkExtractor(allow=('/vuldb/ssvid-(\d{1,6})$',)),
             callback='parse_vul'),
        # Paginated listing pages: follow them to discover more advisories.
        Rule(SgmlLinkExtractor(allow=('/vuldb/vulnerabilities\?start=(\d{1,5})$',)),
             follow=True),
    ]

    def parse_vul(self, response):
        hxs = HtmlXPathSelector(response)
        item = SebugItem()
        item['title'] = hxs.select('//h2[@class="article_title"]/text()').extract()[0]
        item['ssv'] = hxs.select('//div[@class="vuln"]/a/text()').re('\d{1,6}')[0]
        # The same anchor holds the application name, when there is one.
        appdirtemp = hxs.select('//div[@class="vuln"]/a/text()').re('.+\D$')
        item['appdir'] = appdirtemp[0] if appdirtemp else ""
        item['publishdate'] = hxs.select('//div[@class="vuln"]/text()').re('\d{4}-\d{1,2}-\d{1,2}')[0]
        item['content'] = hxs.select('//div[@class="article_exp"]/pre/text()').extract()[0]
        return item
--------------------------------------------------------------------------------
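To try the crawl (assuming a Scrapy version that still ships the scrapy.contrib modules, plus the MySQL-python bindings): create the sebug database and a vulninfo table such as the sketch above, put the real MySQL password in pipelines.py, then run `scrapy crawl sebugvul` from the directory containing scrapy.cfg. With DOWNLOAD_DELAY = 2 and randomized delays the crawl is deliberately slow, so expect it to take a while over the full vulnerability listing.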