├── .gitignore
├── LICENSE
├── README.md
└── bulletin_scraper
    ├── bulletin_scraper
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── bulletins.py
    └── scrapy.cfg

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Windows template
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
### IPythonNotebook template
# Temporary data
.ipynb_checkpoints/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2016, NorthBit, Ltd.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MS Bulletin Scraper

This is a scraping tool to download all bulletin `.msu` files, extract the executables and download the relevant symbols.

Be warned: downloading everything eats up a lot of disk space.

## Dependencies

1. [Scrapy](http://scrapy.org/), install using `pip install scrapy`.
1. To download symbols, the script uses `symchk.exe`, so you'll need to install [WinDbg](https://msdn.microsoft.com/en-us/windows/hardware/hh852365.aspx).
1. We also use `expand.exe` to expand the `.msu` files, so you need to run this on Windows 7 or higher.


## Usage

```bash
git clone https://github.com/NorthBit/bulletin-scraper.git bulletin-scraper
cd bulletin-scraper\bulletin_scraper
scrapy crawl bulletins
```


## Configuration

The scraper's configuration is saved in `bulletin_scraper\bulletin_scraper\settings.py`. There are some settings you MUST configure yourself:

1. `FILES_STORE` - the location to which the bulletins will be downloaded. The default location is a `bulletins` directory under the scraper root.
1. `SYMCHK_PATH` - the path to `symchk.exe`.
1. `SYM_PATH` - the symbol path. The default local store is `C:\temp\symbols`.
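
For example, a minimal edit of `settings.py` might look like the sketch below. The setting names are the ones listed above; the concrete paths are placeholders you should replace with your own:

```python
# bulletin_scraper/bulletin_scraper/settings.py (excerpt -- example values only)

# Download bulletins to a drive with plenty of free space.
FILES_STORE = r'D:\bulletins'

# symchk.exe ships with the Debugging Tools for Windows (WinDbg).
SYMCHK_PATH = r'C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\symchk.exe'

# Cache symbols locally and fall back to the public Microsoft symbol server.
SYM_PATH = r'SRV*D:\symbols*https://msdl.microsoft.com/download/symbols'
```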
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NorthBit/bulletin-scraper/98a5d9738d54680824d05bb420986f43c4d72e49/bulletin_scraper/bulletin_scraper/__init__.py
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DownloadLinkItem(scrapy.Item):
    bulletin = scrapy.Field()
    product = scrapy.Field()
    url = scrapy.Field()
    files = scrapy.Field()
    msu_path = scrapy.Field()
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import errno
import os
import subprocess

import scrapy
import scrapy.pipelines.files
from scrapy.exceptions import DropItem

SYMCHK_PATH = r'C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\symchk.exe'
SYM_PATH = r'SRV**https://msdl.microsoft.com/download/symbols'


def expand(source, dest, filter_=None):
    """Extract `source` (an .msu or .cab) into `dest` using Windows' expand.exe."""
    if filter_ is None:
        filter_ = '*'
    subprocess.call(['expand', '-F:{}'.format(filter_), source, dest])


def delete(path):
    """Quietly delete files matching `path` (a wildcard pattern) via cmd's `del`."""
    subprocess.call(['del', '/Q', path], shell=True)


def symchk(path, symchk_path=None, sym_path=None, output_dir=None):
    """Run symchk.exe recursively over `path` to download the matching symbols."""
    if symchk_path is None:
        symchk_path = SYMCHK_PATH
    if sym_path is None:
        sym_path = SYM_PATH

    output_dir_args = []
    if output_dir is not None:
        output_dir = os.path.join(os.getcwd(), output_dir)
        output_dir_args = ['/oc', output_dir]

    subprocess.call([symchk_path, '/r', path, '/s', sym_path] + output_dir_args)


class MsuDownloadPipeline(scrapy.pipelines.files.FilesPipeline):
    def get_media_requests(self, item, info):
        url = item['url']
        if not url.lower().endswith('.msu'):
            raise DropItem('Item not an MSU')
        request = scrapy.Request(url)
        request.meta['bulletin'] = item['bulletin']
        yield request

    def item_completed(self, results, item, info):
        file_paths = (result['path'] for ok, result in results if ok)
        msu_paths = [path for path in file_paths if path.lower().endswith('.msu')]
        if not msu_paths:
            # Guard against failed downloads instead of raising IndexError.
            raise DropItem('MSU download failed for {}'.format(item['url']))
        item['msu_path'] = msu_paths[0]
        return item

    def file_path(self, request, response=None, info=None):
        bulletin = request.meta['bulletin'].upper()
        path = os.path.join(bulletin, request.url.rsplit('/', 1)[-1])
        return path


class MsuExtractPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        instance = cls()
        instance.settings = crawler.settings
        return instance

    def __init__(self):
        super(MsuExtractPipeline, self).__init__()
        self.settings = None

    def process_item(self, item, spider):
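        # Documentation-only summary of the steps below:
        # 1. locate the downloaded .msu under FILES_STORE;
        # 2. create an extraction directory next to it;
        # 3. use expand.exe to pull the inner .cab out of the .msu, then expand
        #    the .cab itself (optionally restricted by EXTRACT_FILTER);
        # 4. optionally delete loose "rubbish" files (DELETE_RUBBISH);
        # 5. fetch symbols with symchk.exe unless DONT_DOWNLOAD_SYMBOLS is set;
        # 6. optionally delete the original .msu (DELETE_MSU_FILES).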
        msu_path = os.path.join(self.settings['FILES_STORE'], item['msu_path'])
        msu_dir = os.path.dirname(msu_path)
        msu_name = item['url'].rsplit('/', 1)[-1].rsplit('.', 1)[0]
        extract_dir = os.path.join(msu_dir, msu_name)
        try:
            os.mkdir(extract_dir)
        except WindowsError as e:
            if e.errno != errno.EEXIST:
                raise

        extract_cab = '{}.cab'.format(msu_name)
        expand(msu_path, extract_dir, extract_cab)
        filter_ = self.settings.get('EXTRACT_FILTER', None)
        expand(os.path.join(extract_dir, extract_cab), extract_dir, filter_=filter_)

        if spider.settings.get('DELETE_RUBBISH', False):
            # Delete all files that are not in directories. This includes a lot of
            # rubbish files, as well as the original `.cab` file.
            delete(os.path.join(extract_dir, '*'))

        if not self.settings.get('DONT_DOWNLOAD_SYMBOLS', False):
            self.download_symbols(extract_dir)

        if spider.settings.get('DELETE_MSU_FILES', False):
            try:
                os.unlink(msu_path)
            except WindowsError:
                pass

        return item

    def download_symbols(self, extract_dir):
        symchk_path = self.settings.get('SYMCHK_PATH', None)
        sym_path = self.settings.get('SYM_PATH', None)
        symchk(extract_dir, symchk_path=symchk_path, sym_path=sym_path)
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for bulletin_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

#############################
#                           #
#    User Configuration     #
#                           #
#############################

# Where to store the downloaded bulletins
FILES_STORE = r'.\bulletins'

# The location of the `symchk` executable
SYMCHK_PATH = r'C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\symchk.exe'

# The symbol path to use. This also determines where the symbols are saved to.
SYM_PATH = r'SRV*C:\temp\symbols*https://msdl.microsoft.com/download/symbols'

# List of products to download, based on the names on the bulletin pages.
PRODUCT_LIST = []

# Set a filter for cab extraction. Note that it only affects the extraction.
EXTRACT_FILTER = None

# Set `DONT_DOWNLOAD_SYMBOLS` to `True` to prevent downloading symbols.
DONT_DOWNLOAD_SYMBOLS = False

# Delete `.msu` files after extraction
DELETE_MSU_FILES = False

# Delete rubbish files (the `.msu` files are full of them!)
DELETE_RUBBISH = True


#############################
#                           #
#     DO NOT CHANGE!!!      #
#                           #
#############################

BOT_NAME = 'bulletin_scraper'

SPIDER_MODULES = ['bulletin_scraper.spiders']
NEWSPIDER_MODULE = 'bulletin_scraper.spiders'


ITEM_PIPELINES = {
    'bulletin_scraper.pipelines.MsuDownloadPipeline': 300,
    'bulletin_scraper.pipelines.MsuExtractPipeline': 500,
}
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/spiders/bulletins.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

from ..items import DownloadLinkItem


class BulletinsSpider(scrapy.Spider):
    name = "bulletins"
    start_urls = ['https://technet.microsoft.com/en-us/library/security/dn631937.aspx']

    def parse(self, response):
        # The bulletin index links to one summary page per year; those anchors
        # carry a four-character numeric `title`.
        for link in response.xpath('//a[string-length(@title)=4]'):
            title = link.xpath('@title').extract_first()
            try:
                int(title)
            except ValueError:
                # We need only the years!
                continue

            url = link.xpath('@href').extract_first()

            yield scrapy.Request(response.urljoin(url), self.parse_bulletin_table)

    def parse_bulletin_table(self, response):
        # Each yearly summary page contains a table with one link per bulletin.
        for url in response.xpath('//td/p/a/@href').extract():
            yield scrapy.Request(response.urljoin(url), self.parse_bulletin_page)

    def parse_bulletin_page(self, response):
        # Bulletin pages link to Download Center pages; only URLs containing
        # 'familyid' are actual download links.
        visited_urls = set()
        for link in response.css('td a'):
            url = link.css('::attr(href)').extract_first()
            if url in visited_urls:
                continue
            visited_urls.add(url)
            if 'familyid' not in url.lower():
                continue
            text = link.css('::text').extract_first()

            if self.settings['PRODUCT_LIST'] and text not in self.settings['PRODUCT_LIST']:
                continue

            request = scrapy.Request(response.urljoin(url), self.resolve_download_page)
            request.meta['bulletin'] = response.url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
            request.meta['product'] = text
            yield request

    def resolve_download_page(self, response):
        # The Download Center 'details' page has a matching 'confirmation' page
        # that exposes the direct file links.
        yield scrapy.Request(response.urljoin(response.url.replace('details.aspx', 'confirmation.aspx')),
                             self.download_updates, meta=response.meta)

    def download_updates(self, response):
        for url in response.css('td.file-link a::attr("href")').extract():
            item = DownloadLinkItem()
            item['url'] = url
            item['product'] = response.meta['product']
            item['bulletin'] = response.meta['bulletin']
            yield item
--------------------------------------------------------------------------------
/bulletin_scraper/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = bulletin_scraper.settings

[deploy]
#url = http://localhost:6800/
project = bulletin_scraper
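
As an optional usage note (not a file in this repository): instead of the `scrapy crawl bulletins` command shown in the README, the crawl can be driven from a small Python script using Scrapy's documented `CrawlerProcess` API. A minimal sketch, assuming it is run from the directory containing `scrapy.cfg`; the file name `run_bulletins.py` and the setting override are illustrative only:

```python
# run_bulletins.py -- hypothetical helper script, not part of the repository.
# Run it from the directory containing scrapy.cfg so that
# get_project_settings() picks up bulletin_scraper.settings.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    settings = get_project_settings()
    # Example per-run override; equivalent to `-s DONT_DOWNLOAD_SYMBOLS=True`.
    settings.set('DONT_DOWNLOAD_SYMBOLS', True)
    process = CrawlerProcess(settings)
    process.crawl('bulletins')  # spider name defined in spiders/bulletins.py
    process.start()  # blocks until the crawl finishes
```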
--------------------------------------------------------------------------------