├── .gitignore
├── LICENSE
├── README.md
└── bulletin_scraper
    ├── bulletin_scraper
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── bulletins.py
    └── scrapy.cfg

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Windows template
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
### VirtualEnv template
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
### IPythonNotebook template
# Temporary data
.ipynb_checkpoints/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2016, NorthBit, Ltd.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MS Bulletin Scraper

This is a scraping tool to download all bulletin `.msu` files, extract the executables and download the relevant symbols.

Be warned: downloading everything eats up a lot of disk space.

## Dependencies

1. [Scrapy](http://scrapy.org/), install using `pip install scrapy`.
1. To download symbols, the script uses `symchk.exe`, so you'll need to install [WinDbg](https://msdn.microsoft.com/en-us/windows/hardware/hh852365.aspx).
1. We also use `expand.exe` to expand the `.msu` files, so you need to run this on Windows 7 or higher.


## Usage

```bash
git clone https://github.com/NorthBit/bulletin-scraper.git bulletin-scraper
cd bulletin-scraper\bulletin_scraper
scrapy crawl bulletins
```


## Configuration

The scraper's configuration is saved in `bulletin_scraper\bulletin_scraper\settings.py`. There are some settings you MUST configure yourself:

1. `FILES_STORE` - the location to which the bulletins will be downloaded. The default location is a `bulletins` directory under the scraper root.
1. `SYMCHK_PATH` - the path to `symchk.exe`.
1. `SYM_PATH` - the symbol path. The default local store is `C:\temp\symbols`.
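
For example, a minimal edit of `settings.py` might look like the sketch below. The setting names are the ones listed above; the concrete paths are placeholders you should replace with your own:

```python
# bulletin_scraper/bulletin_scraper/settings.py (excerpt -- example values only)

# Download bulletins to a drive with plenty of free space.
FILES_STORE = r'D:\bulletins'

# symchk.exe ships with the Debugging Tools for Windows (WinDbg).
SYMCHK_PATH = r'C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\symchk.exe'

# Cache symbols locally and fall back to the public Microsoft symbol server.
SYM_PATH = r'SRV*D:\symbols*https://msdl.microsoft.com/download/symbols'
```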
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NorthBit/bulletin-scraper/98a5d9738d54680824d05bb420986f43c4d72e49/bulletin_scraper/bulletin_scraper/__init__.py
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DownloadLinkItem(scrapy.Item):
    bulletin = scrapy.Field()
    product = scrapy.Field()
    url = scrapy.Field()
    files = scrapy.Field()
    msu_path = scrapy.Field()
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import errno
import os
import subprocess

import scrapy
import scrapy.pipelines.files
from scrapy.exceptions import DropItem

SYMCHK_PATH = r'C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\symchk.exe'
SYM_PATH = r'SRV**https://msdl.microsoft.com/download/symbols'


def expand(source, dest, filter_=None):
    """Extract `source` (an .msu or .cab) into `dest` using Windows' expand.exe."""
    if filter_ is None:
        filter_ = '*'
    subprocess.call(['expand', '-F:{}'.format(filter_), source, dest])


def delete(path):
    """Quietly delete files matching `path` (a wildcard pattern) via cmd's `del`."""
    subprocess.call(['del', '/Q', path], shell=True)


def symchk(path, symchk_path=None, sym_path=None, output_dir=None):
    """Run symchk.exe recursively over `path` to download the matching symbols."""
    if symchk_path is None:
        symchk_path = SYMCHK_PATH
    if sym_path is None:
        sym_path = SYM_PATH

    output_dir_args = []
    if output_dir is not None:
        output_dir = os.path.join(os.getcwd(), output_dir)
        output_dir_args = ['/oc', output_dir]

    subprocess.call([symchk_path, '/r', path, '/s', sym_path] + output_dir_args)


class MsuDownloadPipeline(scrapy.pipelines.files.FilesPipeline):
    def get_media_requests(self, item, info):
        url = item['url']
        if not url.lower().endswith('.msu'):
            raise DropItem('Item not an MSU')
        request = scrapy.Request(url)
        request.meta['bulletin'] = item['bulletin']
        yield request

    def item_completed(self, results, item, info):
        file_paths = (result['path'] for ok, result in results if ok)
        msu_paths = [path for path in file_paths if path.lower().endswith('.msu')]
        if not msu_paths:
            # Guard against failed downloads instead of raising IndexError.
            raise DropItem('MSU download failed for {}'.format(item['url']))
        item['msu_path'] = msu_paths[0]
        return item

    def file_path(self, request, response=None, info=None):
        bulletin = request.meta['bulletin'].upper()
        path = os.path.join(bulletin, request.url.rsplit('/', 1)[-1])
        return path


class MsuExtractPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        instance = cls()
        instance.settings = crawler.settings
        return instance

    def __init__(self):
        super(MsuExtractPipeline, self).__init__()
        self.settings = None

    def process_item(self, item, spider):
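        # Documentation-only summary of the steps below:
        # 1. locate the downloaded .msu under FILES_STORE;
        # 2. create an extraction directory next to it;
        # 3. use expand.exe to pull the inner .cab out of the .msu, then expand
        #    the .cab itself (optionally restricted by EXTRACT_FILTER);
        # 4. optionally delete loose "rubbish" files (DELETE_RUBBISH);
        # 5. fetch symbols with symchk.exe unless DONT_DOWNLOAD_SYMBOLS is set;
        # 6. optionally delete the original .msu (DELETE_MSU_FILES).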
        msu_path = os.path.join(self.settings['FILES_STORE'], item['msu_path'])
        msu_dir = os.path.dirname(msu_path)
        msu_name = item['url'].rsplit('/', 1)[-1].rsplit('.', 1)[0]
        extract_dir = os.path.join(msu_dir, msu_name)
        try:
            os.mkdir(extract_dir)
        except WindowsError as e:
            if e.errno != errno.EEXIST:
                raise

        extract_cab = '{}.cab'.format(msu_name)
        expand(msu_path, extract_dir, extract_cab)
        filter_ = self.settings.get('EXTRACT_FILTER', None)
        expand(os.path.join(extract_dir, extract_cab), extract_dir, filter_=filter_)

        if spider.settings.get('DELETE_RUBBISH', False):
            # Delete all files that are not in directories. This includes a lot of
            # rubbish files, as well as the original `.cab` file.
            delete(os.path.join(extract_dir, '*'))

        if not self.settings.get('DONT_DOWNLOAD_SYMBOLS', False):
            self.download_symbols(extract_dir)

        if spider.settings.get('DELETE_MSU_FILES', False):
            try:
                os.unlink(msu_path)
            except WindowsError:
                pass

        return item

    def download_symbols(self, extract_dir):
        symchk_path = self.settings.get('SYMCHK_PATH', None)
        sym_path = self.settings.get('SYM_PATH', None)
        symchk(extract_dir, symchk_path=symchk_path, sym_path=sym_path)
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for bulletin_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

#############################
#                           #
#    User Configuration     #
#                           #
#############################

# Where to store the downloaded bulletins
FILES_STORE = r'.\bulletins'

# The location of the `symchk` executable
SYMCHK_PATH = r'C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\symchk.exe'

# The symbol path to use. This also determines where the symbols are saved to.
SYM_PATH = r'SRV*C:\temp\symbols*https://msdl.microsoft.com/download/symbols'

# List of products to download, based on the names on the bulletin pages.
PRODUCT_LIST = []

# Set a filter for cab extraction. Note that it only affects the extraction.
EXTRACT_FILTER = None

# Set `DONT_DOWNLOAD_SYMBOLS` to `True` to prevent downloading symbols.
DONT_DOWNLOAD_SYMBOLS = False

# Delete `.msu` files after extraction
DELETE_MSU_FILES = False

# Delete rubbish files (the `.msu` files are full of them!)
DELETE_RUBBISH = True


#############################
#                           #
#     DO NOT CHANGE!!!      #
#                           #
#############################

BOT_NAME = 'bulletin_scraper'

SPIDER_MODULES = ['bulletin_scraper.spiders']
NEWSPIDER_MODULE = 'bulletin_scraper.spiders'


ITEM_PIPELINES = {
    'bulletin_scraper.pipelines.MsuDownloadPipeline': 300,
    'bulletin_scraper.pipelines.MsuExtractPipeline': 500,
}
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/bulletin_scraper/bulletin_scraper/spiders/bulletins.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

from ..items import DownloadLinkItem


class BulletinsSpider(scrapy.Spider):
    name = "bulletins"
    start_urls = ['https://technet.microsoft.com/en-us/library/security/dn631937.aspx']

    def parse(self, response):
        # The bulletin index links to one summary page per year; those anchors
        # carry a four-character numeric `title`.
        for link in response.xpath('//a[string-length(@title)=4]'):
            title = link.xpath('@title').extract_first()
            try:
                int(title)
            except ValueError:
                # We need only the years!
                continue

            url = link.xpath('@href').extract_first()

            yield scrapy.Request(response.urljoin(url), self.parse_bulletin_table)

    def parse_bulletin_table(self, response):
        # Each yearly summary page contains a table with one link per bulletin.
        for url in response.xpath('//td/p/a/@href').extract():
            yield scrapy.Request(response.urljoin(url), self.parse_bulletin_page)

    def parse_bulletin_page(self, response):
        # Bulletin pages link to Download Center pages; only URLs containing
        # 'familyid' are actual download links.
        visited_urls = set()
        for link in response.css('td a'):
            url = link.css('::attr(href)').extract_first()
            if url in visited_urls:
                continue
            visited_urls.add(url)
            if 'familyid' not in url.lower():
                continue
            text = link.css('::text').extract_first()

            if self.settings['PRODUCT_LIST'] and text not in self.settings['PRODUCT_LIST']:
                continue

            request = scrapy.Request(response.urljoin(url), self.resolve_download_page)
            request.meta['bulletin'] = response.url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
            request.meta['product'] = text
            yield request

    def resolve_download_page(self, response):
        # The Download Center 'details' page has a matching 'confirmation' page
        # that exposes the direct file links.
        yield scrapy.Request(response.urljoin(response.url.replace('details.aspx', 'confirmation.aspx')),
                             self.download_updates, meta=response.meta)

    def download_updates(self, response):
        for url in response.css('td.file-link a::attr("href")').extract():
            item = DownloadLinkItem()
            item['url'] = url
            item['product'] = response.meta['product']
            item['bulletin'] = response.meta['bulletin']
            yield item
--------------------------------------------------------------------------------
/bulletin_scraper/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = bulletin_scraper.settings

[deploy]
#url = http://localhost:6800/
project = bulletin_scraper
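
As an optional usage note (not a file in this repository): instead of the `scrapy crawl bulletins` command shown in the README, the crawl can be driven from a small Python script using Scrapy's documented `CrawlerProcess` API. A minimal sketch, assuming it is run from the directory containing `scrapy.cfg`; the file name `run_bulletins.py` and the setting override are illustrative only:

```python
# run_bulletins.py -- hypothetical helper script, not part of the repository.
# Run it from the directory containing scrapy.cfg so that
# get_project_settings() picks up bulletin_scraper.settings.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    settings = get_project_settings()
    # Example per-run override; equivalent to `-s DONT_DOWNLOAD_SYMBOLS=True`.
    settings.set('DONT_DOWNLOAD_SYMBOLS', True)
    process = CrawlerProcess(settings)
    process.crawl('bulletins')  # spider name defined in spiders/bulletins.py
    process.start()  # blocks until the crawl finishes
```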
--------------------------------------------------------------------------------