├── uefispider ├── __init__.py ├── spiders │ ├── __init__.py │ ├── asrock_spider.py │ ├── asus_spider.py │ ├── msi_spider.py │ ├── intel_spider.py │ ├── gigabyte_spider.py │ ├── dell_spider.py │ ├── lenovo_spider.py │ └── hp_spider.py ├── settings.py ├── pipelines.py └── items.py ├── scrapy.cfg ├── .gitignore ├── README.rst └── LICENSE /uefispider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = uefispider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = uefispider 12 | -------------------------------------------------------------------------------- /uefispider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | 6 | from scrapy.spider import Spider 7 | import os 8 | 9 | class UefiSpider(Spider): 10 | name = 'UefiSpider' 11 | 12 | def __init__(self, dump= 'output'): 13 | self.output = dump 14 | if self.output[0] != '/': 15 | self.output = os.path.join(os.getcwd(), self.output) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Testing resources 2 | output 3 | 4 | *.py[cod] 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Packages 10 | *.egg 11 | *.egg-info 12 | dist 13 | build 14 | eggs 15 | parts 16 | bin 17 | var 18 | sdist 19 | develop-eggs 20 | .installed.cfg 21 | lib 22 | lib64 23 | __pycache__ 24 | 25 | # Installer logs 26 | pip-log.txt 27 | 28 | # Unit test / coverage reports 29 | .coverage 30 | .tox 31 | nosetests.xml 32 | 33 | # Translations 34 | *.mo 35 | 36 | # Mr Developer 37 | .mr.developer.cfg 38 | .project 39 | .pydevproject 40 | -------------------------------------------------------------------------------- /uefispider/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for uefispider project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'uefispider' 10 | 11 | SPIDER_MODULES = ['uefispider.spiders'] 12 | NEWSPIDER_MODULE = 'uefispider.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 15 | USER_AGENT = 'uefispider (+https://github.com/theopolis/uefi-spider)' 16 | 17 | ITEM_PIPELINES = { 18 | 'uefispider.pipelines.UefispiderPipeline': 1 19 | } 20 | 21 | COOKIES_DEBUG = True -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | UEFI Spider 2 | =========== 3 | The UEFI Spider is a set of HIGHLY specific scripts containing spidering logic for 4 | ISV/OEMs providing downloadable UEFI firmware updates. 
Each spider will attempt to document (in JSON) and download every identified UEFI firmware update. 5 | 6 | **WARNING:** Using this tool is dangerous, upon running each spider you will have downloaded well over 50G of firmware updates. This is highly taxing on both your bandwidth and the services hosting the updates. Please read the EULA for each site before spidering. This code is provided for reference only; this project and its authors do not encourage using the spiders. 7 | 8 | Installation 9 | ------------ 10 | **Requirements** 11 | :: 12 | 13 | $ apt-get install libxml2-dev libxslt1-dev python-dev 14 | $ pip install scrapy 15 | 16 | Usage 17 | ----- 18 | :: 19 | 20 | $ scrapy crawl -a dump=/path/to/spider/output DellSpider 21 | 22 | **Supported Vendors** 23 | 24 | - ASRock 25 | - Dell 26 | - Gigabyte 27 | - Intel 28 | - Lenovo 29 | - HP 30 | - MSI 31 | - VMware 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Teddy Reed 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /uefispider/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import json 7 | import os 8 | 9 | from uefispider.items import * 10 | 11 | class UefispiderPipeline(object): 12 | def process_item(self, item, spider): 13 | spider_name = spider.name 14 | item_id = item["item_id"] 15 | 16 | print spider.output, spider_name, item_id 17 | output_dir = os.path.join(spider.output, spider_name, item_id) 18 | 19 | binary = item["binary"] if "binary" in dict(item) else "" 20 | item["binary"] = "" 21 | 22 | binary_name = "uefi.bin" 23 | if "binary_name" in dict(item): 24 | binary_name = item["binary_name"] 25 | 26 | try: 27 | os.makedirs(output_dir) 28 | except Exception, e: 29 | print "Cannot make directories (%s). (%s)" % (output_dir, str(e)) 30 | 31 | try: 32 | if type(item) is not BinaryItem: 33 | ### Only write JSON if this is not a binary-only item. 
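### The "binary" field was blanked above, so the JSON written here holds only
### metadata; the firmware itself is written separately below as
### <dump>/<spider.name>/<item_id>/<binary_name> (default "uefi.bin"),
### alongside this details.json.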
34 | data = json.dumps(dict(item)) 35 | with open(os.path.join(output_dir, "details.json"), "w") as fh: 36 | fh.write(data) 37 | 38 | if len(binary) > 0: 39 | ### An item may only include meta data. 40 | with open(os.path.join(output_dir, binary_name), "wb") as fh: 41 | fh.write(binary) 42 | except Exception, e: 43 | print "Cannot write data (%s). (%s)" % (output_dir, str(e)) 44 | 45 | #return item 46 | -------------------------------------------------------------------------------- /uefispider/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class UefispiderItem(Item): 9 | item_id = Field() 10 | binary = Field() 11 | binary_name = Field() 12 | pass 13 | 14 | class BinaryItem(UefispiderItem): 15 | ### This item will only write a binary object. 16 | binary_name = Field() 17 | pass 18 | 19 | class GigabyteLinkItem(UefispiderItem): 20 | driver_type = Field() 21 | url = Field() 22 | name = Field() 23 | 24 | class GigabyteUpdateItem(UefispiderItem): 25 | version = Field() 26 | date = Field() 27 | desc = Field() 28 | bios_url = Field() 29 | attrs = Field() 30 | 31 | class LenovoUpdateItem(UefispiderItem): 32 | version = Field() 33 | date = Field() 34 | desc = Field() 35 | bios_url = Field() 36 | url = Field() 37 | products = Field() 38 | notes_url = Field() 39 | 40 | class AsrockLinkItem(UefispiderItem): 41 | chipset = Field() 42 | product = Field() 43 | url = Field() 44 | 45 | class AsrockUpdateItem(UefispiderItem): 46 | version = Field() 47 | date = Field() 48 | desc = Field() 49 | bios_type = Field() 50 | bios_url = Field() 51 | attrs = Field() 52 | 53 | class MsiUpdateLinkItem(UefispiderItem): 54 | url = Field() 55 | title = Field() 56 | id = Field() 57 | 58 | class MsiUpdatePageItem(UefispiderItem): 59 | desc = Field() 60 | driver_type = Field() 61 | bios_url = Field() 62 | version = Field() 63 | date = Field() 64 | attrs = Field() 65 | 66 | class HPBiosUpdateLinkItem(UefispiderItem): 67 | url = Field() 68 | date = Field() 69 | name = Field() 70 | 71 | class HPBiosUpdatePageItem(UefispiderItem): 72 | bios_url = Field() 73 | notes_url = Field() 74 | version = Field() 75 | download_name = Field() 76 | attrs = Field() 77 | 78 | ### From revision history 79 | previous_versions = Field() 80 | 81 | ### From a textual-update 82 | importance = Field() 83 | compatibility = Field() 84 | ssm = Field() # remote update 85 | desc = Field() 86 | fixes = Field() 87 | 88 | class IntelBiosUpdateLinkItem(UefispiderItem): 89 | url = Field() 90 | name = Field() 91 | date = Field() 92 | version = Field() 93 | desc = Field() 94 | status = Field() 95 | 96 | class IntelBiosUpdatePageItem(UefispiderItem): 97 | bios_url = Field() 98 | notes_url = Field() 99 | products = Field() 100 | attrs = Field() # attributes from LinkItem 101 | 102 | class DellBiosResultsItem(Item): 103 | total = Field() 104 | 105 | class DellBiosUpdateLinkItem(UefispiderItem): 106 | url = Field() 107 | release_date = Field() 108 | driver_type = Field() 109 | compatibility = Field() 110 | desc = Field() 111 | 112 | class DellBiosUpdatePageItem(UefispiderItem): 113 | bios_urls = Field() 114 | file_names = Field() 115 | notes_url = Field() 116 | previous_versions = Field() 117 | importance = Field() 118 | version = Field() 119 | fixes = Field() 120 | attrs = Field() # attributes from LinkItem 121 | 
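### A minimal sketch of how these items are used (field values below are
### hypothetical, not taken from any vendor site): whatever the spider, the
### object finally yielded to UefispiderPipeline must carry an item_id, and,
### when a firmware file was fetched, the raw bytes in "binary" plus a
### "binary_name" to write them under.
#
#   item = UefispiderItem()
#   item["item_id"] = "example-update-id"       # per-update output directory name
#   item["binary_name"] = "example-update.bin"  # written next to details.json
#   item["binary"] = response.body              # raw firmware bytes
#   yield item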
-------------------------------------------------------------------------------- /uefispider/spiders/asrock_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | 6 | from uefispider.items import * 7 | 8 | import json 9 | import re 10 | import copy 11 | 12 | class AsrockSpider(UefiSpider): 13 | name = 'AsrockSpider' 14 | allowed_domains = [ 15 | "asrock.com", 16 | "66.226.78.22" 17 | ] 18 | 19 | start_urls = [ 20 | "http://www.asrock.com/support/download.asp?c=All" 21 | ] 22 | 23 | def parse(self, response): 24 | sel = Selector(response) 25 | 26 | machines = [] 27 | rows = sel.css("tr") 28 | for row in rows: 29 | bgcolor = row.xpath("@bgcolor") 30 | if not bgcolor or len(bgcolor) == 0: 31 | continue 32 | bgcolor = bgcolor.extract()[0] 33 | if bgcolor not in ["white", "#e8e8e8"]: 34 | continue 35 | cells = row.css("td") 36 | chipset = cells[0].xpath(".//text()").extract()[0] 37 | if chipset in ["Chipset"]: 38 | continue 39 | name = cells[1].xpath(".//text()").extract()[0] 40 | link = cells[1].css("a").xpath("@href").extract()[0] 41 | #print chipset, name, link 42 | item = AsrockLinkItem() 43 | item["chipset"] = chipset 44 | item["product"] = name 45 | item["url"] = "http://www.asrock.com%s" % link 46 | 47 | machines.append(item) 48 | 49 | for machine in machines: 50 | yield Request(machine["url"], callback= self.parse_machine, 51 | meta= {"item": machine}) 52 | 53 | def parse_downloads(self, response): 54 | def extract_field(field_sel): 55 | return field_sel.xpath(".//text()").extract()[0] 56 | sel = Selector(response) 57 | 58 | updates = [] 59 | rows = sel.css("tr") 60 | for row in rows: 61 | cells = row.css("td") 62 | if len(cells) != 10: 63 | continue 64 | item = AsrockUpdateItem() 65 | item["version"] = extract_field(cells[0]) 66 | item["date"] = extract_field(cells[1]) 67 | item["bios_type"] = extract_field(cells[2]) 68 | if item["bios_type"] not in ["Instant Flash"]: 69 | continue 70 | item["desc"] = extract_field(cells[4]) 71 | item["bios_url"] = cells[8].css("a").xpath("@href").extract()[0] 72 | item["binary_name"] = item["bios_url"].split("/")[-1] 73 | item["item_id"] = item["binary_name"].replace(".zip", "") 74 | 75 | item["attrs"] = dict(response.meta["item"]) 76 | #print dict(item) 77 | updates.append(item) 78 | 79 | for update in updates: 80 | yield Request(url= update["bios_url"], callback= self.parse_binary, 81 | meta= {"item": update}) 82 | pass 83 | pass 84 | 85 | def parse_machine(self, response): 86 | sel = Selector(response) 87 | 88 | download_link = None 89 | list_items = sel.css("#LeftMenu").css("li") 90 | for item in list_items: 91 | text = item.xpath(".//text()").extract()[0] 92 | if text.find("Download") < 0: 93 | continue 94 | try: 95 | download_link = item.css("a").xpath("@href").extract()[0] 96 | except: 97 | continue 98 | 99 | if download_link is not None: 100 | yield Request(url= "http://www.asrock.com%s&os=BIOS" % download_link, 101 | callback= self.parse_downloads, 102 | meta= {"item": response.meta["item"]}) 103 | pass 104 | 105 | def parse_binary(self, response): 106 | item = response.meta["item"] 107 | item["binary"] = response.body 108 | 109 | yield item 110 | -------------------------------------------------------------------------------- /uefispider/spiders/asus_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from 
uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | ### Need to change useragent 6 | from scrapy.utils.project import get_project_settings 7 | 8 | from uefispider.items import * 9 | 10 | import json 11 | import re 12 | import copy 13 | 14 | def _select_form(index, categories): 15 | ### Start at Repeater{index} and add each category in the tuple. 16 | repeater = [] 17 | for category in categories: 18 | repeater.append("Repeater%d$%s" % (index, category)) 19 | index += 1 20 | form = { 21 | "ScriptManager1": "ScriptManager1|%s$LinkButton1" % "$".join(repeater), 22 | "langNormal": "en", 23 | "hd_l_series": "Series", 24 | "hd_l_model": "Model", 25 | "hd_l_os": "OS", 26 | "hd_select_type": "1", 27 | "__EVENTTARGET": "%s$LinkButton1" % "$".join(repeater), 28 | "__EVENTARGUMENT": "", 29 | "__ASYNCPOST": "true" 30 | } 31 | return form 32 | 33 | class AsusSpider(UefiSpider): 34 | name = 'AsusSpider' 35 | allowed_domains = [ 36 | "asus.com" 37 | ] 38 | 39 | product_types = [ 40 | ("ct100", "ct100"), # Laptops 41 | ("ct101", "ct100"), # Tablets 42 | ("ct102", "ct100"), # Motherboards 43 | ("ct103", "ct100"), # Barebones 44 | ("ct103", "ct101"), # Desktops 45 | ("ct103", "ct102"), # All-in-Ones 46 | ("ct104", "ct100"), # Servers 47 | ] 48 | 49 | start_urls = [ 50 | ### Start at model selector. 51 | "http://support.asus.com/download/options.aspx?SLanguage=en", 52 | ] 53 | 54 | select_urls = [ 55 | "http://support.asus.com/Select/ModelSelect.aspx?SLanguage=en&type=1&KeepThis=true", 56 | ] 57 | 58 | def _get_uas(self): 59 | ### Edit user agent 60 | settings = get_project_settings() 61 | return " ".join([ 62 | settings.get("USER_AGENT"), 63 | ### The ASP.NET application is checking for async-compatible browsers. 64 | "Mozilla/5.0 (Windows NT 6.1; WOW64)" 65 | #"AppleWebKit/537.36 (KHTML, like Gecko)", 66 | #"Chrome/34.0.1847.116", 67 | #"Safari/537.36", 68 | ]) 69 | pass 70 | 71 | def parse(self, response): 72 | 73 | yield Request(url= self.select_urls[0], 74 | headers= {"User-Agent": self._get_uas()}, 75 | #meta= {"cookiejar": "GLOBAL"}, 76 | callback= self.parse_again) 77 | 78 | def parse_again(self, response): 79 | sel = Selector(response) 80 | 81 | hidden_fields = {} 82 | inputs = sel.xpath("//input") 83 | for ele in inputs: 84 | input_type = ele.xpath(".//@type").extract()[0] 85 | value = ele.xpath(".//@value").extract()[0] 86 | name = ele.xpath(".//@name").extract()[0] 87 | if input_type not in ["hidden"]: 88 | continue 89 | hidden_fields[name] = value 90 | 91 | for product_type in self.product_types: 92 | ### Create a POST form and apply a generated ScriptManager 93 | form_data = _select_form(1, product_type) 94 | for field in hidden_fields: 95 | ### Replace static fields with page-generated inputs. 
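### (These hidden inputs are typically ASP.NET state such as __VIEWSTATE and
### __EVENTVALIDATION; the postback is generally rejected unless they are
### echoed back unchanged.)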
96 | form_data[field] = hidden_fields[field] 97 | #print form_data 98 | yield FormRequest(formdata= form_data, method= "POST", 99 | headers= { 100 | "Content-Type": "application/x-www-form-urlencoded", 101 | #"X-MicrosoftAjax": "Delta=true", 102 | "X-Requested-With": "XMLHttpRequest", 103 | "User-Agent": self._get_uas() 104 | }, 105 | url= self.select_urls[0], 106 | #meta= {"cookiejar": "GLOBAL"}, 107 | callback= self.parse_series) 108 | return 109 | 110 | def parse_series(self, response): 111 | sel = Selector(response) 112 | 113 | from scrapy.shell import inspect_response 114 | inspect_response(response) -------------------------------------------------------------------------------- /uefispider/spiders/msi_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | 6 | from uefispider.items import * 7 | 8 | import json 9 | import re 10 | import copy 11 | 12 | json_headers = { 13 | "X-Requested-With": "XMLHttpRequest", 14 | "Accept": "application/json, text/javascript, */*", 15 | } 16 | 17 | class MsiSpider(UefiSpider): 18 | name = 'MsiSpider' 19 | allowed_domains = [ 20 | "msi.com" 21 | ] 22 | 23 | start_urls = [ 24 | "http://us.msi.com/download/pages/list_ajax" 25 | ] 26 | 27 | msi_search_vars = { 28 | "p": "service", 29 | "d": "list", 30 | "c": "download", 31 | "no": "", 32 | "cat": "mb", 33 | "pno": "", 34 | "switch": "ProductSelector", 35 | "sw": "ajax" 36 | } 37 | 38 | def _get_vars(self, no, pno): 39 | search_vars = copy.copy(self.msi_search_vars) 40 | search_vars["no"] = str(no) 41 | search_vars["pno"] = str(pno) 42 | return search_vars 43 | 44 | def parse(self, response): 45 | ### Generate a search for AMD and Intel chips 46 | intel_search = self._get_vars(170, 1) 47 | amd_search = self._get_vars(171, 1) 48 | yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers, 49 | formdata= intel_search, callback= self.parse_search) 50 | yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers, 51 | formdata= amd_search, callback= self.parse_search) 52 | 53 | def parse_search(self, response): 54 | sel = Selector(response) 55 | 56 | ### Parse each sub-product type. 57 | searches = [] 58 | product_selector = sel.css(".mr20").xpath("@no") 59 | if product_selector: 60 | pno = product_selector.extract()[0] 61 | 62 | products = sel.css(".ProdSel-item") 63 | for product in products: 64 | no = product.xpath("@no").extract()[0] 65 | searches.append((no, pno)) 66 | #print searches 67 | 68 | ### Parse the actual products/boards. 
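### A search response can contain both sub-categories (fed back into
### parse_search via another POST) and concrete boards (handed to parse_board).
### The loops below collect each set before yielding requests.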
69 | boards = [] 70 | items = sel.css(".Prod-item") 71 | for item in items: 72 | title = item.xpath("@title").extract()[0] 73 | no = item.xpath("@no").extract()[0] 74 | boards.append((title, no)) 75 | #print boards 76 | 77 | for sub_search in searches: 78 | search_vars = self._get_vars(sub_search[0], sub_search[1]) 79 | yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers, 80 | formdata= search_vars, callback= self.parse_search) 81 | 82 | for board in boards: 83 | url = "http://us.msi.com/product/mb/%s.html" % board[0] 84 | item = MsiUpdateLinkItem() 85 | item["id"] = board[1] 86 | item["title"] = board[0] 87 | item["url"] = url 88 | 89 | yield Request(url= "%s#/?div=BIOS" % url, callback= self.parse_board, 90 | meta= {"attrs": item}) 91 | pass 92 | 93 | def parse_board(self, response): 94 | def extract_field(field_sel): 95 | return field_sel.xpath(".//text()").extract()[0] 96 | sel = Selector(response) 97 | 98 | updates = [] 99 | update_sels = sel.css(".div-BIOS").css(".table_gray") 100 | for update in update_sels: 101 | item = MsiUpdatePageItem() 102 | fields = update.css("td") 103 | item["desc"] = extract_field(fields[2]) 104 | item["version"] = extract_field(fields[4]) 105 | item["driver_type"] = extract_field(fields[6]) 106 | item["date"] = extract_field(fields[8]) 107 | try: 108 | item["bios_url"] = fields[10].xpath(".//a/@href").extract()[0] 109 | except Exception, e: 110 | #print response.meta["attrs"]["title"], str(e) 111 | continue 112 | item["binary_name"] = item["bios_url"].split("/")[-1] 113 | item["item_id"] = item["binary_name"].split(".", 1)[0] 114 | item["attrs"] = dict(response.meta["attrs"]) 115 | updates.append(item) 116 | 117 | for update in updates: 118 | yield Request(url= update["bios_url"], callback= self.parse_binary, 119 | meta= {"item": update}) 120 | 121 | def parse_binary(self, response): 122 | item = response.meta["item"] 123 | item["binary"] = response.body 124 | 125 | yield item 126 | 127 | 128 | -------------------------------------------------------------------------------- /uefispider/spiders/intel_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import FormRequest, Request 5 | 6 | from uefispider.items import * 7 | 8 | import json 9 | 10 | #from scrapy.shell import inspect_response 11 | #inspect_response(response) 12 | 13 | class IntelSpider(UefiSpider): 14 | name = 'IntelSpider' 15 | allowed_domains = [ 16 | "downloadcenter.intel.com", 17 | "downloadmirror.intel.com", 18 | "search.intel.com", 19 | ] 20 | 21 | start_urls = [ 22 | "https://downloadcenter.intel.com/Default.aspx?lang=eng", 23 | ] 24 | 25 | def parse(self, response): 26 | url = "https://downloadcenter.intel.com/SearchResult.aspx?lang=eng" 27 | 28 | search_form = { 29 | "search_downloads": ".BIO", 30 | "ctl00$body$submit_search_downloads": "Search downloads", 31 | "ctl00$body$searchKeyword": "BIO" 32 | } 33 | 34 | return [FormRequest(url= url, method= "POST", 35 | formdata= search_form, callback= self.parse_form)] 36 | 37 | def parse_form(self, response): 38 | '''Walking 'to' a form is not required, but just incase act like a human.''' 39 | 40 | ### The form will response with HTML, but data is refreshed with an XMLHTTP request. 
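### Two-step flow: read the result count out of the returned HTML
### (span#num_results), then ask JSONDataProvider.aspx for every hit in a
### single JSON response by setting Hits to that count.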
41 | url = "https://downloadcenter.intel.com/JSONDataProvider.aspx?DownloadType=BIOS&pg=1&sortDir=descending&Hits=%d&keyword=BIO&lang=eng&refresh=filters&dataType=json&type=GET" 42 | 43 | sel = Selector(response) 44 | num_results = sel.css("span#num_results") 45 | if len(num_results) != 1: 46 | print "Error no results found?" 47 | return 48 | 49 | ### Example NNNN matching result(s) 50 | num_results = num_results.css("::text").extract()[0].split(" ")[0] 51 | try: 52 | num_results = int(num_results) 53 | except Exception, e: 54 | print "Cannot format results count as number? (%s)" % str(e) 55 | return 56 | 57 | ### Now send an AJAX request for ALL matching items. 58 | json_data = { 59 | "DownloadType": "BIOS", 60 | "pg": "1", 61 | "sortDir": "descending", 62 | "Hits": "%d" % num_results, 63 | "keyword": "\"BIO\"", 64 | "lang": "eng", 65 | "refresh": "filters", 66 | "dataType": "json", 67 | "type": "GET" 68 | } 69 | 70 | json_headers = { 71 | "X-Requested-With": "XMLHttpRequest", 72 | "Accept": "application/json, text/javascript, */*", 73 | } 74 | 75 | return [FormRequest(url= url % num_results, method= "POST", headers= json_headers, 76 | formdata= json_data, callback= self.parse_json)] 77 | 78 | def parse_json(self, response): 79 | '''A JSON object of the search results.''' 80 | 81 | download_url = "https://downloadcenter.intel.com%s" 82 | 83 | ### The result response SHOULD be JSON. 84 | try: 85 | results = json.loads(response.body) 86 | except Exception, e: 87 | print "Cannot load JSON results. (%s)" % str(e) 88 | return 89 | 90 | items = [] 91 | updates= results["results"] 92 | for update in updates: 93 | item = IntelBiosUpdateLinkItem() 94 | item["item_id"] = update["title"]["downloadid"] 95 | item["url"] = update["title"]["href"] 96 | item["name"] = update["title"]["header"] 97 | item["date"] = update["date"] 98 | item["version"] = update["version"] 99 | item["desc"] = update["title"]["description"] 100 | item["status"] = update["status"] 101 | 102 | yield Request(url= download_url % item["url"], callback= self.parse_download, 103 | meta= {"attrs": item}) 104 | 105 | def parse_download(self, response): 106 | '''The download page (usually) offers multiple download links, we want just the update.''' 107 | 108 | sel = Selector(response) 109 | 110 | link_notes = None 111 | link_bios = None 112 | 113 | links = sel.css('a').xpath('@href').extract() 114 | for link in links: 115 | ### Release notes are cool too, though they are in PDF form. 116 | if link.find("ReleaseNotes") >= 0: 117 | link_notes = link 118 | if link.find(".BIO") >= 0: 119 | link_bios = link 120 | 121 | if link_bios is None: 122 | return 123 | 124 | item = IntelBiosUpdatePageItem() 125 | link_bios = link_bios[link_bios.find("httpDown=")+len("httpDown="): link_bios.find(".BIO")+len(".BIO")] 126 | item['bios_url'] = link_bios 127 | item['notes_url'] = link_notes if link_notes is not None else "" 128 | 129 | ### Supported products is nice too. 
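### The div#prodos block holds the supported-products table; tabs and newlines
### are stripped so each entry is stored as a clean product name.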
130 | products = [] 131 | products_sel = sel.css('div#prodos') 132 | if len(products_sel) > 0: 133 | products_sel = products_sel.xpath(".//table/tr/td/text()").extract() 134 | for product in products_sel: 135 | products.append("".join([c for c in product if c not in ['\t', '\n', '\r']])) 136 | item['products'] = products 137 | item['attrs'] = dict(response.meta['attrs']) 138 | item['item_id'] = item['attrs']['item_id'] 139 | 140 | #yield item 141 | yield Request(url= link_bios, callback= self.parse_binary, 142 | meta= {"item": item}) 143 | pass 144 | 145 | def parse_binary(self, response): 146 | item = response.meta["item"] 147 | item["binary"] = response.body 148 | 149 | yield item 150 | -------------------------------------------------------------------------------- /uefispider/spiders/gigabyte_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | 6 | from uefispider.items import * 7 | from urlparse import urlparse 8 | 9 | import json 10 | import sys 11 | import os 12 | 13 | base_search = "http://www.gigabyte.us:80/support-downloads/category-level_ajax.aspx?%s" 14 | submit_search = "http://www.gigabyte.us/support-downloads/download-center_ajax.aspx?%s" 15 | bios_search = "http://www.gigabyte.us/products/product-page_ajax.aspx?%s" 16 | 17 | def _search_url(ck, lev, val): 18 | return base_search % ("ck=%s&lev=%s&val=%s" % (ck, lev, val)) 19 | 20 | def _submit_url(p, ck, pid): 21 | ### p=1&kw=&ck=2&pid=3752 22 | return submit_search % ("p=%s&kw=&ck=%s&pid=%s" % (str(p), ck, pid)) 23 | 24 | def _bios_url(pid): 25 | #return bios_search % ("t=dl&pid=%s&dlt=%s&cg=%s&ck=%s&h=bios&MDA2=" % ( 26 | # pid, dlt, cg, ck 27 | #)) 28 | return bios_search % ("t=dl&pid=%s&dlt=2" % pid) 29 | pass 30 | 31 | def _url_params(url): 32 | url = urlparse(url) 33 | params = {p.split("=")[0]: p.split("=")[1] for p in url.query.split("&")} 34 | return params 35 | 36 | class GigabyteSpider(UefiSpider): 37 | name = 'GigabyteSpider' 38 | allowed_domains = [ 39 | "gigabyte.us", 40 | ] 41 | 42 | start_urls = [ 43 | ### Motherboards 44 | _search_url(2, 1, 2), 45 | _search_url(101, 1, 101), 46 | _search_url(112, 1, 112), 47 | ### Notebook/Netbook 48 | _search_url(5, 1, 5), 49 | ### Slate PC (tablet) 50 | _search_url(71, 1, 71), 51 | ### Set top boxes 52 | _search_url(131, 1, 131), 53 | _search_url(133, 1, 133), 54 | ### Barebones 55 | _search_url(102, 1, 102), 56 | _search_url(122, 1, 122), 57 | ### NAS 58 | _search_url(132, 1, 132), 59 | ] 60 | 61 | def parse(self, response): 62 | ### Each search returns a JSON response of Rows (classes of products) 63 | try: 64 | json_response = json.loads(response.body) 65 | except Exception, e: 66 | print "Cannot load JSON from category search." 67 | return 68 | 69 | params = _url_params(response.url) 70 | level = params["lev"] if "lev" in params else "0" 71 | 72 | if "ck" not in params: 73 | print "Cannot find CK value in response params?" 74 | return 75 | if "node" not in json_response: 76 | print "Cannot find NODE value in response response?" 77 | 78 | for row in json_response["rows"]: 79 | if row["value"] == "": 80 | continue 81 | ### node=1 indicates a bottom-level search, each row is an item. 
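### node == "0" means the rows are still sub-categories, so the same category
### search is reissued one level deeper; otherwise each row value is a product
### id handed to the download-center search (parse_submit).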
82 | if json_response["node"] == "0": 83 | yield Request(url= _search_url(params["ck"], int(level)+1, row["value"])) 84 | else: 85 | yield Request(url= _submit_url(1, params["ck"], row["value"]), 86 | callback= self.parse_submit) 87 | pass 88 | 89 | def parse_product(self, response): 90 | sel = Selector(response) 91 | 92 | results = sel.css(".tbl_driver") 93 | if not results: 94 | return 95 | 96 | rows = results.css("tr") 97 | for i in xrange(len(rows)-1): 98 | data = rows[i+1].css("td") 99 | ### Most common (no bios) will not include results 100 | if len(data) == 0: 101 | continue 102 | item = GigabyteUpdateItem() 103 | ### DLT=2 may be mapped differently. 104 | try: 105 | item["version"] = data[0].xpath(".//text()").extract()[0] 106 | except Exception, e: 107 | continue 108 | 109 | item["date"] = data[2].xpath(".//text()").extract()[0] 110 | links = data[3].css("a") 111 | ### Links may be malformed. 112 | if len(links) < 3: 113 | continue 114 | item["bios_url"] = data[3].css("a")[2].xpath("@href").extract()[0] 115 | ### Handle a lack-of-desc. 116 | try: 117 | item["desc"] = data[4].xpath(".//text()").extract()[0] 118 | except Exception, e: 119 | item["desc"] = "" 120 | #print item_id, response.url 121 | #print version, date, bios_url, desc 122 | basename = os.path.basename(urlparse(item["bios_url"]).path) 123 | item["item_id"] = os.path.splitext(basename)[0] 124 | item["binary_name"] = basename 125 | item["attrs"] = dict(response.meta["item"]) 126 | 127 | yield Request(url= item["bios_url"], callback= self.parse_binary, 128 | meta= {"item": item}) 129 | pass 130 | 131 | def parse_submit(self, response): 132 | ### After navigating the search menus, parse a list of results. 133 | sel = Selector(response) 134 | 135 | results = sel.css("tr") 136 | for result in results: 137 | item = GigabyteLinkItem() 138 | item["driver_type"] = result.css(".text2").xpath(".//text()").extract()[0] 139 | item["name"] = result.css(".title3").css("a").xpath(".//text()").extract()[0] 140 | item["url"] = result.css(".title3").css("a").xpath("@href").extract()[0] 141 | params = _url_params(item["url"]) 142 | yield Request(url= _bios_url(params["pid"]), 143 | callback= self.parse_product, 144 | meta= {"item": item}) 145 | pass 146 | 147 | def parse_binary(self, response): 148 | item = response.meta["item"] 149 | item["binary"] = response.body 150 | 151 | yield item 152 | 153 | -------------------------------------------------------------------------------- /uefispider/spiders/dell_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request 5 | 6 | from uefispider.items import * 7 | 8 | import json 9 | import re 10 | 11 | #from scrapy.shell import inspect_response 12 | #inspect_response(response) 13 | 14 | class DellSpider(UefiSpider): 15 | name = 'DellSpider' 16 | allowed_domains = [ 17 | "search.dell.com", 18 | "www.dell.com", 19 | "dell.com" 20 | #"downloadmirror.intel.com", 21 | #"search.intel.com", 22 | ] 23 | 24 | dell_search_vars = { 25 | "c": "us", # country 26 | "l": "en", # language 27 | "s": "gen", # search type (home, business, generic) 28 | "cat": "sup", 29 | "k": "BIOS", # input 30 | "rpp": "20", # results per-page? 
does not change 31 | "p": "1", # page index 32 | "subcat": "dyd", 33 | "rf": "all", 34 | "nk": "f", 35 | "sort": "K", 36 | "nf": "catn~BI", 37 | "navla": "catn~BI", 38 | "ira": "False", 39 | "~srd": "False", 40 | "ipsys": "False", 41 | "advsrch": "False", 42 | "~ck": "anav" 43 | } 44 | 45 | filetype_blacklist = ["txt", "sign", "pdf"] 46 | 47 | results_url = "http://search.dell.com/results.aspx?%s" 48 | start_urls = [ 49 | results_url % 50 | ("&".join(["%s=%s" % (k, v) for k, v in dell_search_vars.iteritems()])) 51 | ] 52 | 53 | ### List of crawled item IDs 54 | item_ids = [] 55 | 56 | def _get_item_id(self, url): 57 | driver_id = url.find("driverId=") 58 | item = url[driver_id + len("driverId="):] 59 | if item.find("&") >= 0: 60 | item = item[:item.find("&")] 61 | return item 62 | 63 | def parse(self, response): 64 | sel = Selector(response) 65 | 66 | page_number = int(self.dell_search_vars["p"]) 67 | print "Debug: On Page %d" % page_number 68 | 69 | total_regex = r".* (\d+) Results" 70 | total_results = sel.css(".PaginationCtrlResltTxt") 71 | if len(total_results) < 1: 72 | ### Cannot determine the number of search results 73 | print "Error: cannot determine search results." 74 | return 75 | 76 | total_string = total_results.extract()[0] 77 | total_match = re.search(total_regex, total_string) 78 | if total_match is None: 79 | print "Error: cannot determine search results." 80 | return 81 | 82 | ### It turns out this is just a guestimate by Dell, let's double it?! 83 | total_results = int(total_match.group(1)) #* 2 84 | ### There's 20 results per page, and I cannot change this!? 85 | total_pages = (total_results / 20) + 1 86 | 87 | ### Parse this initial page's results. 88 | for result in self.parse_results(response): 89 | yield result 90 | 91 | for page in xrange(2, total_pages): 92 | self.dell_search_vars["p"] = str(page) 93 | yield Request( 94 | url= self.results_url % 95 | ("&".join(["%s=%s" % (k, v) for k, v in self.dell_search_vars.iteritems()])), 96 | callback= self.parse_results) 97 | pass 98 | 99 | def parse_results(self, response): 100 | ### Parse update results from search page, yield the links to the updates 101 | sel = Selector(response) 102 | drivers = sel.css("div.driver_container") 103 | if len(drivers) == 0: 104 | ### No items on this page 105 | print "Debug: reached the end." 106 | return 107 | 108 | result_items = [] 109 | for driver in drivers: 110 | result_item = DellBiosUpdateLinkItem() 111 | compatibility = driver.css("input.hdnCompProduct").xpath("@value") 112 | if len(compatibility) != 0: 113 | ### Compatibility tells us the model and thus mainboard/config. 114 | systems = compatibility.extract()[0].strip().split("#") 115 | result_item["compatibility"] = [] 116 | for system in systems: 117 | if system.find("DesktopLatitudeOptiplexPrecisionVostro") >= 0: 118 | result_item["compatibility"].append("XPS Notebook R720") 119 | else: 120 | result_item["compatibility"].append(system.strip()) 121 | 122 | url = driver.css("input.hdnDriverURL").xpath("@value") 123 | if len(url) == 0: 124 | print "ERROR: No URL for update?" 125 | continue 126 | ### Driver type is saved as a sanity check. 127 | result_item["url"] = url.extract()[0] 128 | details = driver.css("div.driver_detail::text").extract() 129 | result_item["driver_type"] = details[0][2:] 130 | ### Release date only includes the date, the previous versions include a timestamp. 
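### details[] comes from div.driver_detail: index 0 is the driver type, index 1
### the release date; the [2:] slice drops what appears to be a two-character
### label prefix in the markup.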
131 | result_item["release_date"] = details[1][2:] 132 | result_items.append(result_item) 133 | 134 | for item in result_items: 135 | item_id = self._get_item_id(item["url"]) 136 | ### Do not attempt duplicate update parsing. 137 | if item_id in self.item_ids: 138 | continue 139 | yield Request(url= item["url"], meta= {"result_item": item}, callback= self.parse_update) 140 | pass 141 | 142 | def parse_update(self, response): 143 | sel = Selector(response) 144 | 145 | ### There may be multiple downloads, the link is held in a javascript call. 146 | notes_link = "" 147 | driver_links = sel.css("#GetDriver").xpath("@href") 148 | if len(driver_links) == 0: 149 | raise Exception("Debug: No driver links found.") 150 | try: 151 | driver_links = [link.split(",")[1].strip("' ") for link in driver_links.extract()] 152 | except Exception, e: 153 | raise Exception("Error: cannot extract links. (%s)" % str(e)) 154 | 155 | ### Save the release link separately (if it exists). 156 | for link in driver_links: 157 | if link.find("Release") >= 0 and link[-3:] == "txt": 158 | notes_link = link 159 | 160 | driver_names = sel.css("p.DriverDetails_FileFormat_Names::text") 161 | if len(driver_names) == 0: 162 | raise Exception("Debug: No driver names found.") 163 | driver_names = driver_names.extract() 164 | 165 | ### Update version provided in header. 166 | version = sel.css("a#dellVendorVersionToolTipId::text").extract()[0] 167 | 168 | ### There is inconsistency in naming previous versions, which may include spaces and commas. 169 | previous_versions = [] 170 | pversions = sel.css("a#Versions") 171 | for pversion in pversions: 172 | version_name = "".join([c for c in pversion.xpath("text()").extract()[0] if c not in [",", " "]]) 173 | version_link = "http://www.dell.com/%s" % pversion.xpath("@href").extract()[0].split("&", 1)[0] 174 | version_date = pversion.xpath("../../following-sibling::td/text()").extract()[0].strip() 175 | version_id = version_link[version_link.find("driverId=") + len("driverId="):] 176 | previous_versions.append((version_name, version_link, version_date, version_id)) 177 | 178 | #print previous_versions 179 | importance = "Unknown" 180 | fixes = "" 181 | 182 | ### Parse optional importance label and fixes/enhancement content. 
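### The page renders these as expandable <h3> sections; they are matched by
### header text ("Level of Importance", "Fixes", "Compatibility") and only the
### importance value and the fixes body are actually kept.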
183 | expands = sel.xpath("//h3") 184 | for expand in expands: 185 | expand_text = expand.css("::text").extract()[0] 186 | if expand_text.find("Level of Importance") == 0: 187 | importance = expand_text[expand_text.find(":")+1:] 188 | if expand_text.find("Fixes") == 0: 189 | try: 190 | expand_body = expand.xpath(".//following-sibling::div")[0].\ 191 | css(".DriverDetails_RowData::text").extract() 192 | fixes = expand_body[0] 193 | except Exception, e: 194 | print e 195 | pass 196 | if expand_text.find("Compatibility") == 0: 197 | system_set = expand.xpath("./following-sibling") 198 | 199 | item = DellBiosUpdatePageItem() 200 | item["notes_url"] = notes_link 201 | item["bios_urls"] = [l for l in driver_links if l.split(".")[-1] not in self.filetype_blacklist] 202 | item["file_names"] = [n for n in driver_names if n.split(".")[-1] not in self.filetype_blacklist] 203 | item["previous_versions"] = previous_versions 204 | item["version"] = version 205 | item["importance"] = importance 206 | item["fixes"] = fixes 207 | #item["attrs"] = dict(response.meta["result_item"]) 208 | 209 | link_item = response.meta["result_item"] 210 | 211 | ### Try to get date (again) 212 | details = sel.css(".DriverDetails_Table_ItemLabel") 213 | if len(details) > 0: 214 | date = details[0].xpath("./following-sibling::td/text()").extract() 215 | if len(date) > 0: 216 | link_item["release_date"] = date[0].strip() 217 | item["attrs"] = dict(link_item) 218 | 219 | ### Set the item ID as the driver/update link ID. 220 | item["item_id"] = self._get_item_id(item["attrs"]["url"]) 221 | self.item_ids.append(item["item_id"]) 222 | 223 | for i in xrange(len(item["bios_urls"])): 224 | if item["bios_urls"][i].split(".")[-1].lower() != "exe": 225 | continue 226 | ### Download each file associated 227 | yield Request(url= item["bios_urls"][i], callback= self.parse_binary, 228 | meta= {"name": item["file_names"][i], "item_id": item["item_id"]}) 229 | ### For now, only download the first exe. 
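### (The other download URLs remain in item["bios_urls"], so they still end up
### in details.json even though only the first .exe is fetched.)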
230 | break 231 | 232 | ### Crawl the update versions (may be duplicates) for this system 233 | for update in previous_versions: 234 | update_item = DellBiosUpdateLinkItem() 235 | update_item["url"] = update[1] 236 | update_item["release_date"] = update[2] 237 | update_item["compatibility"] = link_item["compatibility"] 238 | #update_item["desc"] = link_item["desc"] 239 | yield Request(url= update[1], meta= {"result_item": update_item}, 240 | callback= self.parse_update) 241 | 242 | yield item 243 | pass 244 | 245 | def parse_binary(self, response): 246 | item = BinaryItem() 247 | item["binary"] = response.body 248 | item["binary_name"] = response.meta["name"] 249 | item["item_id"] = response.meta["item_id"] 250 | 251 | yield item 252 | 253 | -------------------------------------------------------------------------------- /uefispider/spiders/lenovo_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | from uefi_firmware.utils import red, blue 6 | 7 | from uefispider.items import * 8 | 9 | import json 10 | import re 11 | import copy 12 | import os 13 | 14 | lenovo_component = "1343112652574" 15 | product_search = "http://support.lenovo.com/en_US/downloads/default/%s.ajax?%s" 16 | product_select = "http://support.lenovo.com/en_US/downloads/default.page?%s" 17 | download_select = "http://download.lenovo.com/lenovo/content/ddfm/%s-%s-%s.html" 18 | 19 | ''' 20 | Usage: 21 | scrapy crawl -a dump=/tmp/spiders LenovoSpider 22 | 23 | Requirements: 24 | innoextract, 7-zip, cabextract, unrar; are all helpful. 25 | http://constexpr.org/innoextract/files/innoextract-1.4.tar.gz 26 | 27 | XML structure: 28 | Properties-> 29 | Data-> 30 | Result-> 31 | ProductSelectorResults-> 32 | Options-> 33 | [Option, value="id"].//text()=name 34 | ''' 35 | 36 | def _search_url(tree): 37 | if len(tree) > 3: 38 | return None 39 | select_types = ["getSeries", "getSubseries", "getMachineTypes"] 40 | selection = "-".join(tree) 41 | if len(tree) == 5: 42 | selection += "+" * 4 43 | else: 44 | selection += "-" * (5-len(tree)) 45 | return product_search % ( 46 | lenovo_component, 47 | "method=%s&productSelection=%s" % (select_types[len(tree)-1], selection) 48 | ) 49 | 50 | def _select_url(tree): 51 | if len(tree) > 3: 52 | return None 53 | query = { 54 | "submit": "true", 55 | "componentID": lenovo_component, 56 | "iwPreActions": "SetProduct", 57 | "prodId": "-".join(tree) + "--", 58 | "os": "" 59 | } 60 | ### This will set cookies and redirect, similar to HP. 61 | return product_select % "&".join(["%s=%s" % (k, v) for k, v in query.iteritems()]) 62 | 63 | def _download_url(series, subseries, product): 64 | return download_select % (series, subseries, product) 65 | 66 | class LenovoSpider(UefiSpider): 67 | name = 'LenovoSpider' 68 | allowed_domains = [ 69 | "lenovo.com", 70 | ] 71 | 72 | series_list = [ 73 | "P014", # Laptops & Tablets 74 | #"P013", # Desktop & All-In-Ones 75 | #"P022", # Workstations 76 | #"P023", # Servers 77 | ] 78 | 79 | start_urls = [ 80 | "http://support.lenovo.com/en_US/downloads/default.page" 81 | ] 82 | 83 | ### Hold a list of products/documents which are processed serially. 
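### doc_ids keeps the DocIDs already queued for crawling; many product pages
### link to the same BIOS update document, so each document is requested only once.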
84 | #products = {} 85 | doc_ids = [] 86 | 87 | def _get_results(self, response): 88 | sel = Selector(response) 89 | 90 | results = [] 91 | options = sel.css("Properties").xpath("./Data/Result/ProductSelectorResults/Options/option") 92 | for option in options: 93 | value = option.xpath("./@value").extract()[0] 94 | name = option.xpath("./text()").extract()[0] 95 | results.append((value, name)) 96 | return results 97 | 98 | def parse(self, response): 99 | for series in self.series_list: 100 | yield Request(url= _search_url([series]), callback= self.parse_series, 101 | meta= {"series": series, "dont_merge_cookies": True}) 102 | 103 | 104 | def parse_series(self, response): 105 | results = self._get_results(response) 106 | series = response.meta["series"] 107 | 108 | ### Now we have a set of subseries IDs. 109 | for result in results: 110 | yield Request(url= _search_url([series, result[0]]), 111 | #yield Request(url= _search_url([series, "S006"]), 112 | callback= self.parse_subseries, 113 | meta= {"series": series, "subseries": result[0]}) 114 | 115 | def parse_subseries(self, response): 116 | results = self._get_results(response) 117 | series = response.meta["series"] 118 | subseries = response.meta["subseries"] 119 | 120 | for result in results: 121 | yield Request(url = _download_url(series, subseries, result[0]), 122 | #yield Request(url= _download_url(series, "S006", "SS2500"), 123 | callback= self.parse_product, 124 | meta= {"cookiejar": result[1], "item_details": result}, 125 | dont_filter= True) 126 | 127 | def parse_product(self, response): 128 | def is_bios_update(name): 129 | #valid_names = ["BIOS Update"] 130 | valid_names = ["BIOS Update Utility"] 131 | ### The "utility" documents provide historic information. 132 | for valid in valid_names: 133 | if name.find(valid) >= 0: 134 | return True 135 | return False 136 | 137 | sel = Selector(response) 138 | 139 | ### There's a lot of information on this page, but the update document 140 | ### repeats this information and includes historic data. 141 | updates = [] 142 | rows = sel.css("#BIOS").css("#table1").xpath(".//tr")[1:] 143 | for row in rows: 144 | cells = row.xpath(".//td") 145 | name = cells[0].xpath("./text()").extract()[0] 146 | if not is_bios_update(name): 147 | ### This is not the droid we're looking for 148 | continue 149 | links = cells[0].xpath(".//a/@href").extract() 150 | updates.append(links[0]) 151 | 152 | for update in updates: 153 | doc_id = update.split("DocID=")[1] 154 | ### Begin critical section 155 | if doc_id in self.doc_ids: 156 | continue 157 | self.doc_ids.append(doc_id) 158 | ### End critical section 159 | yield Request(url= update, 160 | callback= self.parse_document, 161 | meta= {"item_details": response.meta["item_details"], "doc_id": doc_id}) 162 | 163 | pass 164 | 165 | def parse_document(self, response): 166 | sel = Selector(response) 167 | 168 | systems = None 169 | changes = None 170 | uefi = False 171 | packages = None 172 | 173 | ### Extract information for the current release from the downloads table. 174 | ### This information is NOT repeated in the version table below. 175 | downloads = sel.css(".downloadTable").xpath(".//tbody/tr") 176 | if len(downloads) == 0: 177 | ### Sometimes there is no downloads table!? 
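### Such documents are skipped entirely rather than guessing at the current
### release's download URL and date.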
178 | ### Todo: http://support.lenovo.com/en_US/downloads/detail.page?DocID=DS024464 179 | print red("Error: no downloads table found.") 180 | return 181 | 182 | binary_url = downloads[0].xpath(".//td")[0].xpath(".//a/@href").extract()[0] 183 | notes_url = downloads[1].xpath(".//td")[0].xpath(".//a/@href").extract()[0] 184 | date = downloads[0].xpath(".//td")[3].xpath("./text()").extract()[0] 185 | 186 | ### This is ugly! 187 | tables = sel.css(".v14-header-1") 188 | for table in tables: 189 | if len(table.xpath("./text()")) == 0: 190 | ### This may be a server without parsed HTML. 191 | continue 192 | table_name = table.xpath("./text()").extract()[0] 193 | if table_name.find("Supported") > -1 and table_name.find("Operating") == -1: 194 | ### Could be "Systems"/"System" 195 | systems = table 196 | if table_name.find("Summary of Changes") > -1: 197 | changes = table 198 | if table_name.find("UEFI") > -1: 199 | uefi = True 200 | if table_name.find("Package") > -1: 201 | packages = table 202 | 203 | if not uefi: 204 | ### Documents might be missing UEFI instructions. 205 | uefi = (response.body.find("UEFI") >= 0) 206 | 207 | print blue(response.meta["doc_id"]) 208 | print response.url 209 | print "date:", date 210 | print "UEFI:", "True" if uefi else red("False") 211 | print "True" if systems is not None else red("False"), 212 | print "True" if packages is not None else red("False") 213 | 214 | ### Todo: server pages are not found as UEFI 215 | ### http://support.lenovo.com/en_US/downloads/detail.page?DocID=DS032912 216 | 217 | if not uefi: 218 | ### This is not a UEFI update. 219 | return 220 | 221 | ### There are cases where the package table is not found. 222 | if packages is None: 223 | headers = sel.css("th") 224 | for header in headers: 225 | table_name = header.xpath("./text()").extract()[0] 226 | if table_name.find("Package") >= 0: 227 | packages = header 228 | break 229 | 230 | if packages is None: 231 | ### Could not recover. 
232 | print red("Error: no package list found.") 233 | return 234 | 235 | systems_list = [] 236 | if systems is None: 237 | ### Might not have a correctly formatted table for systems support 238 | ### http://support.lenovo.com/en_US/downloads/detail.page?DocID=DS013468 239 | cells = sel.xpath("//td/text()").extract() 240 | scanning_supported = False 241 | for cell in cells: 242 | if cell.find("Supported") >= 0 and cell.find("Operating") == -1: 243 | scanning_supported = True 244 | if scanning_supported: 245 | if cell[0] != "-": 246 | scanning_supported = False 247 | break 248 | systems_list.append(cell[1:].strip()) 249 | pass 250 | else: 251 | systems = systems.xpath("../../../following-sibling::ul")[0].xpath(".//li") 252 | for system in systems: 253 | systems_list.append(system.xpath("./text()").extract()[0]) 254 | 255 | if len(systems_list) == 0: 256 | print red("Error: no systems found") 257 | return 258 | 259 | update_list = [] 260 | updates = packages.xpath("../../..//tr")[1:] 261 | for i, update in enumerate(updates): 262 | cells = update.xpath(".//td") 263 | if len(cells) < 3: 264 | ### Might have a row-span (DocID=DS035105) 265 | continue 266 | ### This format will be X.XX (NAME) 267 | version = cells[1].xpath(".//text()").extract()[0].split("(") 268 | version = "%s (%s" % (version[0].strip(), version[1].strip()) 269 | release = cells[-1] 270 | #print version, release 271 | if i == 0: 272 | update_list.append((version, binary_url, notes_url)) 273 | continue 274 | urls = release.xpath(".//a/@href").extract() 275 | if len(urls) < 2: 276 | ### Could be unreleased (Not release to the web) (DocID=DS029726) 277 | urls = ["unknown", "unknown"] 278 | update_list.append((version, urls[0], urls[1])) 279 | 280 | meta = { 281 | "systems": systems_list, 282 | "updates": update_list, 283 | "date": date, 284 | "url": response.url 285 | } 286 | 287 | yield Request(url= notes_url, callback= self.parse_notes, meta= meta) 288 | 289 | def parse_notes(self, response): 290 | ### This is a text-only document containing the versions and release notes. 291 | text = response.body.split("\r\n") 292 | 293 | dates_list = [] 294 | release_notes = [] 295 | 296 | document = response.meta 297 | 298 | line_num = 0 299 | scanning_changes = False 300 | scanning_version = None 301 | version_notes = [] 302 | 303 | while line_num < len(text): 304 | line = text[line_num] 305 | line_num += 1 306 | 307 | ### Scan for "Package (ID)", next line is a set of delims, then updates until blank-line 308 | if line.find("Package") >= 0 and line.find("Issue Date") >= 0: 309 | line_num += 1 310 | for i in xrange(len(document["updates"])): 311 | line = text[line_num] 312 | line_num += 1 313 | if len(line) == 0: 314 | ### Problem! 315 | print red("Warning: no lines left while scanning updates.") 316 | break 317 | version_info = line.split(" ") 318 | dates_list.append(version_info[-1]) 319 | continue 320 | 321 | ### While scan for "<" as first character 322 | ### Version , add lines until blank-line 323 | if line.find("Summary of Changes") >= 0: 324 | scanning_changes = True 325 | if scanning_changes: 326 | if len(line) == 0 and line_num <= len(text) and len(text[line_num]) == 0: 327 | ### Double return, break 328 | scanning_changes = False 329 | continue 330 | if scanning_version and len(line) == 0: 331 | ### Append and reset version notes. 
332 | scanning_version = False 333 | release_notes.append(version_notes) 334 | version_notes = [] 335 | continue 336 | if scanning_version: 337 | version_notes.append(line.strip()) 338 | continue 339 | if len(line) > 0 and line[0] == "<": 340 | scanning_version = True 341 | continue 342 | pass 343 | 344 | ### Finally download the binaries 345 | for i, update in enumerate(document["updates"]): 346 | item = LenovoUpdateItem() 347 | item["url"] = document["url"] 348 | item["products"] = document["systems"] 349 | item["version"] = update[0] 350 | item["bios_url"] = update[1] 351 | item["notes_url"] = update[2] 352 | item["date"] = dates_list[i] if i < len(dates_list) else "unknown" 353 | item["desc"] = release_notes[i] if i < len(release_notes) else "unknown" 354 | item["item_id"] = update[0] 355 | 356 | if item["bios_url"] == "unknown": 357 | print red("Warning: BIOS url unknown") 358 | yield item 359 | else: 360 | if item["bios_url"][0:len("http://")] != "http://": 361 | ### Might be missing... 362 | ### download.lenovo.com/ibmdl/pub/pc/pccbbs/mobiles/h6uj03ww.exe 363 | item["bios_url"] = "http://" + item["bios_url"] 364 | yield Request(url= item["bios_url"], 365 | callback= self.parse_binary, 366 | meta= {"item": item}) 367 | pass 368 | 369 | def parse_binary(self, response): 370 | item = response.meta["item"] 371 | item["binary"] = response.body 372 | item["binary_name"] = os.path.basename(response.url) 373 | 374 | yield item 375 | pass 376 | -------------------------------------------------------------------------------- /uefispider/spiders/hp_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | from scrapy.http.cookies import CookieJar 6 | 7 | from uefispider.items import * 8 | 9 | import json 10 | import re 11 | import urllib 12 | import sys 13 | 14 | #from scrapy.shell import inspect_response 15 | #inspect_response(response) 16 | 17 | class HPSpider(UefiSpider): 18 | name = 'HPSpider' 19 | allowed_domains = [ 20 | "www2.hp.com", 21 | "hp.com" 22 | ] 23 | 24 | cookies = { 25 | "HP_SPF_HOST": "h20566.www2.hp.com", 26 | "HP_SPF_LOCALE": "en-US", 27 | "HP_SPF_SITE": "hpsc", 28 | } 29 | 30 | download_url = "http://ftp.hp.com/pub/softpaq/sp%d-%d/%s" 31 | 32 | start_urls = [ 33 | "http://h20566.www2.hp.com/portal/site/hpsc/template.PAGE/public/kb/search/" 34 | ] 35 | 36 | crawled_items = {} 37 | ### Store all of the crawled search results 38 | 39 | def _get_download_link(self, filename, sp_number= None): 40 | ### An update file name may include a distinct "SP" number. 41 | if sp_number is None: 42 | sp_number = filename 43 | 44 | update_id = sp_number[2:sp_number.find(".")] 45 | try: 46 | update_id = int(update_id) 47 | url = self.download_url % (1 + (update_id/500)*500, (update_id/500)*500 + 500, filename) 48 | except Exception, e: 49 | ### Cannot parse the filename, was an sp_number provided? 
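### (When parsing succeeds, the sp number is bucketed into ranges of 500;
### e.g. a hypothetical "sp60123.exe" would map to
### http://ftp.hp.com/pub/softpaq/sp60001-60500/sp60123.exe.)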
50 | url = None 51 | return url 52 | 53 | def _get_update_id(self, update_link): 54 | index = update_link.find("swItem%253D") + len("swItem%253D") 55 | end_tok = [update_link.find("&", index), update_link.find("%25", index)] 56 | if end_tok[0] >= 0 and end_tok[1] >= 0: 57 | end_tok = min(end_tok[0], end_tok[1]) 58 | elif end_tok[0] >= 0: end_tok = end_tok[0] 59 | else: end_tok = end_tok[1] 60 | update_id = update_link[index: end_tok] 61 | return update_id 62 | pass 63 | 64 | def _write_results(self): 65 | print "Debug: Finished reading search results, writing." 66 | with open("hp-output.json", 'w') as fh: 67 | fh.write(json.dumps([dict(item) for i, item in self.crawled_items.iteritems()])) 68 | return 69 | 70 | def parse(self, response): 71 | ### The initial hit of the search page, generate all additional searches, accept the cookies and begin. 72 | months = range(0, 12) # Month is 0-counted 73 | years = range(2006, 2014+1) # Years is actual-counted 74 | 75 | monthly_searches = [] 76 | for year in years: 77 | for month in months: 78 | end_year = year if month != 11 else year+1 79 | end_month = month+1 if month != 11 else 0 80 | #print (month, year, end_month, end_year) 81 | monthly_searches.append((year, month, end_year, end_month)) 82 | 83 | response.meta["searches"] = monthly_searches 84 | return self.parse_begin(response) 85 | 86 | def parse_begin(self, response): 87 | ### Hit the page we were redirected to with the cookies set. 88 | 89 | return Request(url = response.url, cookies= self.cookies, 90 | meta= {"searches": response.meta["searches"]}, 91 | callback= self.parse_accept) 92 | 93 | def parse_accept(self, response): 94 | ### At the search form, begin to generate monthly searches, alert if >100 results. 95 | sel = Selector(response) 96 | 97 | ### This will select the REAL url (with appended query string "tokens"). 98 | url_path = "" 99 | forms = sel.xpath("//form") 100 | for form in forms: 101 | form_ids = form.xpath("@id").extract() 102 | if len(form_ids) == 0: 103 | continue 104 | if form_ids[0] == "refineSearchForm": 105 | url_path = form.xpath("@action").extract()[0] 106 | 107 | ### The search load-balances 108 | domain = response.url[len("http://"):response.url.find(".")] 109 | 110 | url = "http://%s.www2.hp.com/%s" 111 | form_data = { 112 | "didYouMean": "", 113 | "searchCrit": "allwords", 114 | "docType":"Drivers", 115 | #"docType":"Patch", 116 | "dateRange":"all", 117 | "dateSearchType":"dateRange", 118 | "startDateYear": None, 119 | "startDateMonth": None, 120 | "startDateDay": "1", 121 | "endDateYear": None, 122 | "endDateMonth": None, 123 | "endDateDay":"1", 124 | "resPerPage":"100", 125 | "sortCrit":"date", 126 | "showSummary":"yesX", 127 | "calledBy":"Search_Main", 128 | "mode":"text", 129 | "searchString":"BIOS Update", 130 | "searchRes":"Search", 131 | "advSearchFlag":"true", 132 | } 133 | 134 | ### Pull off the remaining searchs, and fill in vars for the 'next' search. 
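### Each request consumes the head of the searches list and forwards the tail
### in meta, so the monthly windows are crawled one after another instead of
### being issued all at once.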
        remaining_searches = response.meta["searches"]

        form_data["startDateYear"] = str(remaining_searches[0][0])
        form_data["startDateMonth"] = str(remaining_searches[0][1])
        form_data["endDateYear"] = str(remaining_searches[0][2])
        form_data["endDateMonth"] = str(remaining_searches[0][3])

        return FormRequest(url= url % (domain, url_path) + "&month=%d&year=%d" % (remaining_searches[0][1], remaining_searches[0][0]),
                           headers= {"Content-Type": "application/x-www-form-urlencoded"},
                           formdata= form_data, method= "POST", cookies= self.cookies,
                           meta= {"searches": remaining_searches[1:], "this": (form_data["startDateYear"], form_data["startDateMonth"], form_data["endDateYear"], form_data["endDateMonth"])},
                           dont_filter= True,
                           callback= self.parse_search)
        pass

    def parse_search(self, response):
        ### The search results
        sel = Selector(response)

        results = sel.css("table[title='Search Results Index']").xpath(".//tr")[1:]
        if len(results) == 100:
            ### The search will only return 100 results.
            ### If 100 is reached, the search must be repeated with better accuracy.
            #raise Exception("Reached 100 results, day-granularity not implemented.")
            with open('overflow_months.log', 'a+') as fh:
                fh.write('%s %s %s %s\n' % (response.meta["this"][0], response.meta["this"][1], response.meta["this"][2], response.meta["this"][3]))
            print "Reached 100 results, consider day-granularity."

        for result in results:
            download_type = "".join(result.xpath(".//td")[2].xpath(".//text()").extract()).strip()
            if download_type != "BIOS":
                continue

            item = HPBiosUpdateLinkItem()
            item["url"] = "".join(result.xpath(".//td")[1].xpath(".//a/@href").extract()).strip()
            item["name"] = "".join(result.xpath(".//td")[1].xpath(".//a//text()").extract())
            item["date"] = "".join(result.xpath(".//td")[3].xpath(".//text()").extract())

            item["item_id"] = self._get_update_id(item["url"])

            if item["item_id"] in self.crawled_items:
                #raise Exception("Found duplicate: (%s, %s, %s)" % (item["item_id"], item["name"], item["date"]))
                print "Found duplicate: (%s, %s, %s)" % (item["item_id"], item["name"], item["date"])
                continue
            ### Store the item in the object-global item stash.
            self.crawled_items[item["item_id"]] = item

        remaining_search_count = len(response.meta["searches"])
        if remaining_search_count > 0:
            ### There are more searches, repeat.
            yield Request(url= self.start_urls[0], #+ "?%d" % remaining_search_count,
                          meta= {"searches": response.meta["searches"]},
                          dont_filter= True,
                          callback= self.parse_accept)
            return

        ### Debugging, make this an argument/option later
        self._write_results()

        ### The searches are complete, parse responses.
        for item_id, item in self.crawled_items.iteritems():
            #callback = self.parse_me_update if item["name"].find("ME Firmware Update") >= 0 else self.parse_update
            callback = self.parse_update
            yield Request(url= item["url"], callback= callback, meta= {"result_item": item})

    def parse_update(self, response):
        ### The update (download) page for the BIOS.
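        ### Flow from here: read the version and SoftPaq file name from the summary
        ### table, then follow the "revision history" tab if present, otherwise the
        ### "release notes" tab, otherwise request the binary directly.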
        sel = Selector(response)

        fields = sel.css("table.m10").xpath(".//tr/td")
        version = fields[1].xpath(".//text()").extract()[0]
        version = version[:version.find("(")].strip()

        name = fields[3].xpath(".//text()").extract()[0]
        name = name[:name.find("(")].strip()

        ### Try to parse the "name" as an "SP" number
        download_link = self._get_download_link(name)

        item = HPBiosUpdatePageItem()
        item["bios_url"] = download_link
        item["version"] = version
        item["binary_name"] = name
        item["attrs"] = dict(response.meta["result_item"])
        item["item_id"] = item["attrs"]["item_id"]

        ### Updates have their description in different tabs
        tab_names = {}
        tabs = sel.css("tr#device-nav").xpath(".//td")
        for i, tab in enumerate(tabs):
            tab_name = " ".join(tab.xpath(".//text()").extract()).lower()
            tab_link = "".join(tab.xpath(".//a/@href").extract())
            tab_names[tab_name] = tab_link

        ### Set the release notes URL; this may be optional.
        item["notes_url"] = tab_names["release notes"] if "release notes" in tab_names else None

        if "revision history" in tab_names:
            ### A version history is optional; this will parse the release notes afterward.
            return Request(url= tab_names["revision history"], callback= self.parse_versions,
                           meta= {"page_item": item})

        if item["notes_url"] is not None:
            return Request(url= item["notes_url"], callback= self.parse_notes, meta= {"page_item": item})

        ### We are finished, sadly, without much meta-information
        if download_link is None:
            raise Exception("Cannot parse notes and bad download (%s)." % item["binary_name"])
        return Request(url= download_link, callback= self.parse_binary, meta= {"page_item": item})

        pass

    def parse_versions(self, response):
        ### Parse an optional version history
        sel = Selector(response)
        item = response.meta["page_item"]

        previous_versions = []
        versions = sel.css("div#tabContent").css("a.udrline")
        for pversion in versions:
            version_link = "".join(pversion.xpath("@href").extract()).strip()
            version_text = "".join(pversion.xpath(".//text()").extract()).strip()
            version_text = version_text[version_text.find(":")+1:]

            version_id = self._get_update_id(version_link)
            ### Stored as an array to match the Dell spider's (JSON) format.
            previous_versions.append([version_text, version_link, version_id])

        item["previous_versions"] = previous_versions

        ### Must now parse notes!
        if item["notes_url"] is not None:
            return Request(url= item["notes_url"], callback= self.parse_notes, meta= {"page_item": item})

        if item["bios_url"] is None:
            raise Exception("Cannot parse notes (after versions) and bad download (%s)." % item["binary_name"])
        return Request(url= item["bios_url"], callback= self.parse_binary, meta= {"page_item": item})

        pass

    def parse_notes(self, response):
        ### Parse a potentially optional release notes section (url).
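        ### Two layouts are handled: notes rendered with font .heading/.body
        ### elements are delegated to parse_advanced_notes below, while plain-text
        ### dumps are split into sections using a "SECTION: value" regex.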
        sel = Selector(response)
        item = response.meta["page_item"]

        if len(sel.css("div#tabContent").xpath(".//font").css(".heading")) > 0:
            return self.parse_advanced_notes(response)

        ### This content is a textual dump
        sections = {}
        content = [line.strip() for line in sel.css("div#tabContent").xpath(".//td//text()").extract()]

        active_section = None
        for line in content:
            ### Find a SECTION: Value, or SECTION: (where the value follows on newlines).
            match = re.search(r"([A-Z\(\) ]+):(.*)", line)
            if match is None:
                if active_section is None:
                    continue
                ### Add this line to the previously-found section.
                sections[active_section].append(line.strip())
            else:
                match = match.groups()
                if len(match[1]) == 0:
                    ### Expect content to follow
                    active_section = match[0]
                    sections[match[0]] = []
                else:
                    active_section = None
                    sections[match[0]] = [match[1].strip()]

        #print sections
        section_fields = [
            ("SSM SUPPORTED", "ssm", True),
            ("DESCRIPTION", "desc", False),
            ("PURPOSE", "importance", True),
            ("HARDWARE PRODUCT MODEL(S)", "compatibility", False),
            ("FIXES", "fixes", False)
        ]

        for section_field in section_fields:
            if section_field[0] in sections:
                item[section_field[1]] = sections[section_field[0]]
                if section_field[2] and type(sections[section_field[0]]) == list:
                    item[section_field[1]] = item[section_field[1]][0]

        ### Finally, download the BIOS
        if item["bios_url"] is None:
            sp_number = sections["SOFTPAQ NUMBER"][0][:7] if "SOFTPAQ NUMBER" in sections else "0"
            download_link = self._get_download_link(item["binary_name"], sp_number= sp_number)
            if download_link is None:
                raise Exception("Cannot create download (%s), (%s)." % (item["binary_name"], sp_number))
            item["bios_url"] = download_link

        return Request(url= item["bios_url"], callback= self.parse_binary, meta= {"page_item": item})
        pass

    def parse_advanced_notes(self, response):
        sel = Selector(response)

        sections = sel.css("div#tabContent").xpath(".//font").css(".heading")
        content = sel.css("div#tabContent").xpath(".//font").css(".body")

        item = response.meta["page_item"]

        sp_section = "SoftPaq"
        sp_number = None
        section_fields = {
            "SSM": "ssm",
            "DESCRIPTION": "desc",
            "PURPOSE": "importance",
            "HARDWARE": "compatibility",
            "FIXES": "fixes"
        }

        for i, section in enumerate(sections):
            section_name = "".join(section.xpath(".//text()").extract())
            section_body = "\n".join([line.strip() for line in content[i].xpath(".//text()").extract() if len(line.strip()) > 0])
            for field, key in section_fields.iteritems():
                if section_name.find(field) == 0:
                    if field == "HARDWARE":
                        section_body = section_body.split("\n")
                    item[key] = section_body
            if section_name.find(sp_section) == 0:
                sp_number = section_body[:7]

            pass

        ### Finally, download the BIOS
        if item["bios_url"] is None:
            download_link = self._get_download_link(item["binary_name"], sp_number= sp_number)
            if download_link is None:
                raise Exception("Cannot create download (%s), (%s)." % (item["binary_name"], sp_number))
            item["bios_url"] = download_link

        return Request(url= item["bios_url"], callback= self.parse_binary, meta= {"page_item": item})
        pass

    def parse_binary(self, response):
        item = response.meta["page_item"]

        if item["binary_name"] == "Obtain\u00a0softwar":
            ### This is an odd handling of this error case: a EULA is required.
            item["binary_name"] = "EULA.html"
        item["binary"] = response.body

        yield item

--------------------------------------------------------------------------------