├── uefispider ├── __init__.py ├── spiders │ ├── __init__.py │ ├── asrock_spider.py │ ├── asus_spider.py │ ├── msi_spider.py │ ├── intel_spider.py │ ├── gigabyte_spider.py │ ├── dell_spider.py │ ├── lenovo_spider.py │ └── hp_spider.py ├── settings.py ├── pipelines.py └── items.py ├── scrapy.cfg ├── .gitignore ├── README.rst └── LICENSE /uefispider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = uefispider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = uefispider 12 | -------------------------------------------------------------------------------- /uefispider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | 6 | from scrapy.spider import Spider 7 | import os 8 | 9 | class UefiSpider(Spider): 10 | name = 'UefiSpider' 11 | 12 | def __init__(self, dump= 'output'): 13 | self.output = dump 14 | if self.output[0] != '/': 15 | self.output = os.path.join(os.getcwd(), self.output) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Testing resources 2 | output 3 | 4 | *.py[cod] 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Packages 10 | *.egg 11 | *.egg-info 12 | dist 13 | build 14 | eggs 15 | parts 16 | bin 17 | var 18 | sdist 19 | develop-eggs 20 | .installed.cfg 21 | lib 22 | lib64 23 | __pycache__ 24 | 25 | # Installer logs 26 | pip-log.txt 27 | 28 | # Unit test / coverage reports 29 | .coverage 30 | .tox 31 | nosetests.xml 32 | 33 | # Translations 34 | *.mo 35 | 36 | # Mr Developer 37 | .mr.developer.cfg 38 | .project 39 | .pydevproject 40 | -------------------------------------------------------------------------------- /uefispider/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for uefispider project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'uefispider' 10 | 11 | SPIDER_MODULES = ['uefispider.spiders'] 12 | NEWSPIDER_MODULE = 'uefispider.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 15 | USER_AGENT = 'uefispider (+https://github.com/theopolis/uefi-spider)' 16 | 17 | ITEM_PIPELINES = { 18 | 'uefispider.pipelines.UefispiderPipeline': 1 19 | } 20 | 21 | COOKIES_DEBUG = True -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | UEFI Spider 2 | =========== 3 | The UEFI Spider is a set of HIGHLY specific scripts containing spidering logic for 4 | ISV/OEMs providing downloadable UEFI firmware updates. 
Each spider will attempt to document (in JSON) and download every identified UEFI firmware update. 5 | 6 | **WARNING:** Using this tool is dangerous, upon running each spider you will have downloaded well over 50G of firmware updates. This is highly taxing on both your bandwidth and the services hosting the updates. Please read the EULA for each site before spidering. This code is provided for reference only; this project and its authors do not encourage using the spiders. 7 | 8 | Installation 9 | ------------ 10 | **Requirements** 11 | :: 12 | 13 | $ apt-get install libxml2-dev libxslt1-dev python-dev 14 | $ pip install scrapy 15 | 16 | Usage 17 | ----- 18 | :: 19 | 20 | $ scrapy crawl -a dump=/path/to/spider/output DellSpider 21 | 22 | **Supported Vendors** 23 | 24 | - ASRock 25 | - Dell 26 | - Gigabyte 27 | - Intel 28 | - Lenovo 29 | - HP 30 | - MSI 31 | - VMware 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Teddy Reed 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /uefispider/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import json 7 | import os 8 | 9 | from uefispider.items import * 10 | 11 | class UefispiderPipeline(object): 12 | def process_item(self, item, spider): 13 | spider_name = spider.name 14 | item_id = item["item_id"] 15 | 16 | print spider.output, spider_name, item_id 17 | output_dir = os.path.join(spider.output, spider_name, item_id) 18 | 19 | binary = item["binary"] if "binary" in dict(item) else "" 20 | item["binary"] = "" 21 | 22 | binary_name = "uefi.bin" 23 | if "binary_name" in dict(item): 24 | binary_name = item["binary_name"] 25 | 26 | try: 27 | os.makedirs(output_dir) 28 | except Exception, e: 29 | print "Cannot make directories (%s). (%s)" % (output_dir, str(e)) 30 | 31 | try: 32 | if type(item) is not BinaryItem: 33 | ### Only write JSON if this is not a binary-only item. 
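### The "binary" field was blanked above, so the JSON written here holds only
### metadata; the firmware itself is written separately below as
### <dump>/<spider.name>/<item_id>/<binary_name> (default "uefi.bin"),
### alongside this details.json.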
34 | data = json.dumps(dict(item)) 35 | with open(os.path.join(output_dir, "details.json"), "w") as fh: 36 | fh.write(data) 37 | 38 | if len(binary) > 0: 39 | ### An item may only include meta data. 40 | with open(os.path.join(output_dir, binary_name), "wb") as fh: 41 | fh.write(binary) 42 | except Exception, e: 43 | print "Cannot write data (%s). (%s)" % (output_dir, str(e)) 44 | 45 | #return item 46 | -------------------------------------------------------------------------------- /uefispider/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class UefispiderItem(Item): 9 | item_id = Field() 10 | binary = Field() 11 | binary_name = Field() 12 | pass 13 | 14 | class BinaryItem(UefispiderItem): 15 | ### This item will only write a binary object. 16 | binary_name = Field() 17 | pass 18 | 19 | class GigabyteLinkItem(UefispiderItem): 20 | driver_type = Field() 21 | url = Field() 22 | name = Field() 23 | 24 | class GigabyteUpdateItem(UefispiderItem): 25 | version = Field() 26 | date = Field() 27 | desc = Field() 28 | bios_url = Field() 29 | attrs = Field() 30 | 31 | class LenovoUpdateItem(UefispiderItem): 32 | version = Field() 33 | date = Field() 34 | desc = Field() 35 | bios_url = Field() 36 | url = Field() 37 | products = Field() 38 | notes_url = Field() 39 | 40 | class AsrockLinkItem(UefispiderItem): 41 | chipset = Field() 42 | product = Field() 43 | url = Field() 44 | 45 | class AsrockUpdateItem(UefispiderItem): 46 | version = Field() 47 | date = Field() 48 | desc = Field() 49 | bios_type = Field() 50 | bios_url = Field() 51 | attrs = Field() 52 | 53 | class MsiUpdateLinkItem(UefispiderItem): 54 | url = Field() 55 | title = Field() 56 | id = Field() 57 | 58 | class MsiUpdatePageItem(UefispiderItem): 59 | desc = Field() 60 | driver_type = Field() 61 | bios_url = Field() 62 | version = Field() 63 | date = Field() 64 | attrs = Field() 65 | 66 | class HPBiosUpdateLinkItem(UefispiderItem): 67 | url = Field() 68 | date = Field() 69 | name = Field() 70 | 71 | class HPBiosUpdatePageItem(UefispiderItem): 72 | bios_url = Field() 73 | notes_url = Field() 74 | version = Field() 75 | download_name = Field() 76 | attrs = Field() 77 | 78 | ### From revision history 79 | previous_versions = Field() 80 | 81 | ### From a textual-update 82 | importance = Field() 83 | compatibility = Field() 84 | ssm = Field() # remote update 85 | desc = Field() 86 | fixes = Field() 87 | 88 | class IntelBiosUpdateLinkItem(UefispiderItem): 89 | url = Field() 90 | name = Field() 91 | date = Field() 92 | version = Field() 93 | desc = Field() 94 | status = Field() 95 | 96 | class IntelBiosUpdatePageItem(UefispiderItem): 97 | bios_url = Field() 98 | notes_url = Field() 99 | products = Field() 100 | attrs = Field() # attributes from LinkItem 101 | 102 | class DellBiosResultsItem(Item): 103 | total = Field() 104 | 105 | class DellBiosUpdateLinkItem(UefispiderItem): 106 | url = Field() 107 | release_date = Field() 108 | driver_type = Field() 109 | compatibility = Field() 110 | desc = Field() 111 | 112 | class DellBiosUpdatePageItem(UefispiderItem): 113 | bios_urls = Field() 114 | file_names = Field() 115 | notes_url = Field() 116 | previous_versions = Field() 117 | importance = Field() 118 | version = Field() 119 | fixes = Field() 120 | attrs = Field() # attributes from LinkItem 121 | 
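### A minimal sketch of how these items are used (field values below are
### hypothetical, not taken from any vendor site): whatever the spider, the
### object finally yielded to UefispiderPipeline must carry an item_id, and,
### when a firmware file was fetched, the raw bytes in "binary" plus a
### "binary_name" to write them under.
#
#   item = UefispiderItem()
#   item["item_id"] = "example-update-id"       # per-update output directory name
#   item["binary_name"] = "example-update.bin"  # written next to details.json
#   item["binary"] = response.body              # raw firmware bytes
#   yield item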
-------------------------------------------------------------------------------- /uefispider/spiders/asrock_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | 6 | from uefispider.items import * 7 | 8 | import json 9 | import re 10 | import copy 11 | 12 | class AsrockSpider(UefiSpider): 13 | name = 'AsrockSpider' 14 | allowed_domains = [ 15 | "asrock.com", 16 | "66.226.78.22" 17 | ] 18 | 19 | start_urls = [ 20 | "http://www.asrock.com/support/download.asp?c=All" 21 | ] 22 | 23 | def parse(self, response): 24 | sel = Selector(response) 25 | 26 | machines = [] 27 | rows = sel.css("tr") 28 | for row in rows: 29 | bgcolor = row.xpath("@bgcolor") 30 | if not bgcolor or len(bgcolor) == 0: 31 | continue 32 | bgcolor = bgcolor.extract()[0] 33 | if bgcolor not in ["white", "#e8e8e8"]: 34 | continue 35 | cells = row.css("td") 36 | chipset = cells[0].xpath(".//text()").extract()[0] 37 | if chipset in ["Chipset"]: 38 | continue 39 | name = cells[1].xpath(".//text()").extract()[0] 40 | link = cells[1].css("a").xpath("@href").extract()[0] 41 | #print chipset, name, link 42 | item = AsrockLinkItem() 43 | item["chipset"] = chipset 44 | item["product"] = name 45 | item["url"] = "http://www.asrock.com%s" % link 46 | 47 | machines.append(item) 48 | 49 | for machine in machines: 50 | yield Request(machine["url"], callback= self.parse_machine, 51 | meta= {"item": machine}) 52 | 53 | def parse_downloads(self, response): 54 | def extract_field(field_sel): 55 | return field_sel.xpath(".//text()").extract()[0] 56 | sel = Selector(response) 57 | 58 | updates = [] 59 | rows = sel.css("tr") 60 | for row in rows: 61 | cells = row.css("td") 62 | if len(cells) != 10: 63 | continue 64 | item = AsrockUpdateItem() 65 | item["version"] = extract_field(cells[0]) 66 | item["date"] = extract_field(cells[1]) 67 | item["bios_type"] = extract_field(cells[2]) 68 | if item["bios_type"] not in ["Instant Flash"]: 69 | continue 70 | item["desc"] = extract_field(cells[4]) 71 | item["bios_url"] = cells[8].css("a").xpath("@href").extract()[0] 72 | item["binary_name"] = item["bios_url"].split("/")[-1] 73 | item["item_id"] = item["binary_name"].replace(".zip", "") 74 | 75 | item["attrs"] = dict(response.meta["item"]) 76 | #print dict(item) 77 | updates.append(item) 78 | 79 | for update in updates: 80 | yield Request(url= update["bios_url"], callback= self.parse_binary, 81 | meta= {"item": update}) 82 | pass 83 | pass 84 | 85 | def parse_machine(self, response): 86 | sel = Selector(response) 87 | 88 | download_link = None 89 | list_items = sel.css("#LeftMenu").css("li") 90 | for item in list_items: 91 | text = item.xpath(".//text()").extract()[0] 92 | if text.find("Download") < 0: 93 | continue 94 | try: 95 | download_link = item.css("a").xpath("@href").extract()[0] 96 | except: 97 | continue 98 | 99 | if download_link is not None: 100 | yield Request(url= "http://www.asrock.com%s&os=BIOS" % download_link, 101 | callback= self.parse_downloads, 102 | meta= {"item": response.meta["item"]}) 103 | pass 104 | 105 | def parse_binary(self, response): 106 | item = response.meta["item"] 107 | item["binary"] = response.body 108 | 109 | yield item 110 | -------------------------------------------------------------------------------- /uefispider/spiders/asus_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from 
uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | ### Need to change useragent 6 | from scrapy.utils.project import get_project_settings 7 | 8 | from uefispider.items import * 9 | 10 | import json 11 | import re 12 | import copy 13 | 14 | def _select_form(index, categories): 15 | ### Start at Repeater{index} and add each category in the tuple. 16 | repeater = [] 17 | for category in categories: 18 | repeater.append("Repeater%d$%s" % (index, category)) 19 | index += 1 20 | form = { 21 | "ScriptManager1": "ScriptManager1|%s$LinkButton1" % "$".join(repeater), 22 | "langNormal": "en", 23 | "hd_l_series": "Series", 24 | "hd_l_model": "Model", 25 | "hd_l_os": "OS", 26 | "hd_select_type": "1", 27 | "__EVENTTARGET": "%s$LinkButton1" % "$".join(repeater), 28 | "__EVENTARGUMENT": "", 29 | "__ASYNCPOST": "true" 30 | } 31 | return form 32 | 33 | class AsusSpider(UefiSpider): 34 | name = 'AsusSpider' 35 | allowed_domains = [ 36 | "asus.com" 37 | ] 38 | 39 | product_types = [ 40 | ("ct100", "ct100"), # Laptops 41 | ("ct101", "ct100"), # Tablets 42 | ("ct102", "ct100"), # Motherboards 43 | ("ct103", "ct100"), # Barebones 44 | ("ct103", "ct101"), # Desktops 45 | ("ct103", "ct102"), # All-in-Ones 46 | ("ct104", "ct100"), # Servers 47 | ] 48 | 49 | start_urls = [ 50 | ### Start at model selector. 51 | "http://support.asus.com/download/options.aspx?SLanguage=en", 52 | ] 53 | 54 | select_urls = [ 55 | "http://support.asus.com/Select/ModelSelect.aspx?SLanguage=en&type=1&KeepThis=true", 56 | ] 57 | 58 | def _get_uas(self): 59 | ### Edit user agent 60 | settings = get_project_settings() 61 | return " ".join([ 62 | settings.get("USER_AGENT"), 63 | ### The ASP.NET application is checking for async-compatible browsers. 64 | "Mozilla/5.0 (Windows NT 6.1; WOW64)" 65 | #"AppleWebKit/537.36 (KHTML, like Gecko)", 66 | #"Chrome/34.0.1847.116", 67 | #"Safari/537.36", 68 | ]) 69 | pass 70 | 71 | def parse(self, response): 72 | 73 | yield Request(url= self.select_urls[0], 74 | headers= {"User-Agent": self._get_uas()}, 75 | #meta= {"cookiejar": "GLOBAL"}, 76 | callback= self.parse_again) 77 | 78 | def parse_again(self, response): 79 | sel = Selector(response) 80 | 81 | hidden_fields = {} 82 | inputs = sel.xpath("//input") 83 | for ele in inputs: 84 | input_type = ele.xpath(".//@type").extract()[0] 85 | value = ele.xpath(".//@value").extract()[0] 86 | name = ele.xpath(".//@name").extract()[0] 87 | if input_type not in ["hidden"]: 88 | continue 89 | hidden_fields[name] = value 90 | 91 | for product_type in self.product_types: 92 | ### Create a POST form and apply a generated ScriptManager 93 | form_data = _select_form(1, product_type) 94 | for field in hidden_fields: 95 | ### Replace static fields with page-generated inputs. 
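### (These hidden inputs are typically ASP.NET state such as __VIEWSTATE and
### __EVENTVALIDATION; the postback is generally rejected unless they are
### echoed back unchanged.)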
96 | form_data[field] = hidden_fields[field] 97 | #print form_data 98 | yield FormRequest(formdata= form_data, method= "POST", 99 | headers= { 100 | "Content-Type": "application/x-www-form-urlencoded", 101 | #"X-MicrosoftAjax": "Delta=true", 102 | "X-Requested-With": "XMLHttpRequest", 103 | "User-Agent": self._get_uas() 104 | }, 105 | url= self.select_urls[0], 106 | #meta= {"cookiejar": "GLOBAL"}, 107 | callback= self.parse_series) 108 | return 109 | 110 | def parse_series(self, response): 111 | sel = Selector(response) 112 | 113 | from scrapy.shell import inspect_response 114 | inspect_response(response) -------------------------------------------------------------------------------- /uefispider/spiders/msi_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | 6 | from uefispider.items import * 7 | 8 | import json 9 | import re 10 | import copy 11 | 12 | json_headers = { 13 | "X-Requested-With": "XMLHttpRequest", 14 | "Accept": "application/json, text/javascript, */*", 15 | } 16 | 17 | class MsiSpider(UefiSpider): 18 | name = 'MsiSpider' 19 | allowed_domains = [ 20 | "msi.com" 21 | ] 22 | 23 | start_urls = [ 24 | "http://us.msi.com/download/pages/list_ajax" 25 | ] 26 | 27 | msi_search_vars = { 28 | "p": "service", 29 | "d": "list", 30 | "c": "download", 31 | "no": "", 32 | "cat": "mb", 33 | "pno": "", 34 | "switch": "ProductSelector", 35 | "sw": "ajax" 36 | } 37 | 38 | def _get_vars(self, no, pno): 39 | search_vars = copy.copy(self.msi_search_vars) 40 | search_vars["no"] = str(no) 41 | search_vars["pno"] = str(pno) 42 | return search_vars 43 | 44 | def parse(self, response): 45 | ### Generate a search for AMD and Intel chips 46 | intel_search = self._get_vars(170, 1) 47 | amd_search = self._get_vars(171, 1) 48 | yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers, 49 | formdata= intel_search, callback= self.parse_search) 50 | yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers, 51 | formdata= amd_search, callback= self.parse_search) 52 | 53 | def parse_search(self, response): 54 | sel = Selector(response) 55 | 56 | ### Parse each sub-product type. 57 | searches = [] 58 | product_selector = sel.css(".mr20").xpath("@no") 59 | if product_selector: 60 | pno = product_selector.extract()[0] 61 | 62 | products = sel.css(".ProdSel-item") 63 | for product in products: 64 | no = product.xpath("@no").extract()[0] 65 | searches.append((no, pno)) 66 | #print searches 67 | 68 | ### Parse the actual products/boards. 
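### A search response can contain both sub-categories (fed back into
### parse_search via another POST) and concrete boards (handed to parse_board).
### The loops below collect each set before yielding requests.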
69 | boards = [] 70 | items = sel.css(".Prod-item") 71 | for item in items: 72 | title = item.xpath("@title").extract()[0] 73 | no = item.xpath("@no").extract()[0] 74 | boards.append((title, no)) 75 | #print boards 76 | 77 | for sub_search in searches: 78 | search_vars = self._get_vars(sub_search[0], sub_search[1]) 79 | yield FormRequest(url= self.start_urls[0], method= "POST", headers= json_headers, 80 | formdata= search_vars, callback= self.parse_search) 81 | 82 | for board in boards: 83 | url = "http://us.msi.com/product/mb/%s.html" % board[0] 84 | item = MsiUpdateLinkItem() 85 | item["id"] = board[1] 86 | item["title"] = board[0] 87 | item["url"] = url 88 | 89 | yield Request(url= "%s#/?div=BIOS" % url, callback= self.parse_board, 90 | meta= {"attrs": item}) 91 | pass 92 | 93 | def parse_board(self, response): 94 | def extract_field(field_sel): 95 | return field_sel.xpath(".//text()").extract()[0] 96 | sel = Selector(response) 97 | 98 | updates = [] 99 | update_sels = sel.css(".div-BIOS").css(".table_gray") 100 | for update in update_sels: 101 | item = MsiUpdatePageItem() 102 | fields = update.css("td") 103 | item["desc"] = extract_field(fields[2]) 104 | item["version"] = extract_field(fields[4]) 105 | item["driver_type"] = extract_field(fields[6]) 106 | item["date"] = extract_field(fields[8]) 107 | try: 108 | item["bios_url"] = fields[10].xpath(".//a/@href").extract()[0] 109 | except Exception, e: 110 | #print response.meta["attrs"]["title"], str(e) 111 | continue 112 | item["binary_name"] = item["bios_url"].split("/")[-1] 113 | item["item_id"] = item["binary_name"].split(".", 1)[0] 114 | item["attrs"] = dict(response.meta["attrs"]) 115 | updates.append(item) 116 | 117 | for update in updates: 118 | yield Request(url= update["bios_url"], callback= self.parse_binary, 119 | meta= {"item": update}) 120 | 121 | def parse_binary(self, response): 122 | item = response.meta["item"] 123 | item["binary"] = response.body 124 | 125 | yield item 126 | 127 | 128 | -------------------------------------------------------------------------------- /uefispider/spiders/intel_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import FormRequest, Request 5 | 6 | from uefispider.items import * 7 | 8 | import json 9 | 10 | #from scrapy.shell import inspect_response 11 | #inspect_response(response) 12 | 13 | class IntelSpider(UefiSpider): 14 | name = 'IntelSpider' 15 | allowed_domains = [ 16 | "downloadcenter.intel.com", 17 | "downloadmirror.intel.com", 18 | "search.intel.com", 19 | ] 20 | 21 | start_urls = [ 22 | "https://downloadcenter.intel.com/Default.aspx?lang=eng", 23 | ] 24 | 25 | def parse(self, response): 26 | url = "https://downloadcenter.intel.com/SearchResult.aspx?lang=eng" 27 | 28 | search_form = { 29 | "search_downloads": ".BIO", 30 | "ctl00$body$submit_search_downloads": "Search downloads", 31 | "ctl00$body$searchKeyword": "BIO" 32 | } 33 | 34 | return [FormRequest(url= url, method= "POST", 35 | formdata= search_form, callback= self.parse_form)] 36 | 37 | def parse_form(self, response): 38 | '''Walking 'to' a form is not required, but just incase act like a human.''' 39 | 40 | ### The form will response with HTML, but data is refreshed with an XMLHTTP request. 
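### Two-step flow: read the result count out of the returned HTML
### (span#num_results), then ask JSONDataProvider.aspx for every hit in a
### single JSON response by setting Hits to that count.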
41 | url = "https://downloadcenter.intel.com/JSONDataProvider.aspx?DownloadType=BIOS&pg=1&sortDir=descending&Hits=%d&keyword=BIO&lang=eng&refresh=filters&dataType=json&type=GET" 42 | 43 | sel = Selector(response) 44 | num_results = sel.css("span#num_results") 45 | if len(num_results) != 1: 46 | print "Error no results found?" 47 | return 48 | 49 | ### Example NNNN matching result(s) 50 | num_results = num_results.css("::text").extract()[0].split(" ")[0] 51 | try: 52 | num_results = int(num_results) 53 | except Exception, e: 54 | print "Cannot format results count as number? (%s)" % str(e) 55 | return 56 | 57 | ### Now send an AJAX request for ALL matching items. 58 | json_data = { 59 | "DownloadType": "BIOS", 60 | "pg": "1", 61 | "sortDir": "descending", 62 | "Hits": "%d" % num_results, 63 | "keyword": "\"BIO\"", 64 | "lang": "eng", 65 | "refresh": "filters", 66 | "dataType": "json", 67 | "type": "GET" 68 | } 69 | 70 | json_headers = { 71 | "X-Requested-With": "XMLHttpRequest", 72 | "Accept": "application/json, text/javascript, */*", 73 | } 74 | 75 | return [FormRequest(url= url % num_results, method= "POST", headers= json_headers, 76 | formdata= json_data, callback= self.parse_json)] 77 | 78 | def parse_json(self, response): 79 | '''A JSON object of the search results.''' 80 | 81 | download_url = "https://downloadcenter.intel.com%s" 82 | 83 | ### The result response SHOULD be JSON. 84 | try: 85 | results = json.loads(response.body) 86 | except Exception, e: 87 | print "Cannot load JSON results. (%s)" % str(e) 88 | return 89 | 90 | items = [] 91 | updates= results["results"] 92 | for update in updates: 93 | item = IntelBiosUpdateLinkItem() 94 | item["item_id"] = update["title"]["downloadid"] 95 | item["url"] = update["title"]["href"] 96 | item["name"] = update["title"]["header"] 97 | item["date"] = update["date"] 98 | item["version"] = update["version"] 99 | item["desc"] = update["title"]["description"] 100 | item["status"] = update["status"] 101 | 102 | yield Request(url= download_url % item["url"], callback= self.parse_download, 103 | meta= {"attrs": item}) 104 | 105 | def parse_download(self, response): 106 | '''The download page (usually) offers multiple download links, we want just the update.''' 107 | 108 | sel = Selector(response) 109 | 110 | link_notes = None 111 | link_bios = None 112 | 113 | links = sel.css('a').xpath('@href').extract() 114 | for link in links: 115 | ### Release notes are cool too, though they are in PDF form. 116 | if link.find("ReleaseNotes") >= 0: 117 | link_notes = link 118 | if link.find(".BIO") >= 0: 119 | link_bios = link 120 | 121 | if link_bios is None: 122 | return 123 | 124 | item = IntelBiosUpdatePageItem() 125 | link_bios = link_bios[link_bios.find("httpDown=")+len("httpDown="): link_bios.find(".BIO")+len(".BIO")] 126 | item['bios_url'] = link_bios 127 | item['notes_url'] = link_notes if link_notes is not None else "" 128 | 129 | ### Supported products is nice too. 
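### The div#prodos block holds the supported-products table; tabs and newlines
### are stripped so each entry is stored as a clean product name.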
130 | products = [] 131 | products_sel = sel.css('div#prodos') 132 | if len(products_sel) > 0: 133 | products_sel = products_sel.xpath(".//table/tr/td/text()").extract() 134 | for product in products_sel: 135 | products.append("".join([c for c in product if c not in ['\t', '\n', '\r']])) 136 | item['products'] = products 137 | item['attrs'] = dict(response.meta['attrs']) 138 | item['item_id'] = item['attrs']['item_id'] 139 | 140 | #yield item 141 | yield Request(url= link_bios, callback= self.parse_binary, 142 | meta= {"item": item}) 143 | pass 144 | 145 | def parse_binary(self, response): 146 | item = response.meta["item"] 147 | item["binary"] = response.body 148 | 149 | yield item 150 | -------------------------------------------------------------------------------- /uefispider/spiders/gigabyte_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | 6 | from uefispider.items import * 7 | from urlparse import urlparse 8 | 9 | import json 10 | import sys 11 | import os 12 | 13 | base_search = "http://www.gigabyte.us:80/support-downloads/category-level_ajax.aspx?%s" 14 | submit_search = "http://www.gigabyte.us/support-downloads/download-center_ajax.aspx?%s" 15 | bios_search = "http://www.gigabyte.us/products/product-page_ajax.aspx?%s" 16 | 17 | def _search_url(ck, lev, val): 18 | return base_search % ("ck=%s&lev=%s&val=%s" % (ck, lev, val)) 19 | 20 | def _submit_url(p, ck, pid): 21 | ### p=1&kw=&ck=2&pid=3752 22 | return submit_search % ("p=%s&kw=&ck=%s&pid=%s" % (str(p), ck, pid)) 23 | 24 | def _bios_url(pid): 25 | #return bios_search % ("t=dl&pid=%s&dlt=%s&cg=%s&ck=%s&h=bios&MDA2=" % ( 26 | # pid, dlt, cg, ck 27 | #)) 28 | return bios_search % ("t=dl&pid=%s&dlt=2" % pid) 29 | pass 30 | 31 | def _url_params(url): 32 | url = urlparse(url) 33 | params = {p.split("=")[0]: p.split("=")[1] for p in url.query.split("&")} 34 | return params 35 | 36 | class GigabyteSpider(UefiSpider): 37 | name = 'GigabyteSpider' 38 | allowed_domains = [ 39 | "gigabyte.us", 40 | ] 41 | 42 | start_urls = [ 43 | ### Motherboards 44 | _search_url(2, 1, 2), 45 | _search_url(101, 1, 101), 46 | _search_url(112, 1, 112), 47 | ### Notebook/Netbook 48 | _search_url(5, 1, 5), 49 | ### Slate PC (tablet) 50 | _search_url(71, 1, 71), 51 | ### Set top boxes 52 | _search_url(131, 1, 131), 53 | _search_url(133, 1, 133), 54 | ### Barebones 55 | _search_url(102, 1, 102), 56 | _search_url(122, 1, 122), 57 | ### NAS 58 | _search_url(132, 1, 132), 59 | ] 60 | 61 | def parse(self, response): 62 | ### Each search returns a JSON response of Rows (classes of products) 63 | try: 64 | json_response = json.loads(response.body) 65 | except Exception, e: 66 | print "Cannot load JSON from category search." 67 | return 68 | 69 | params = _url_params(response.url) 70 | level = params["lev"] if "lev" in params else "0" 71 | 72 | if "ck" not in params: 73 | print "Cannot find CK value in response params?" 74 | return 75 | if "node" not in json_response: 76 | print "Cannot find NODE value in response response?" 77 | 78 | for row in json_response["rows"]: 79 | if row["value"] == "": 80 | continue 81 | ### node=1 indicates a bottom-level search, each row is an item. 
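### node == "0" means the rows are still sub-categories, so the same category
### search is reissued one level deeper; otherwise each row value is a product
### id handed to the download-center search (parse_submit).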
82 | if json_response["node"] == "0": 83 | yield Request(url= _search_url(params["ck"], int(level)+1, row["value"])) 84 | else: 85 | yield Request(url= _submit_url(1, params["ck"], row["value"]), 86 | callback= self.parse_submit) 87 | pass 88 | 89 | def parse_product(self, response): 90 | sel = Selector(response) 91 | 92 | results = sel.css(".tbl_driver") 93 | if not results: 94 | return 95 | 96 | rows = results.css("tr") 97 | for i in xrange(len(rows)-1): 98 | data = rows[i+1].css("td") 99 | ### Most common (no bios) will not include results 100 | if len(data) == 0: 101 | continue 102 | item = GigabyteUpdateItem() 103 | ### DLT=2 may be mapped differently. 104 | try: 105 | item["version"] = data[0].xpath(".//text()").extract()[0] 106 | except Exception, e: 107 | continue 108 | 109 | item["date"] = data[2].xpath(".//text()").extract()[0] 110 | links = data[3].css("a") 111 | ### Links may be malformed. 112 | if len(links) < 3: 113 | continue 114 | item["bios_url"] = data[3].css("a")[2].xpath("@href").extract()[0] 115 | ### Handle a lack-of-desc. 116 | try: 117 | item["desc"] = data[4].xpath(".//text()").extract()[0] 118 | except Exception, e: 119 | item["desc"] = "" 120 | #print item_id, response.url 121 | #print version, date, bios_url, desc 122 | basename = os.path.basename(urlparse(item["bios_url"]).path) 123 | item["item_id"] = os.path.splitext(basename)[0] 124 | item["binary_name"] = basename 125 | item["attrs"] = dict(response.meta["item"]) 126 | 127 | yield Request(url= item["bios_url"], callback= self.parse_binary, 128 | meta= {"item": item}) 129 | pass 130 | 131 | def parse_submit(self, response): 132 | ### After navigating the search menus, parse a list of results. 133 | sel = Selector(response) 134 | 135 | results = sel.css("tr") 136 | for result in results: 137 | item = GigabyteLinkItem() 138 | item["driver_type"] = result.css(".text2").xpath(".//text()").extract()[0] 139 | item["name"] = result.css(".title3").css("a").xpath(".//text()").extract()[0] 140 | item["url"] = result.css(".title3").css("a").xpath("@href").extract()[0] 141 | params = _url_params(item["url"]) 142 | yield Request(url= _bios_url(params["pid"]), 143 | callback= self.parse_product, 144 | meta= {"item": item}) 145 | pass 146 | 147 | def parse_binary(self, response): 148 | item = response.meta["item"] 149 | item["binary"] = response.body 150 | 151 | yield item 152 | 153 | -------------------------------------------------------------------------------- /uefispider/spiders/dell_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request 5 | 6 | from uefispider.items import * 7 | 8 | import json 9 | import re 10 | 11 | #from scrapy.shell import inspect_response 12 | #inspect_response(response) 13 | 14 | class DellSpider(UefiSpider): 15 | name = 'DellSpider' 16 | allowed_domains = [ 17 | "search.dell.com", 18 | "www.dell.com", 19 | "dell.com" 20 | #"downloadmirror.intel.com", 21 | #"search.intel.com", 22 | ] 23 | 24 | dell_search_vars = { 25 | "c": "us", # country 26 | "l": "en", # language 27 | "s": "gen", # search type (home, business, generic) 28 | "cat": "sup", 29 | "k": "BIOS", # input 30 | "rpp": "20", # results per-page? 
does not change 31 | "p": "1", # page index 32 | "subcat": "dyd", 33 | "rf": "all", 34 | "nk": "f", 35 | "sort": "K", 36 | "nf": "catn~BI", 37 | "navla": "catn~BI", 38 | "ira": "False", 39 | "~srd": "False", 40 | "ipsys": "False", 41 | "advsrch": "False", 42 | "~ck": "anav" 43 | } 44 | 45 | filetype_blacklist = ["txt", "sign", "pdf"] 46 | 47 | results_url = "http://search.dell.com/results.aspx?%s" 48 | start_urls = [ 49 | results_url % 50 | ("&".join(["%s=%s" % (k, v) for k, v in dell_search_vars.iteritems()])) 51 | ] 52 | 53 | ### List of crawled item IDs 54 | item_ids = [] 55 | 56 | def _get_item_id(self, url): 57 | driver_id = url.find("driverId=") 58 | item = url[driver_id + len("driverId="):] 59 | if item.find("&") >= 0: 60 | item = item[:item.find("&")] 61 | return item 62 | 63 | def parse(self, response): 64 | sel = Selector(response) 65 | 66 | page_number = int(self.dell_search_vars["p"]) 67 | print "Debug: On Page %d" % page_number 68 | 69 | total_regex = r".* (\d+) Results" 70 | total_results = sel.css(".PaginationCtrlResltTxt") 71 | if len(total_results) < 1: 72 | ### Cannot determine the number of search results 73 | print "Error: cannot determine search results." 74 | return 75 | 76 | total_string = total_results.extract()[0] 77 | total_match = re.search(total_regex, total_string) 78 | if total_match is None: 79 | print "Error: cannot determine search results." 80 | return 81 | 82 | ### It turns out this is just a guestimate by Dell, let's double it?! 83 | total_results = int(total_match.group(1)) #* 2 84 | ### There's 20 results per page, and I cannot change this!? 85 | total_pages = (total_results / 20) + 1 86 | 87 | ### Parse this initial page's results. 88 | for result in self.parse_results(response): 89 | yield result 90 | 91 | for page in xrange(2, total_pages): 92 | self.dell_search_vars["p"] = str(page) 93 | yield Request( 94 | url= self.results_url % 95 | ("&".join(["%s=%s" % (k, v) for k, v in self.dell_search_vars.iteritems()])), 96 | callback= self.parse_results) 97 | pass 98 | 99 | def parse_results(self, response): 100 | ### Parse update results from search page, yield the links to the updates 101 | sel = Selector(response) 102 | drivers = sel.css("div.driver_container") 103 | if len(drivers) == 0: 104 | ### No items on this page 105 | print "Debug: reached the end." 106 | return 107 | 108 | result_items = [] 109 | for driver in drivers: 110 | result_item = DellBiosUpdateLinkItem() 111 | compatibility = driver.css("input.hdnCompProduct").xpath("@value") 112 | if len(compatibility) != 0: 113 | ### Compatibility tells us the model and thus mainboard/config. 114 | systems = compatibility.extract()[0].strip().split("#") 115 | result_item["compatibility"] = [] 116 | for system in systems: 117 | if system.find("DesktopLatitudeOptiplexPrecisionVostro") >= 0: 118 | result_item["compatibility"].append("XPS Notebook R720") 119 | else: 120 | result_item["compatibility"].append(system.strip()) 121 | 122 | url = driver.css("input.hdnDriverURL").xpath("@value") 123 | if len(url) == 0: 124 | print "ERROR: No URL for update?" 125 | continue 126 | ### Driver type is saved as a sanity check. 127 | result_item["url"] = url.extract()[0] 128 | details = driver.css("div.driver_detail::text").extract() 129 | result_item["driver_type"] = details[0][2:] 130 | ### Release date only includes the date, the previous versions include a timestamp. 
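### details[] comes from div.driver_detail: index 0 is the driver type, index 1
### the release date; the [2:] slice drops what appears to be a two-character
### label prefix in the markup.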
131 | result_item["release_date"] = details[1][2:] 132 | result_items.append(result_item) 133 | 134 | for item in result_items: 135 | item_id = self._get_item_id(item["url"]) 136 | ### Do not attempt duplicate update parsing. 137 | if item_id in self.item_ids: 138 | continue 139 | yield Request(url= item["url"], meta= {"result_item": item}, callback= self.parse_update) 140 | pass 141 | 142 | def parse_update(self, response): 143 | sel = Selector(response) 144 | 145 | ### There may be multiple downloads, the link is held in a javascript call. 146 | notes_link = "" 147 | driver_links = sel.css("#GetDriver").xpath("@href") 148 | if len(driver_links) == 0: 149 | raise Exception("Debug: No driver links found.") 150 | try: 151 | driver_links = [link.split(",")[1].strip("' ") for link in driver_links.extract()] 152 | except Exception, e: 153 | raise Exception("Error: cannot extract links. (%s)" % str(e)) 154 | 155 | ### Save the release link separately (if it exists). 156 | for link in driver_links: 157 | if link.find("Release") >= 0 and link[-3:] == "txt": 158 | notes_link = link 159 | 160 | driver_names = sel.css("p.DriverDetails_FileFormat_Names::text") 161 | if len(driver_names) == 0: 162 | raise Exception("Debug: No driver names found.") 163 | driver_names = driver_names.extract() 164 | 165 | ### Update version provided in header. 166 | version = sel.css("a#dellVendorVersionToolTipId::text").extract()[0] 167 | 168 | ### There is inconsistency in naming previous versions, which may include spaces and commas. 169 | previous_versions = [] 170 | pversions = sel.css("a#Versions") 171 | for pversion in pversions: 172 | version_name = "".join([c for c in pversion.xpath("text()").extract()[0] if c not in [",", " "]]) 173 | version_link = "http://www.dell.com/%s" % pversion.xpath("@href").extract()[0].split("&", 1)[0] 174 | version_date = pversion.xpath("../../following-sibling::td/text()").extract()[0].strip() 175 | version_id = version_link[version_link.find("driverId=") + len("driverId="):] 176 | previous_versions.append((version_name, version_link, version_date, version_id)) 177 | 178 | #print previous_versions 179 | importance = "Unknown" 180 | fixes = "" 181 | 182 | ### Parse optional importance label and fixes/enhancement content. 
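### The page renders these as expandable <h3> sections; they are matched by
### header text ("Level of Importance", "Fixes", "Compatibility") and only the
### importance value and the fixes body are actually kept.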
183 | expands = sel.xpath("//h3") 184 | for expand in expands: 185 | expand_text = expand.css("::text").extract()[0] 186 | if expand_text.find("Level of Importance") == 0: 187 | importance = expand_text[expand_text.find(":")+1:] 188 | if expand_text.find("Fixes") == 0: 189 | try: 190 | expand_body = expand.xpath(".//following-sibling::div")[0].\ 191 | css(".DriverDetails_RowData::text").extract() 192 | fixes = expand_body[0] 193 | except Exception, e: 194 | print e 195 | pass 196 | if expand_text.find("Compatibility") == 0: 197 | system_set = expand.xpath("./following-sibling") 198 | 199 | item = DellBiosUpdatePageItem() 200 | item["notes_url"] = notes_link 201 | item["bios_urls"] = [l for l in driver_links if l.split(".")[-1] not in self.filetype_blacklist] 202 | item["file_names"] = [n for n in driver_names if n.split(".")[-1] not in self.filetype_blacklist] 203 | item["previous_versions"] = previous_versions 204 | item["version"] = version 205 | item["importance"] = importance 206 | item["fixes"] = fixes 207 | #item["attrs"] = dict(response.meta["result_item"]) 208 | 209 | link_item = response.meta["result_item"] 210 | 211 | ### Try to get date (again) 212 | details = sel.css(".DriverDetails_Table_ItemLabel") 213 | if len(details) > 0: 214 | date = details[0].xpath("./following-sibling::td/text()").extract() 215 | if len(date) > 0: 216 | link_item["release_date"] = date[0].strip() 217 | item["attrs"] = dict(link_item) 218 | 219 | ### Set the item ID as the driver/update link ID. 220 | item["item_id"] = self._get_item_id(item["attrs"]["url"]) 221 | self.item_ids.append(item["item_id"]) 222 | 223 | for i in xrange(len(item["bios_urls"])): 224 | if item["bios_urls"][i].split(".")[-1].lower() != "exe": 225 | continue 226 | ### Download each file associated 227 | yield Request(url= item["bios_urls"][i], callback= self.parse_binary, 228 | meta= {"name": item["file_names"][i], "item_id": item["item_id"]}) 229 | ### For now, only download the first exe. 
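### (The other download URLs remain in item["bios_urls"], so they still end up
### in details.json even though only the first .exe is fetched.)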
230 | break 231 | 232 | ### Crawl the update versions (may be duplicates) for this system 233 | for update in previous_versions: 234 | update_item = DellBiosUpdateLinkItem() 235 | update_item["url"] = update[1] 236 | update_item["release_date"] = update[2] 237 | update_item["compatibility"] = link_item["compatibility"] 238 | #update_item["desc"] = link_item["desc"] 239 | yield Request(url= update[1], meta= {"result_item": update_item}, 240 | callback= self.parse_update) 241 | 242 | yield item 243 | pass 244 | 245 | def parse_binary(self, response): 246 | item = BinaryItem() 247 | item["binary"] = response.body 248 | item["binary_name"] = response.meta["name"] 249 | item["item_id"] = response.meta["item_id"] 250 | 251 | yield item 252 | 253 | -------------------------------------------------------------------------------- /uefispider/spiders/lenovo_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | from uefi_firmware.utils import red, blue 6 | 7 | from uefispider.items import * 8 | 9 | import json 10 | import re 11 | import copy 12 | import os 13 | 14 | lenovo_component = "1343112652574" 15 | product_search = "http://support.lenovo.com/en_US/downloads/default/%s.ajax?%s" 16 | product_select = "http://support.lenovo.com/en_US/downloads/default.page?%s" 17 | download_select = "http://download.lenovo.com/lenovo/content/ddfm/%s-%s-%s.html" 18 | 19 | ''' 20 | Usage: 21 | scrapy crawl -a dump=/tmp/spiders LenovoSpider 22 | 23 | Requirements: 24 | innoextract, 7-zip, cabextract, unrar; are all helpful. 25 | http://constexpr.org/innoextract/files/innoextract-1.4.tar.gz 26 | 27 | XML structure: 28 | Properties-> 29 | Data-> 30 | Result-> 31 | ProductSelectorResults-> 32 | Options-> 33 | [Option, value="id"].//text()=name 34 | ''' 35 | 36 | def _search_url(tree): 37 | if len(tree) > 3: 38 | return None 39 | select_types = ["getSeries", "getSubseries", "getMachineTypes"] 40 | selection = "-".join(tree) 41 | if len(tree) == 5: 42 | selection += "+" * 4 43 | else: 44 | selection += "-" * (5-len(tree)) 45 | return product_search % ( 46 | lenovo_component, 47 | "method=%s&productSelection=%s" % (select_types[len(tree)-1], selection) 48 | ) 49 | 50 | def _select_url(tree): 51 | if len(tree) > 3: 52 | return None 53 | query = { 54 | "submit": "true", 55 | "componentID": lenovo_component, 56 | "iwPreActions": "SetProduct", 57 | "prodId": "-".join(tree) + "--", 58 | "os": "" 59 | } 60 | ### This will set cookies and redirect, similar to HP. 61 | return product_select % "&".join(["%s=%s" % (k, v) for k, v in query.iteritems()]) 62 | 63 | def _download_url(series, subseries, product): 64 | return download_select % (series, subseries, product) 65 | 66 | class LenovoSpider(UefiSpider): 67 | name = 'LenovoSpider' 68 | allowed_domains = [ 69 | "lenovo.com", 70 | ] 71 | 72 | series_list = [ 73 | "P014", # Laptops & Tablets 74 | #"P013", # Desktop & All-In-Ones 75 | #"P022", # Workstations 76 | #"P023", # Servers 77 | ] 78 | 79 | start_urls = [ 80 | "http://support.lenovo.com/en_US/downloads/default.page" 81 | ] 82 | 83 | ### Hold a list of products/documents which are processed serially. 
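### doc_ids keeps the DocIDs already queued for crawling; many product pages
### link to the same BIOS update document, so each document is requested only once.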
84 | #products = {} 85 | doc_ids = [] 86 | 87 | def _get_results(self, response): 88 | sel = Selector(response) 89 | 90 | results = [] 91 | options = sel.css("Properties").xpath("./Data/Result/ProductSelectorResults/Options/option") 92 | for option in options: 93 | value = option.xpath("./@value").extract()[0] 94 | name = option.xpath("./text()").extract()[0] 95 | results.append((value, name)) 96 | return results 97 | 98 | def parse(self, response): 99 | for series in self.series_list: 100 | yield Request(url= _search_url([series]), callback= self.parse_series, 101 | meta= {"series": series, "dont_merge_cookies": True}) 102 | 103 | 104 | def parse_series(self, response): 105 | results = self._get_results(response) 106 | series = response.meta["series"] 107 | 108 | ### Now we have a set of subseries IDs. 109 | for result in results: 110 | yield Request(url= _search_url([series, result[0]]), 111 | #yield Request(url= _search_url([series, "S006"]), 112 | callback= self.parse_subseries, 113 | meta= {"series": series, "subseries": result[0]}) 114 | 115 | def parse_subseries(self, response): 116 | results = self._get_results(response) 117 | series = response.meta["series"] 118 | subseries = response.meta["subseries"] 119 | 120 | for result in results: 121 | yield Request(url = _download_url(series, subseries, result[0]), 122 | #yield Request(url= _download_url(series, "S006", "SS2500"), 123 | callback= self.parse_product, 124 | meta= {"cookiejar": result[1], "item_details": result}, 125 | dont_filter= True) 126 | 127 | def parse_product(self, response): 128 | def is_bios_update(name): 129 | #valid_names = ["BIOS Update"] 130 | valid_names = ["BIOS Update Utility"] 131 | ### The "utility" documents provide historic information. 132 | for valid in valid_names: 133 | if name.find(valid) >= 0: 134 | return True 135 | return False 136 | 137 | sel = Selector(response) 138 | 139 | ### There's a lot of information on this page, but the update document 140 | ### repeats this information and includes historic data. 141 | updates = [] 142 | rows = sel.css("#BIOS").css("#table1").xpath(".//tr")[1:] 143 | for row in rows: 144 | cells = row.xpath(".//td") 145 | name = cells[0].xpath("./text()").extract()[0] 146 | if not is_bios_update(name): 147 | ### This is not the droid we're looking for 148 | continue 149 | links = cells[0].xpath(".//a/@href").extract() 150 | updates.append(links[0]) 151 | 152 | for update in updates: 153 | doc_id = update.split("DocID=")[1] 154 | ### Begin critical section 155 | if doc_id in self.doc_ids: 156 | continue 157 | self.doc_ids.append(doc_id) 158 | ### End critical section 159 | yield Request(url= update, 160 | callback= self.parse_document, 161 | meta= {"item_details": response.meta["item_details"], "doc_id": doc_id}) 162 | 163 | pass 164 | 165 | def parse_document(self, response): 166 | sel = Selector(response) 167 | 168 | systems = None 169 | changes = None 170 | uefi = False 171 | packages = None 172 | 173 | ### Extract information for the current release from the downloads table. 174 | ### This information is NOT repeated in the version table below. 175 | downloads = sel.css(".downloadTable").xpath(".//tbody/tr") 176 | if len(downloads) == 0: 177 | ### Sometimes there is no downloads table!? 
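### Such documents are skipped entirely rather than guessing at the current
### release's download URL and date.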
178 | ### Todo: http://support.lenovo.com/en_US/downloads/detail.page?DocID=DS024464 179 | print red("Error: no downloads table found.") 180 | return 181 | 182 | binary_url = downloads[0].xpath(".//td")[0].xpath(".//a/@href").extract()[0] 183 | notes_url = downloads[1].xpath(".//td")[0].xpath(".//a/@href").extract()[0] 184 | date = downloads[0].xpath(".//td")[3].xpath("./text()").extract()[0] 185 | 186 | ### This is ugly! 187 | tables = sel.css(".v14-header-1") 188 | for table in tables: 189 | if len(table.xpath("./text()")) == 0: 190 | ### This may be a server without parsed HTML. 191 | continue 192 | table_name = table.xpath("./text()").extract()[0] 193 | if table_name.find("Supported") > -1 and table_name.find("Operating") == -1: 194 | ### Could be "Systems"/"System" 195 | systems = table 196 | if table_name.find("Summary of Changes") > -1: 197 | changes = table 198 | if table_name.find("UEFI") > -1: 199 | uefi = True 200 | if table_name.find("Package") > -1: 201 | packages = table 202 | 203 | if not uefi: 204 | ### Documents might be missing UEFI instructions. 205 | uefi = (response.body.find("UEFI") >= 0) 206 | 207 | print blue(response.meta["doc_id"]) 208 | print response.url 209 | print "date:", date 210 | print "UEFI:", "True" if uefi else red("False") 211 | print "True" if systems is not None else red("False"), 212 | print "True" if packages is not None else red("False") 213 | 214 | ### Todo: server pages are not found as UEFI 215 | ### http://support.lenovo.com/en_US/downloads/detail.page?DocID=DS032912 216 | 217 | if not uefi: 218 | ### This is not a UEFI update. 219 | return 220 | 221 | ### There are cases where the package table is not found. 222 | if packages is None: 223 | headers = sel.css("th") 224 | for header in headers: 225 | table_name = header.xpath("./text()").extract()[0] 226 | if table_name.find("Package") >= 0: 227 | packages = header 228 | break 229 | 230 | if packages is None: 231 | ### Could not recover. 
232 | print red("Error: no package list found.") 233 | return 234 | 235 | systems_list = [] 236 | if systems is None: 237 | ### Might not have a correctly formatted table for systems support 238 | ### http://support.lenovo.com/en_US/downloads/detail.page?DocID=DS013468 239 | cells = sel.xpath("//td/text()").extract() 240 | scanning_supported = False 241 | for cell in cells: 242 | if cell.find("Supported") >= 0 and cell.find("Operating") == -1: 243 | scanning_supported = True 244 | if scanning_supported: 245 | if cell[0] != "-": 246 | scanning_supported = False 247 | break 248 | systems_list.append(cell[1:].strip()) 249 | pass 250 | else: 251 | systems = systems.xpath("../../../following-sibling::ul")[0].xpath(".//li") 252 | for system in systems: 253 | systems_list.append(system.xpath("./text()").extract()[0]) 254 | 255 | if len(systems_list) == 0: 256 | print red("Error: no systems found") 257 | return 258 | 259 | update_list = [] 260 | updates = packages.xpath("../../..//tr")[1:] 261 | for i, update in enumerate(updates): 262 | cells = update.xpath(".//td") 263 | if len(cells) < 3: 264 | ### Might have a row-span (DocID=DS035105) 265 | continue 266 | ### This format will be X.XX (NAME) 267 | version = cells[1].xpath(".//text()").extract()[0].split("(") 268 | version = "%s (%s" % (version[0].strip(), version[1].strip()) 269 | release = cells[-1] 270 | #print version, release 271 | if i == 0: 272 | update_list.append((version, binary_url, notes_url)) 273 | continue 274 | urls = release.xpath(".//a/@href").extract() 275 | if len(urls) < 2: 276 | ### Could be unreleased (Not release to the web) (DocID=DS029726) 277 | urls = ["unknown", "unknown"] 278 | update_list.append((version, urls[0], urls[1])) 279 | 280 | meta = { 281 | "systems": systems_list, 282 | "updates": update_list, 283 | "date": date, 284 | "url": response.url 285 | } 286 | 287 | yield Request(url= notes_url, callback= self.parse_notes, meta= meta) 288 | 289 | def parse_notes(self, response): 290 | ### This is a text-only document containing the versions and release notes. 291 | text = response.body.split("\r\n") 292 | 293 | dates_list = [] 294 | release_notes = [] 295 | 296 | document = response.meta 297 | 298 | line_num = 0 299 | scanning_changes = False 300 | scanning_version = None 301 | version_notes = [] 302 | 303 | while line_num < len(text): 304 | line = text[line_num] 305 | line_num += 1 306 | 307 | ### Scan for "Package (ID)", next line is a set of delims, then updates until blank-line 308 | if line.find("Package") >= 0 and line.find("Issue Date") >= 0: 309 | line_num += 1 310 | for i in xrange(len(document["updates"])): 311 | line = text[line_num] 312 | line_num += 1 313 | if len(line) == 0: 314 | ### Problem! 315 | print red("Warning: no lines left while scanning updates.") 316 | break 317 | version_info = line.split(" ") 318 | dates_list.append(version_info[-1]) 319 | continue 320 | 321 | ### While scan for "<" as first character 322 | ### Version , add lines until blank-line 323 | if line.find("Summary of Changes") >= 0: 324 | scanning_changes = True 325 | if scanning_changes: 326 | if len(line) == 0 and line_num <= len(text) and len(text[line_num]) == 0: 327 | ### Double return, break 328 | scanning_changes = False 329 | continue 330 | if scanning_version and len(line) == 0: 331 | ### Append and reset version notes. 
332 | scanning_version = False 333 | release_notes.append(version_notes) 334 | version_notes = [] 335 | continue 336 | if scanning_version: 337 | version_notes.append(line.strip()) 338 | continue 339 | if len(line) > 0 and line[0] == "<": 340 | scanning_version = True 341 | continue 342 | pass 343 | 344 | ### Finally download the binaries 345 | for i, update in enumerate(document["updates"]): 346 | item = LenovoUpdateItem() 347 | item["url"] = document["url"] 348 | item["products"] = document["systems"] 349 | item["version"] = update[0] 350 | item["bios_url"] = update[1] 351 | item["notes_url"] = update[2] 352 | item["date"] = dates_list[i] if i < len(dates_list) else "unknown" 353 | item["desc"] = release_notes[i] if i < len(release_notes) else "unknown" 354 | item["item_id"] = update[0] 355 | 356 | if item["bios_url"] == "unknown": 357 | print red("Warning: BIOS url unknown") 358 | yield item 359 | else: 360 | if item["bios_url"][0:len("http://")] != "http://": 361 | ### Might be missing... 362 | ### download.lenovo.com/ibmdl/pub/pc/pccbbs/mobiles/h6uj03ww.exe 363 | item["bios_url"] = "http://" + item["bios_url"] 364 | yield Request(url= item["bios_url"], 365 | callback= self.parse_binary, 366 | meta= {"item": item}) 367 | pass 368 | 369 | def parse_binary(self, response): 370 | item = response.meta["item"] 371 | item["binary"] = response.body 372 | item["binary_name"] = os.path.basename(response.url) 373 | 374 | yield item 375 | pass 376 | -------------------------------------------------------------------------------- /uefispider/spiders/hp_spider.py: -------------------------------------------------------------------------------- 1 | 2 | from uefispider.spiders import UefiSpider 3 | from scrapy.selector import Selector 4 | from scrapy.http import Request, FormRequest 5 | from scrapy.http.cookies import CookieJar 6 | 7 | from uefispider.items import * 8 | 9 | import json 10 | import re 11 | import urllib 12 | import sys 13 | 14 | #from scrapy.shell import inspect_response 15 | #inspect_response(response) 16 | 17 | class HPSpider(UefiSpider): 18 | name = 'HPSpider' 19 | allowed_domains = [ 20 | "www2.hp.com", 21 | "hp.com" 22 | ] 23 | 24 | cookies = { 25 | "HP_SPF_HOST": "h20566.www2.hp.com", 26 | "HP_SPF_LOCALE": "en-US", 27 | "HP_SPF_SITE": "hpsc", 28 | } 29 | 30 | download_url = "http://ftp.hp.com/pub/softpaq/sp%d-%d/%s" 31 | 32 | start_urls = [ 33 | "http://h20566.www2.hp.com/portal/site/hpsc/template.PAGE/public/kb/search/" 34 | ] 35 | 36 | crawled_items = {} 37 | ### Store all of the crawled search results 38 | 39 | def _get_download_link(self, filename, sp_number= None): 40 | ### An update file name may include a distinct "SP" number. 41 | if sp_number is None: 42 | sp_number = filename 43 | 44 | update_id = sp_number[2:sp_number.find(".")] 45 | try: 46 | update_id = int(update_id) 47 | url = self.download_url % (1 + (update_id/500)*500, (update_id/500)*500 + 500, filename) 48 | except Exception, e: 49 | ### Cannot parse the filename, was an sp_number provided? 
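### (When parsing succeeds, the sp number is bucketed into ranges of 500;
### e.g. a hypothetical "sp60123.exe" would map to
### http://ftp.hp.com/pub/softpaq/sp60001-60500/sp60123.exe.)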
50 | url = None 51 | return url 52 | 53 | def _get_update_id(self, update_link): 54 | index = update_link.find("swItem%253D") + len("swItem%253D") 55 | end_tok = [update_link.find("&", index), update_link.find("%25", index)] 56 | if end_tok[0] >= 0 and end_tok[1] >= 0: 57 | end_tok = min(end_tok[0], end_tok[1]) 58 | elif end_tok[0] >= 0: end_tok = end_tok[0] 59 | else: end_tok = end_tok[1] 60 | update_id = update_link[index: end_tok] 61 | return update_id 62 | pass 63 | 64 | def _write_results(self): 65 | print "Debug: Finished reading search results, writing." 66 | with open("hp-output.json", 'w') as fh: 67 | fh.write(json.dumps([dict(item) for i, item in self.crawled_items.iteritems()])) 68 | return 69 | 70 | def parse(self, response): 71 | ### The initial hit of the search page, generate all additional searches, accept the cookies and begin. 72 | months = range(0, 12) # Month is 0-counted 73 | years = range(2006, 2014+1) # Years is actual-counted 74 | 75 | monthly_searches = [] 76 | for year in years: 77 | for month in months: 78 | end_year = year if month != 11 else year+1 79 | end_month = month+1 if month != 11 else 0 80 | #print (month, year, end_month, end_year) 81 | monthly_searches.append((year, month, end_year, end_month)) 82 | 83 | response.meta["searches"] = monthly_searches 84 | return self.parse_begin(response) 85 | 86 | def parse_begin(self, response): 87 | ### Hit the page we were redirected to with the cookies set. 88 | 89 | return Request(url = response.url, cookies= self.cookies, 90 | meta= {"searches": response.meta["searches"]}, 91 | callback= self.parse_accept) 92 | 93 | def parse_accept(self, response): 94 | ### At the search form, begin to generate monthly searches, alert if >100 results. 95 | sel = Selector(response) 96 | 97 | ### This will select the REAL url (with appended query string "tokens"). 98 | url_path = "" 99 | forms = sel.xpath("//form") 100 | for form in forms: 101 | form_ids = form.xpath("@id").extract() 102 | if len(form_ids) == 0: 103 | continue 104 | if form_ids[0] == "refineSearchForm": 105 | url_path = form.xpath("@action").extract()[0] 106 | 107 | ### The search load-balances 108 | domain = response.url[len("http://"):response.url.find(".")] 109 | 110 | url = "http://%s.www2.hp.com/%s" 111 | form_data = { 112 | "didYouMean": "", 113 | "searchCrit": "allwords", 114 | "docType":"Drivers", 115 | #"docType":"Patch", 116 | "dateRange":"all", 117 | "dateSearchType":"dateRange", 118 | "startDateYear": None, 119 | "startDateMonth": None, 120 | "startDateDay": "1", 121 | "endDateYear": None, 122 | "endDateMonth": None, 123 | "endDateDay":"1", 124 | "resPerPage":"100", 125 | "sortCrit":"date", 126 | "showSummary":"yesX", 127 | "calledBy":"Search_Main", 128 | "mode":"text", 129 | "searchString":"BIOS Update", 130 | "searchRes":"Search", 131 | "advSearchFlag":"true", 132 | } 133 | 134 | ### Pull off the remaining searchs, and fill in vars for the 'next' search. 
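### Each request consumes the head of the searches list and forwards the tail
### in meta, so the monthly windows are crawled one after another instead of
### being issued all at once.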
        remaining_searches = response.meta["searches"]

        form_data["startDateYear"] = str(remaining_searches[0][0])
        form_data["startDateMonth"] = str(remaining_searches[0][1])
        form_data["endDateYear"] = str(remaining_searches[0][2])
        form_data["endDateMonth"] = str(remaining_searches[0][3])

        return FormRequest(url= url % (domain, url_path) + "&month=%d&year=%d" % (remaining_searches[0][1], remaining_searches[0][0]),
                           headers= {"Content-Type": "application/x-www-form-urlencoded"},
                           formdata= form_data, method= "POST", cookies= self.cookies,
                           meta= {"searches": remaining_searches[1:], "this": (form_data["startDateYear"], form_data["startDateMonth"], form_data["endDateYear"], form_data["endDateMonth"])},
                           dont_filter= True,
                           callback= self.parse_search)
        pass

    def parse_search(self, response):
        ### The search results
        sel = Selector(response)

        results = sel.css("table[title='Search Results Index']").xpath(".//tr")[1:]
        if len(results) == 100:
            ### The search will only return 100 results.
            ### If 100 is reached, the search must be repeated with better accuracy.
            #raise Exception("Reached 100 results, day-granularity not implemented.")
            with open('overflow_months.log', 'a+') as fh:
                fh.write('%s %s %s %s\n' % (response.meta["this"][0], response.meta["this"][1], response.meta["this"][2], response.meta["this"][3]))
            print "Reached 100 results, consider day-granularity."

        for result in results:
            download_type = "".join(result.xpath(".//td")[2].xpath(".//text()").extract()).strip()
            if download_type != "BIOS":
                continue

            item = HPBiosUpdateLinkItem()
            item["url"] = "".join(result.xpath(".//td")[1].xpath(".//a/@href").extract()).strip()
            item["name"] = "".join(result.xpath(".//td")[1].xpath(".//a//text()").extract())
            item["date"] = "".join(result.xpath(".//td")[3].xpath(".//text()").extract())

            item["item_id"] = self._get_update_id(item["url"])

            if item["item_id"] in self.crawled_items:
                #raise Exception("Found duplicate: (%s, %s, %s)" % (item["item_id"], item["name"], item["date"]))
                print "Found duplicate: (%s, %s, %s)" % (item["item_id"], item["name"], item["date"])
                continue
            ### Store the item in the object-global item stash.
            self.crawled_items[item["item_id"]] = item

        remaining_search_count = len(response.meta["searches"])
        if remaining_search_count > 0:
            ### There are more searches, repeat.
            yield Request(url= self.start_urls[0], #+ "?%d" % remaining_search_count,
                          meta= {"searches": response.meta["searches"]},
                          dont_filter= True,
                          callback= self.parse_accept)
            return

        ### Debugging, make this an argument/option later
        self._write_results()

        ### The searches are complete, parse responses.
        for item_id, item in self.crawled_items.iteritems():
            #callback = self.parse_me_update if item["name"].find("ME Firmware Update") >= 0 else self.parse_update
            callback = self.parse_update
            yield Request(url= item["url"], callback= callback, meta= {"result_item": item})

    def parse_update(self, response):
        ### The update (download) page for the BIOS.
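        ### Flow from here: read the version and SoftPaq file name from the summary
        ### table, then follow the "revision history" tab if present, otherwise the
        ### "release notes" tab, otherwise request the binary directly.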
        sel = Selector(response)

        fields = sel.css("table.m10").xpath(".//tr/td")
        version = fields[1].xpath(".//text()").extract()[0]
        version = version[:version.find("(")].strip()

        name = fields[3].xpath(".//text()").extract()[0]
        name = name[:name.find("(")].strip()

        ### Try to parse the "name" as an "SP" number
        download_link = self._get_download_link(name)

        item = HPBiosUpdatePageItem()
        item["bios_url"] = download_link
        item["version"] = version
        item["binary_name"] = name
        item["attrs"] = dict(response.meta["result_item"])
        item["item_id"] = item["attrs"]["item_id"]

        ### Updates have their description in different tabs
        tab_names = {}
        tabs = sel.css("tr#device-nav").xpath(".//td")
        for i, tab in enumerate(tabs):
            tab_name = " ".join(tab.xpath(".//text()").extract()).lower()
            tab_link = "".join(tab.xpath(".//a/@href").extract())
            tab_names[tab_name] = tab_link

        ### Set the release notes URL; this may be optional.
        item["notes_url"] = tab_names["release notes"] if "release notes" in tab_names else None

        if "revision history" in tab_names:
            ### A version history is optional; this will parse the release notes afterward.
            return Request(url= tab_names["revision history"], callback= self.parse_versions,
                           meta= {"page_item": item})

        if item["notes_url"] is not None:
            return Request(url= item["notes_url"], callback= self.parse_notes, meta= {"page_item": item})

        ### We are finished, sadly, without much meta-information
        if download_link is None:
            raise Exception("Cannot parse notes and bad download (%s)." % item["binary_name"])
        return Request(url= download_link, callback= self.parse_binary, meta= {"page_item": item})

        pass

    def parse_versions(self, response):
        ### Parse an optional version history
        sel = Selector(response)
        item = response.meta["page_item"]

        previous_versions = []
        versions = sel.css("div#tabContent").css("a.udrline")
        for pversion in versions:
            version_link = "".join(pversion.xpath("@href").extract()).strip()
            version_text = "".join(pversion.xpath(".//text()").extract()).strip()
            version_text = version_text[version_text.find(":")+1:]

            version_id = self._get_update_id(version_link)
            ### Stored as an array to match the Dell spider's (JSON) format.
            previous_versions.append([version_text, version_link, version_id])

        item["previous_versions"] = previous_versions

        ### Must now parse notes!
        if item["notes_url"] is not None:
            return Request(url= item["notes_url"], callback= self.parse_notes, meta= {"page_item": item})

        if item["bios_url"] is None:
            raise Exception("Cannot parse notes (after versions) and bad download (%s)." % item["binary_name"])
        return Request(url= item["bios_url"], callback= self.parse_binary, meta= {"page_item": item})

        pass

    def parse_notes(self, response):
        ### Parse a potentially optional release notes section (url).
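        ### Two layouts are handled: notes rendered with font .heading/.body
        ### elements are delegated to parse_advanced_notes below, while plain-text
        ### dumps are split into sections using a "SECTION: value" regex.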
        sel = Selector(response)
        item = response.meta["page_item"]

        if len(sel.css("div#tabContent").xpath(".//font").css(".heading")) > 0:
            return self.parse_advanced_notes(response)

        ### This content is a textual dump
        sections = {}
        content = [line.strip() for line in sel.css("div#tabContent").xpath(".//td//text()").extract()]

        active_section = None
        for line in content:
            ### Find a SECTION: Value, or SECTION: (where the value follows on newlines).
            match = re.search(r"([A-Z\(\) ]+):(.*)", line)
            if match is None:
                if active_section is None:
                    continue
                ### Add this line to the previously-found section.
                sections[active_section].append(line.strip())
            else:
                match = match.groups()
                if len(match[1]) == 0:
                    ### Expect content to follow
                    active_section = match[0]
                    sections[match[0]] = []
                else:
                    active_section = None
                    sections[match[0]] = [match[1].strip()]

        #print sections
        section_fields = [
            ("SSM SUPPORTED", "ssm", True),
            ("DESCRIPTION", "desc", False),
            ("PURPOSE", "importance", True),
            ("HARDWARE PRODUCT MODEL(S)", "compatibility", False),
            ("FIXES", "fixes", False)
        ]

        for section_field in section_fields:
            if section_field[0] in sections:
                item[section_field[1]] = sections[section_field[0]]
                if section_field[2] and type(sections[section_field[0]]) == list:
                    item[section_field[1]] = item[section_field[1]][0]

        ### Finally, download the BIOS
        if item["bios_url"] is None:
            sp_number = sections["SOFTPAQ NUMBER"][0][:7] if "SOFTPAQ NUMBER" in sections else "0"
            download_link = self._get_download_link(item["binary_name"], sp_number= sp_number)
            if download_link is None:
                raise Exception("Cannot create download (%s), (%s)." % (item["binary_name"], sp_number))
            item["bios_url"] = download_link

        return Request(url= item["bios_url"], callback= self.parse_binary, meta= {"page_item": item})
        pass

    def parse_advanced_notes(self, response):
        sel = Selector(response)

        sections = sel.css("div#tabContent").xpath(".//font").css(".heading")
        content = sel.css("div#tabContent").xpath(".//font").css(".body")

        item = response.meta["page_item"]

        sp_section = "SoftPaq"
        sp_number = None
        section_fields = {
            "SSM": "ssm",
            "DESCRIPTION": "desc",
            "PURPOSE": "importance",
            "HARDWARE": "compatibility",
            "FIXES": "fixes"
        }

        for i, section in enumerate(sections):
            section_name = "".join(section.xpath(".//text()").extract())
            section_body = "\n".join([line.strip() for line in content[i].xpath(".//text()").extract() if len(line.strip()) > 0])
            for field, key in section_fields.iteritems():
                if section_name.find(field) == 0:
                    if field == "HARDWARE":
                        section_body = section_body.split("\n")
                    item[key] = section_body
            if section_name.find(sp_section) == 0:
                sp_number = section_body[:7]

            pass

        ### Finally, download the BIOS
        if item["bios_url"] is None:
            download_link = self._get_download_link(item["binary_name"], sp_number= sp_number)
            if download_link is None:
                raise Exception("Cannot create download (%s), (%s)." % (item["binary_name"], sp_number))
            item["bios_url"] = download_link

        return Request(url= item["bios_url"], callback= self.parse_binary, meta= {"page_item": item})
        pass

    def parse_binary(self, response):
        item = response.meta["page_item"]

        if item["binary_name"] == "Obtain\u00a0softwar":
            ### This is an odd handling of this error case: a EULA is required.
            item["binary_name"] = "EULA.html"
        item["binary"] = response.body

        yield item

--------------------------------------------------------------------------------