├── msdn-to-docset.py
└── static
    ├── DASH_LICENSE
    ├── Info.plist
    ├── icon.png
    └── icon@2x.png

/msdn-to-docset.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import sqlite3
import os
import sys
import glob
import re
import shutil
import logging
import json
import tarfile
import tempfile
import argparse
import urllib.parse
import urllib.error
import time
import collections
import zipfile

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup as bs  # pip install beautifulsoup4
from selenium import webdriver
# from selenium.webdriver import Firefox
# from selenium.webdriver.firefox.options import Options
# from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

from selenium.webdriver.chrome.options import Options


class PoshWebDriver:
    """ Thin wrapper around the selenium webdriver for page content retrieval """

    def __init__(self, executable_path=None):

        self.options = Options()
        self.options.add_argument("--headless")
        self.options.add_argument("--window-size=1920x1080")

        self.driver = webdriver.Chrome(options=self.options)

    def get_url_page(self, url):
        """ Retrieve the full html content of a page after Javascript execution """

        index_html = None
        try:
            self.driver.get(url)
            index_html = self.driver.page_source
        except (ConnectionResetError, urllib.error.URLError):
            # we may have triggered an anti-scraping time ban:
            # lay low for a couple of seconds and get back to it.
            self.driver.quit()
            time.sleep(2)

            self.driver = webdriver.Chrome(options=self.options)
            index_html = None

        # try a second time, and let the error propagate on failure
        if not index_html:
            self.driver.get(url)
            index_html = self.driver.page_source

        return index_html

    def quit(self):
        return self.driver.quit()
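
# Illustrative usage of the wrapper above (not part of the build pipeline);
# assumes a chromedriver binary compatible with the local Chrome is on PATH:
#
#   driver = PoshWebDriver()
#   html = driver.get_url_page("https://docs.microsoft.com/en-us/windows/win32/api/")
#   driver.quit()
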
class Configuration:

    # STATIC CONSTANTS
    docset_name = 'MSDN'

    domain = "docs.microsoft.com"
    default_theme_uri = "_themes/docs.theme/master/en-us/_themes"

    def __init__(self, args):

        # # selected powershell api version
        # self.powershell_version = args.version

        # # The modules and cmdlets pages are "versioned" using additional params in the GET request
        # self.powershell_version_param = "view=powershell-{0:s}".format(self.powershell_version)

        # build folder (must be cleaned afterwards)
        self.build_folder = os.path.join(os.getcwd(), "_build_msdn")

        # output file
        self.output_filepath = os.path.realpath(args.output)

        # win32 docs start pages
        self.api_index_url = "https://docs.microsoft.com/en-us/windows/win32/api/"
        self.docs_index_url = "https://docs.microsoft.com/en-us/windows/win32/desktop-app-technologies"

        # # powershell docs table of contents url
        # self.docs_toc_url = "https://{0:s}/psdocs/toc.json?{2:s}".format(
        #     Configuration.base_url,
        #     self.powershell_version,
        #     self.powershell_version_param
        # )

        # self.windows_toc_url = "https://{0:s}/windowsserver2019-ps/toc.json?view=windowsserver2019-ps".format(
        #     Configuration.base_url
        # )

        # selenium webdriver
        self.webdriver = PoshWebDriver()

        self.crawl_contents = True

        # selected modules
        # self.filter_modules = [module.lower() for module in args.modules]


# Global session with retries on transient server errors.
# Both schemes are mounted; the original only mounted 'http://', which meant
# the retry policy never applied to the https:// urls actually crawled.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))


def download_binary(url, output_filename):
    """ Download GET request as binary file """
    global session

    logging.debug("download_binary : %s -> %s" % (url, output_filename))

    # ensure the folder path actually exists
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)

    r = session.get(url, stream=True)
    with open(output_filename, 'wb') as f:
        for data in r.iter_content(32 * 1024):
            f.write(data)


def download_textfile(url: str, output_filename: str, params: dict = None):
    """ Download GET request as utf-8 text file """
    global session

    logging.debug("download_textfile : %s -> %s" % (url, output_filename))

    # ensure the folder path actually exists
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)

    while True:
        try:
            # params are query-string parameters (a GET request has no body)
            r = session.get(url, params=params)
        except ConnectionError:
            logging.debug("caught ConnectionError, retrying...")
            time.sleep(2)
        else:
            break

    # do not write 404 pages on disk
    if r.status_code != 200:
        return False

    r.encoding = 'utf-8'
    with open(output_filename, 'w', encoding="utf-8") as f:
        f.write(r.text)

    return True
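
# Illustrative call (hypothetical filename and query parameter): fetch a page
# with an extra parameter appended to the GET request's query string.
#
#   download_textfile(
#       "https://docs.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-createfilea",
#       "_build_msdn/createfilea.html",
#       params={"view": "sdk-api"},
#   )
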
def make_docset(source_dir, dst_filepath, filename):
    """
    Tar-gz the build directory while conserving the relative folder tree paths.
    Copied from : https://stackoverflow.com/a/17081026/1741450
    """
    dst_dir = os.path.dirname(dst_filepath)
    tar_filepath = os.path.join(dst_dir, '%s.tar' % filename)

    with tarfile.open(tar_filepath, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))

    shutil.move(tar_filepath, dst_filepath)


# NOTE: download_page_contents and download_module_contents are leftovers from
# the PowerShell docset scraper this script was derived from; they reference
# Configuration attributes that are commented out above and are never called.

def download_page_contents(configuration, uri, output_filepath):
    """ Download a page using its URI from the TOC """

    # Resolve the "absolute" url and use the appropriate version
    full_url = urllib.parse.urljoin(configuration.docs_toc_url, uri)
    versioned_url = "{0:s}?{1:s}".format(full_url, configuration.powershell_version_param)

    download_textfile(versioned_url, output_filepath)


def download_module_contents(configuration, module_name, module_uri, module_dir, cmdlets, root_dir):
    """ Download a module's contents """

    module_filepath = os.path.join(module_dir, "%s.html" % module_name)

    logging.debug("downloading %s module index page -> %s" % (module_name, module_filepath))
    if module_uri:
        download_page_contents(configuration, module_uri, module_filepath)

    cmdlets_infos = []

    # Downloading cmdlet contents
    for cmdlet in cmdlets:

        cmdlet_name = cmdlet['toc_title']
        if cmdlet_name.lower() in ("about", "functions", "providers", "provider"):  # skip special toc entries
            continue

        cmdlet_uri = cmdlet["href"]
        cmdlet_filepath = os.path.join(module_dir, "%s.html" % cmdlet_name)

        logging.debug("downloading %s cmdlet doc -> %s" % (cmdlet_name, cmdlet_filepath))
        download_page_contents(configuration, cmdlet_uri, cmdlet_filepath)

        cmdlets_infos.append({
            'name': cmdlet_name,
            'path': os.path.relpath(cmdlet_filepath, root_dir),
        })

    module_infos = {
        'name': module_name,
        'index': os.path.relpath(module_filepath, root_dir),
        'cmdlets': cmdlets_infos
    }

    return module_infos


def _findname(obj, key):
    """ Return the 'toc_title' value associated to a 'href' node """
    if obj.get('href', None) == key:
        return obj['toc_title']
    for k, v in obj.items():
        if isinstance(v, dict):
            item = _findname(v, key)
            if item is not None:
                return item
        if isinstance(v, list):
            for i in v:
                item = _findname(i, key)
                if item is not None:
                    return item
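
# _findname above walks an arbitrarily nested toc.json fragment. The shape
# below is an illustrative, simplified example of what docs.microsoft.com
# returns:
#
#   {"items": [{"toc_title": "File API",
#               "href": "/windows/win32/api/fileapi/",
#               "children": [{"toc_title": "CreateFileA",
#                             "href": "/windows/win32/api/fileapi/nf-fileapi-createfilea"}]}]}
#
# so _findname(toc["items"][0], ".../nf-fileapi-createfilea") == "CreateFileA".
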
def crawl_sdk_api_folder(configuration: Configuration, download_dir: str, source_dir: str, directory: str, api_content_toc: dict):

    for markdown_filepath in glob.glob(os.path.join(source_dir, directory, "*.md")):

        page_filename, page_ext = os.path.splitext(os.path.basename(markdown_filepath))
        realarb = os.path.relpath(os.path.dirname(markdown_filepath), source_dir)

        # already processed
        if page_filename == "index":
            continue

        url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}/{1:s}".format(realarb, page_filename)
        filepath = os.path.join(download_dir, "docs.microsoft.com/en-us/windows/win32/api/{0:s}/{1:s}.html".format(realarb, page_filename))
        logging.info("[+] download page %s -> %s " % (url, filepath))
        success = download_textfile(url, filepath)

        if not success:
            logging.info("[X] could not download page %s -> %s " % (url, filepath))
            continue

        url_relpath = "/windows/win32/api/{0:s}/{1:s}".format(realarb, page_filename)
        page_title = _findname(api_content_toc['toc'][directory]['items'][0], url_relpath)

        # sdk-api page names encode their kind in a two-letter prefix
        if page_filename.startswith("nc-"):
            category = "callbacks"
        elif page_filename.startswith("ne-"):
            category = "enums"
        elif page_filename.startswith("nf-"):
            category = "functions"
        elif page_filename.startswith("nn-"):
            category = "interfaces"
        elif page_filename.startswith("ns-"):
            category = "structures"
        elif page_filename.startswith("nl-"):
            category = "classes"
        else:
            category = "entries"

        api_content_toc[category].append({
            'name': page_title,
            'path': "docs.microsoft.com/en-us{0:s}.html".format(url_relpath),
        })

    return api_content_toc
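
# For instance, the markdown source "fileapi/nf-fileapi-createfilea.md" maps to
# the page ".../windows/win32/api/fileapi/nf-fileapi-createfilea" and is
# indexed under the "functions" category because of its "nf-" prefix.
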
def crawl_sdk_api_contents(configuration: Configuration, download_dir: str, source_dir: str):
    """ Download sdk-api entries based on TOC """

    api_content_toc = {
        'categories': [],
        'files': [],
        'callbacks': [],
        'functions': [],
        'enums': [],
        'interfaces': [],
        'structures': [],
        'classes': [],

        'entries': [],
        'toc': {}
    }

    content_dir = os.path.join(source_dir, "sdk-api-docs", "sdk-api-src", "content")

    for directory in os.listdir(content_dir):

        # download toc for directory
        toc_url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}/toc.json".format(directory)
        logging.info("[+] download toc for directory %s" % (toc_url))
        toc_r = requests.get(toc_url)
        if toc_r.status_code == 200:
            # reuse the response instead of issuing a second identical request
            api_content_toc['toc'][directory] = json.loads(toc_r.text)
        else:
            logging.warning("[!] no TOC found at %s !" % (toc_url))

        # only index folders with a toc
        if not api_content_toc['toc'].get(directory, None):
            continue

        # "meta" directory
        if directory.startswith("_"):

            url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory)
            filepath = os.path.join(download_dir, "docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html")
            logging.info("[+] download page %s -> %s " % (url, filepath))
            download_textfile(url, filepath)

            category_title = api_content_toc['toc'][directory]['items'][0]['toc_title']
            api_content_toc['categories'].append({
                'name': category_title,
                'path': os.path.join("docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html"),
            })

        # directory generated from a header file
        else:

            url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory)
            filepath = os.path.join(download_dir, "docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html")
            logging.info("[+] download page %s -> %s " % (url, filepath))
            download_textfile(url, filepath)

            category_title = directory
            if api_content_toc['toc'].get(directory, None):
                category_title = api_content_toc['toc'][directory]['items'][0]['toc_title']

            api_content_toc['files'].append({
                'name': category_title,
                'path': os.path.join("docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html"),
            })

        api_content_toc = crawl_sdk_api_folder(configuration, download_dir, content_dir, directory, api_content_toc)

    return api_content_toc
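
# The resulting api_content_toc maps each crawled page to a Dash-indexable
# record, e.g. (illustrative):
#
#   api_content_toc["functions"] == [
#       {"name": "CreateFileA",
#        "path": "docs.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-createfilea.html"},
#       ...,
#   ]
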
def crawl_msdn_contents(configuration: Configuration, download_dir: str, source_dir: str):
    """ Download MSDN modules and content pages based on TOC """

    content_toc = {
        'attributes': [],
        'classes': [],
        'entries': [],
        'guides': [],
        'toc': {},
    }

    desktop_src_dir = os.path.join(source_dir, "win32-docs", "desktop-src")
    for r, d, f in os.walk(desktop_src_dir, topdown=True):

        # copy images referenced by the documentation pages
        for image_file in filter(lambda s: os.path.splitext(s)[1] in [".png", ".jpg", ".jpeg"], f):
            realarb = os.path.relpath(r, desktop_src_dir)
            image_dir = os.path.join(download_dir, "docs.microsoft.com/win32", realarb)
            filepath = os.path.join(image_dir, image_file)

            os.makedirs(image_dir, exist_ok=True)
            shutil.copyfile(os.path.join(r, image_file), filepath)

        for markdown_file in filter(lambda s: os.path.splitext(s)[1] == ".md", f):
            page_filename, page_ext = os.path.splitext(markdown_file)

            realarb = os.path.relpath(r, desktop_src_dir)
            url = "https://docs.microsoft.com/en-us/windows/win32/{0:s}/{1:s}".format(
                realarb,
                page_filename
            )

            # retrieve html of page
            page_dir = os.path.join(download_dir, "docs.microsoft.com/win32", realarb)
            filepath = os.path.join(page_dir, "%s.html" % page_filename)
            logging.debug("[+] download page %s -> %s " % (url, filepath))
            download_textfile(url, filepath)

            # don't care about top level pages
            if realarb == '.':
                continue

            # First time navigating in this directory
            if realarb not in content_toc['toc'].keys():

                # download toc for page
                toc_url = "https://docs.microsoft.com/en-us/windows/win32/{0:s}/toc.json".format(realarb)
                logging.info("[+] download toc for page %s" % (toc_url))

                toc_r = requests.get(toc_url)
                if toc_r.status_code != 200:

                    # Could not find a toc for this folder
                    content_toc['toc'][realarb] = {
                        'toc': {'items': [{}]}
                    }

                    content_toc['guides'].append({
                        'name': page_filename,
                        'path': os.path.join(os.path.relpath(page_dir, download_dir), "%s.html" % page_filename),
                    })

                else:
                    # reuse the response instead of issuing a second identical request
                    component_toc = json.loads(toc_r.text)

                    component_title = component_toc['items'][0]['toc_title']
                    component_href = component_toc['items'][0]['href']

                    content_toc['toc'][realarb] = {
                        'toc': component_toc
                    }

                    content_toc['guides'].append({
                        'name': component_title,
                        'path': os.path.join(os.path.relpath(page_dir, download_dir), "%s.html" % component_href),
                    })

            # Adding current page to content toc

            # Class page
            if "ADSchema" in realarb and page_filename.startswith("c-"):
                logging.info("[+] new class page %s" % (page_filename))

                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['classes'].append({
                    'name': page_title,
                    'path': os.path.relpath(filepath, download_dir),
                })

            # Attribute page
            elif "ADSchema" in realarb and page_filename.startswith("a-"):
                logging.debug("[+] new attribute page %s" % (page_filename))

                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['attributes'].append({
                    'name': page_title,
                    'path': os.path.relpath(filepath, download_dir),
                })

            # Generic entry
            else:
                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['entries'].append({
                    'name': page_title,
                    'path': os.path.relpath(filepath, download_dir),
                })

    return content_toc
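
# The rewriting below turns docs.microsoft.com absolute links (e.g.
# "/en-us/windows/win32/api/...") into relative "....html" links so that
# navigation keeps working inside the offline docset; links it cannot map are
# deliberately left untouched, to keep broken urls visible.
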
def rewrite_soup(configuration: Configuration, soup, html_path: str, documents_dir: str):
    """ Rewrite html contents by fixing links and removing unnecessary cruft """

    # Fix navigation links
    links = soup.findAll("a", {"data-linktype": "relative-path"})  # for modules and cmdlet pages
    link_pattern = re.compile(r"([\w\.\/-]+)")

    for link in links:

        href = link['href']
        fixed_href = href

        # go to a relative page
        targets = link_pattern.findall(href)
        if not len(targets):  # badly formatted 'a' link
            continue

        page_target = targets[0]
        if page_target[-1] == '/':  # module index
            fixed_href = "%sindex.html" % page_target
        else:
            fixed_href = "%s.html" % page_target

        if fixed_href != href:
            logging.info("link rewrite : %s -> %s " % (href, fixed_href))
            link['href'] = fixed_href

    # rewrite or remove links to external references
    for abs_href in soup.findAll("a", {"data-linktype": "absolute-path"}):

        # some external hrefs point from win32 pages to api pages, e.g. IActivationFactory
        if abs_href['href'].startswith("/en-us/windows/win32/api/"):

            # remove the leading /
            prefix, *abs_suffix = abs_href['href'].split("/")

            # strip .html if it exists
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", *abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            if rel_href[-1] == '/':  # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"

        # some external hrefs use the legacy desktop api path, e.g. DefineDosDevice
        elif abs_href['href'].startswith("/en-us/windows/desktop/api/"):

            # rewrite /en-us/windows/desktop/api to /en-us/windows/win32/api
            prefix, abs_suffix = abs_href['href'].split("/en-us/windows/desktop/api/")

            # strip .html if it exists
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", "en-us", "windows", "win32", "api", abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            if rel_href[-1] == '/':  # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"

        # some external hrefs use the legacy desktop path, e.g. Inspect
        elif abs_href['href'].startswith("/en-us/windows/desktop/"):

            # rewrite /en-us/windows/desktop to /win32/
            prefix, abs_suffix = abs_href['href'].split("/en-us/windows/desktop/")

            # strip .html if it exists
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", "win32", abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            if rel_href[-1] == '/':  # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"

        # other docs.microsoft.com links, e.g. UISettings.TextScaleFactorChanged Event:
        # point them at the online documentation
        elif abs_href['href'].startswith("/en-us/"):
            full_url_target = "https://docs.microsoft.com" + abs_href['href']
            abs_href['href'] = full_url_target

        # leave every other absolute linktype alone since we don't know how to handle it
        else:
            # TODO : currently we don't replace it, in order to keep broken urls visible
            # abs_href.replace_with(abs_href.text)
            pass

    # remove unsupported nav elements
    nav_elements = [
        ["nav", {"class": "doc-outline", "role": "navigation"}],
        ["ul", {"class": "breadcrumbs", "role": "navigation"}],
        ["div", {"class": "sidebar", "role": "navigation"}],
        ["div", {"class": "dropdown dropdown-full mobilenavi"}],
        ["p", {"class": "api-browser-description"}],
        ["div", {"class": "api-browser-search-field-container"}],
        ["div", {"class": "pageActions"}],
        ["div", {"class": "container footerContainer"}],
        ["div", {"class": "dropdown-container"}],
        ["div", {"class": "binary-rating-buttons"}],
        ["ul", {"class": "metadata page-metadata"}],
        ["div", {"data-bi-name": "pageactions"}],
        ["div", {"class": "page-action-holder"}],
        ["div", {"class": "header-holder"}],
        ["footer", {"data-bi-name": "footer", "id": "footer"}],
        ["div", {"class": "binary-rating-holder"}],
        ["div", {"id": "left-container"}],
    ]

    for nav in nav_elements:
        nav_class, nav_attr = nav

        for nav_tag in soup.findAll(nav_class, nav_attr):
            _ = nav_tag.extract()

    # remove script elems
    for head_script in soup.head.findAll("script"):
        _ = head_script.extract()

    # Extract and rewrite additional stylesheets to download
    ThemeResourceRecord = collections.namedtuple('ThemeResourceRecord', 'url, path')

    theme_output_dir = os.path.join(documents_dir, Configuration.domain)
    theme_resources = []

    for link in soup.head.findAll("link", {"rel": "stylesheet"}):
        uri_path = link['href'].strip()

        if not uri_path.lstrip('/').startswith(Configuration.default_theme_uri):
            continue

        # Construct (url, path) tuple
        css_url = "https://%s/%s" % (Configuration.domain, uri_path)
        css_filepath = os.path.join(theme_output_dir, uri_path.lstrip('/'))

        # Convert href to a relative link
        path = os.path.relpath(css_filepath, os.path.dirname(html_path))
        rel_uri = '/'.join(path.split(os.sep))
        link['href'] = rel_uri

        theme_resources.append(ThemeResourceRecord(
            url=css_url,
            path=os.path.relpath(css_filepath, documents_dir),  # stored as relative path
        ))

    return soup, set(theme_resources)
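
# Stylesheets hosted under the shared docs theme, e.g. (illustrative path)
# "/_themes/docs.theme/master/en-us/_themes/styles/site.css", are queued for
# download by rewrite_soup above and their <link> hrefs rewritten to the local
# copy, so pages keep their styling offline.
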
def rewrite_html_contents(configuration: Configuration, html_root_dir: str):
    """ Rewrite every html file downloaded """

    additional_resources = set()

    for html_file in glob.glob("%s/**/*.html" % html_root_dir, recursive=True):

        logging.info("rewrite html_file : %s" % (html_file))

        # Read content and parse html
        with open(html_file, 'r', encoding='utf8') as i_fd:
            html_content = i_fd.read()

        soup = bs(html_content, 'html.parser')

        # rewrite html
        soup, resources = rewrite_soup(configuration, soup, html_file, html_root_dir)
        additional_resources = additional_resources.union(resources)

        # Export fixed html
        fixed_html = soup.prettify("utf-8")
        with open(html_file, 'wb') as o_fd:
            o_fd.write(fixed_html)

    return additional_resources


def download_additional_resources(configuration: Configuration, documents_dir: str, resources_to_dl: set = set()):
    """ Download optional resources for "beautification" """

    for resource in resources_to_dl:

        download_textfile(
            resource.url,
            os.path.join(documents_dir, resource.path)
        )

    # Copy the docset index start page
    src_index_filepath = os.path.join(documents_dir, Configuration.domain, "win32", "desktop-app-technologies.html")
    index_filepath = os.path.join(documents_dir, Configuration.domain, "win32", "index.html")
    shutil.copy(src_index_filepath, index_filepath)
def create_sqlite_database(configuration, content_toc, resources_dir, documents_dir):
    """ Index the html documents in a format Dash can understand """

    def insert_into_sqlite_db(cursor, name, record_type, path):
        """ Insert a new unique record in the sqlite database. """
        try:
            cursor.execute('SELECT rowid FROM searchIndex WHERE path = ?', (path,))
            dbpath = cursor.fetchone()
            cursor.execute('SELECT rowid FROM searchIndex WHERE name = ?', (name,))
            dbname = cursor.fetchone()

            if dbpath is None and dbname is None:
                cursor.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, record_type, path))
                logging.debug('DB add [%s] >> name: %s, path: %s' % (record_type, name, path))
            else:
                logging.debug('record exists')

        except sqlite3.Error as e:
            # do not abort the whole indexing run on a single bad record
            logging.debug('DB error : %s' % e)

    sqlite_filepath = os.path.join(resources_dir, "docSet.dsidx")
    if os.path.exists(sqlite_filepath):
        os.remove(sqlite_filepath)

    db = sqlite3.connect(sqlite_filepath)
    cur = db.cursor()
    cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
    cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

    # Map content toc categories to Dash entry types
    mapping = {
        # win32 content
        "guides": "Guide",
        "attributes": "Attribute",
        "classes": "Class",
        "entries": "Entry",

        # sdk-api content
        "categories": "Category",
        "files": "File",

        'callbacks': "Callback",
        'functions': "Function",
        'enums': "Enum",
        'interfaces': "Interface",
        'structures': "Structure",
    }

    for key in mapping.keys():

        for _value in content_toc.get(key, []):

            # paths should be unix compliant
            value_path = _value['path'].replace(os.sep, '/')
            insert_into_sqlite_db(cur, _value['name'], mapping[key], value_path)

    # commit and close db
    db.commit()
    db.close()
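
# A populated index can be sanity-checked from the command line (illustrative):
#
#   sqlite3 MSDN.docset/Contents/Resources/docSet.dsidx \
#       "SELECT name, type, path FROM searchIndex WHERE name LIKE 'CreateFile%' LIMIT 5"
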
def copy_folder(src_folder: str, dst_folder: str):
    """ Copy a full folder tree anew every time """

    def onerror(func, path, exc_info):
        """
        Error handler for ``shutil.rmtree``.

        If the error is due to an access error (read only file)
        it attempts to add write permission and then retries.

        If the error is for another reason it re-raises the error.

        Usage : ``shutil.rmtree(path, onerror=onerror)``
        """
        import stat

        if not os.path.exists(path):
            return

        if not os.access(path, os.W_OK):
            # Is the error an access error ?
            os.chmod(path, stat.S_IWUSR)
            func(path)
        else:
            raise

    shutil.rmtree(dst_folder, ignore_errors=False, onerror=onerror)
    shutil.copytree(src_folder, dst_folder)


def merge_folders(src, dst):
    """ Recursively merge the src tree into dst, overwriting files """

    if os.path.isdir(src):

        if not os.path.exists(dst):
            os.makedirs(dst)

        for name in os.listdir(src):
            merge_folders(
                os.path.join(src, name),
                os.path.join(dst, name)
            )
    else:
        shutil.copyfile(src, dst)


def main(configuration: Configuration):

    # """ Scheme for content toc :
    # {
    #     module_name : {
    #         'name' : str,
    #         'index' : relative path,
    #         'entries' : [
    #             {
    #                 'name' : str,
    #                 'path' : relative path,
    #             },
    #             ...
    #         ]
    #     },
    #     ...
    # }
    # """
    content_toc = {}
    resources_to_dl = set()

    """ 0. Prepare folders """
    source_dir = os.path.join(configuration.build_folder, "_0_win32_source")
    api_source_dir = os.path.join(configuration.build_folder, "_0_api_sdk_source")

    download_dir = os.path.join(configuration.build_folder, "_1_downloaded_contents")
    html_rewrite_dir = os.path.join(configuration.build_folder, "_2_html_rewrite")
    additional_resources_dir = os.path.join(configuration.build_folder, "_3_additional_resources")
    package_dir = os.path.join(configuration.build_folder, "_4_ready_to_be_packaged")

    for folder in [source_dir, api_source_dir, download_dir, html_rewrite_dir, additional_resources_dir, package_dir]:
        os.makedirs(folder, exist_ok=True)

    # _4_ready_to_be_packaged is the final build dir
    docset_dir = os.path.join(package_dir, "%s.docset" % Configuration.docset_name)
    content_dir = os.path.join(docset_dir, "Contents")
    resources_dir = os.path.join(content_dir, "Resources")
    document_dir = os.path.join(resources_dir, "Documents")

    if configuration.crawl_contents:
        # download the source repositories to drive the scraping : extremely long operation
        logging.info("Downloading win32 markdown zipped sources : %s -> %s" % ("https://github.com/MicrosoftDocs/win32/archive/refs/heads/docs.zip", os.path.join(source_dir, "docs.zip")))
        download_binary("https://github.com/MicrosoftDocs/win32/archive/refs/heads/docs.zip", os.path.join(source_dir, "docs.zip"))

        logging.info("Extracting win32 markdown zipped sources")
        with zipfile.ZipFile(os.path.join(source_dir, "docs.zip"), 'r') as zip_ref:
            zip_ref.extractall(source_dir)

        logging.info("Downloading sdk-api markdown zipped sources : %s -> %s" % ("https://github.com/MicrosoftDocs/sdk-api/archive/refs/heads/docs.zip", os.path.join(api_source_dir, "docs.zip")))
        download_binary("https://github.com/MicrosoftDocs/sdk-api/archive/refs/heads/docs.zip", os.path.join(api_source_dir, "docs.zip"))

        logging.info("Extracting sdk-api markdown zipped sources")
        with zipfile.ZipFile(os.path.join(api_source_dir, "docs.zip"), 'r') as zip_ref:
            zip_ref.extractall(api_source_dir)

        """ 1. Download html pages """
        logging.info("[1] scraping win32 web contents")
        content_toc = crawl_msdn_contents(configuration, download_dir, source_dir)

        logging.info("[1] scraping sdk-api web contents")
        api_content_toc = crawl_sdk_api_contents(configuration, download_dir, api_source_dir)

        # Merge win32 and sdk-api content tocs
        content_toc.update(api_content_toc)
        with open(os.path.join(download_dir, "toc.json"), "w") as content:
            json.dump(content_toc, content)
    else:
        with open(os.path.join(download_dir, "toc.json"), "r") as content:
            content_toc = json.load(content)

    """ 2. Parse and rewrite html contents """
    logging.info("[2] rewriting urls and hrefs")
    copy_folder(download_dir, html_rewrite_dir)
    resources_to_dl = rewrite_html_contents(configuration, html_rewrite_dir)

    """ 3. Download additional resources """
    logging.info("[3] download style contents")
    copy_folder(html_rewrite_dir, additional_resources_dir)
    download_additional_resources(configuration, additional_resources_dir, resources_to_dl)

    """ 4. Database indexing """
    logging.info("[4] indexing to database")
    copy_folder(additional_resources_dir, document_dir)
    create_sqlite_database(configuration, content_toc, resources_dir, document_dir)

    """ 5. Archive packaging """
    src_dir = os.path.dirname(__file__)
    shutil.copy(os.path.join(src_dir, "static/Info.plist"), content_dir)
    shutil.copy(os.path.join(src_dir, "static/DASH_LICENSE"), os.path.join(resources_dir, "LICENSE"))
    shutil.copy(os.path.join(src_dir, "static/icon.png"), docset_dir)
    shutil.copy(os.path.join(src_dir, "static/icon@2x.png"), docset_dir)

    output_dir = os.path.dirname(configuration.output_filepath)
    os.makedirs(output_dir, exist_ok=True)

    logging.info("[5] packaging as a dash docset")
    make_docset(
        docset_dir,
        configuration.output_filepath,
        Configuration.docset_name
    )
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description="Dash docset creation script for MSDN's Win32 API"
    )

    parser.add_argument("-vv", "--verbose",
        help="increase output verbosity",
        action="store_true"
    )

    subparsers = parser.add_subparsers(help='sub-command help', dest='command')

    parser_create = subparsers.add_parser('create_docset', help='scrape the web in order to create a docset')
    parser_create.add_argument("-t", "--temporary",
        help="Use a temporary directory for creating the docset, otherwise use the current dir.",
        default=False,
        action="store_true"
    )

    parser_create.add_argument("-o", "--output",
        help="set output filepath",
        default=os.path.join(os.getcwd(), "MSDN.tgz"),
    )

    parser_create.add_argument("-s", "--sampling",
        help="generate only a 'sample' docset, in order to test whether the rewriting rules are correct",
        default=False,
        action="store_true"
    )

    parser_rewrite = subparsers.add_parser('rewrite_html', help='rewrite an html file in order to test rules')

    parser_rewrite.add_argument("input",
        help="set input filepath"
    )

    parser_rewrite.add_argument("output",
        help="set output filepath"
    )

    parser_rewrite.add_argument("html_root_dir",
        help="set html_root_dir filepath"
    )

    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

    if args.command == "rewrite_html":

        conf = Configuration(args)

        # Read content and parse html
        with open(args.input, 'r', encoding='utf8') as i_fd:
            html_content = i_fd.read()

        soup = bs(html_content, 'html.parser')

        # rewrite html
        soup, resources = rewrite_soup(conf, soup, args.input, args.html_root_dir)

        # Export fixed html
        fixed_html = soup.prettify("utf-8")
        with open(args.output, 'wb') as o_fd:
            o_fd.write(fixed_html)

    elif args.command == "create_docset":
        conf = Configuration(args)

        if args.temporary:

            with tempfile.TemporaryDirectory() as tmp_builddir:
                conf.build_folder = tmp_builddir
                main(conf)
        else:
            main(conf)

    else:
        raise NotImplementedError("command not implemented : %s" % args.command)
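
# Typical invocations (illustrative):
#
#   python3 msdn-to-docset.py create_docset -o MSDN.tgz
#   python3 msdn-to-docset.py -vv create_docset --temporary
#   python3 msdn-to-docset.py rewrite_html page.html page.fixed.html ./_build_msdn/_2_html_rewrite
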
help="set output filepath" 1006 | ) 1007 | 1008 | parser_rewrite.add_argument("html_root_dir", 1009 | help="set html_root_dir filepath" 1010 | ) 1011 | 1012 | args = parser.parse_args() 1013 | if args.verbose: 1014 | logging.basicConfig(level=logging.DEBUG) 1015 | logging.getLogger("requests").setLevel(logging.WARNING) 1016 | logging.getLogger("urllib3").setLevel(logging.WARNING) 1017 | else: 1018 | logging.basicConfig(level=logging.INFO) 1019 | 1020 | 1021 | if args.command == "rewrite_html": 1022 | 1023 | conf = Configuration( args ) 1024 | 1025 | # Read content and parse html 1026 | with open(args.input, 'r', encoding='utf8') as i_fd: 1027 | html_content = i_fd.read() 1028 | 1029 | soup = bs(html_content, 'html.parser') 1030 | 1031 | # rewrite html 1032 | soup, resources = rewrite_soup(conf, soup, args.input, args.html_root_dir) 1033 | 1034 | # Export fixed html 1035 | fixed_html = soup.prettify("utf-8") 1036 | with open(args.output, 'wb') as o_fd: 1037 | o_fd.write(fixed_html) 1038 | 1039 | elif args.command == "create_docset": 1040 | conf = Configuration( args ) 1041 | 1042 | if args.temporary: 1043 | 1044 | with tempfile.TemporaryDirectory() as tmp_builddir: 1045 | conf.build_folder = tmp_builddir 1046 | main(conf) 1047 | else: 1048 | main(conf) 1049 | 1050 | else: 1051 | raise NotImplementedError("command not implemented %s" % args.command) 1052 | -------------------------------------------------------------------------------- /static/DASH_LICENSE: -------------------------------------------------------------------------------- 1 | You are not allowed to distribute or make use of any of the files within this folder ("Resources") without written permission from Kapeli or whilst using the Dash app developed by Kapeli. This does not apply to the files located within the "Documents" folder. -------------------------------------------------------------------------------- /static/Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CFBundleIdentifier 6 | msdn 7 | 8 | CFBundleName 9 | MSDN 10 | 11 | DashDocSetFallbackURL 12 | https://docs.microsoft.com/win32/ 13 | 14 | dashIndexFilePath 15 | docs.microsoft.com/win32/index.html 16 | 17 | DashDocSetFamily 18 | msdn 19 | 20 | DocSetPlatformFamily 21 | msdn 22 | 23 | isDashDocset 24 | 25 | 26 | isJavaScriptEnabled 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /static/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasg/msdn-docset/188d087f030a92f2c13b0dfd8df477cfef7e6876/static/icon.png -------------------------------------------------------------------------------- /static/icon@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasg/msdn-docset/188d087f030a92f2c13b0dfd8df477cfef7e6876/static/icon@2x.png --------------------------------------------------------------------------------