├── msdn-to-docset.py
└── static
    ├── DASH_LICENSE
    ├── Info.plist
    ├── icon.png
    └── icon@2x.png

/msdn-to-docset.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import sqlite3
import os
import sys
import glob
import re
import shutil
import logging
import json
import tarfile
import tempfile
import argparse
import urllib.parse
import urllib.error
import time
import collections
import zipfile

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup as bs  # pip install beautifulsoup4
from selenium import webdriver
# from selenium.webdriver import Firefox
# from selenium.webdriver.firefox.options import Options
# from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

from selenium.webdriver.chrome.options import Options


class PoshWebDriver:
    """ Thin wrapper around the selenium webdriver for page content retrieval """

    def __init__(self, executable_path=None):

        self.options = Options()
        self.options.add_argument("--headless")
        self.options.add_argument("--window-size=1920x1080")

        self.driver = webdriver.Chrome(options=self.options)

    def get_url_page(self, url):
        """ Retrieve the full html content of a page after Javascript execution """

        index_html = None
        try:
            self.driver.get(url)
            index_html = self.driver.page_source
        except (ConnectionResetError, urllib.error.URLError):
            # we may have triggered an anti-scraping time ban:
            # lay low for a couple of seconds and get back to it.
            self.driver.quit()
            time.sleep(2)

            self.driver = webdriver.Chrome(options=self.options)
            index_html = None

        # try a second time, and let the error propagate on failure
        if not index_html:
            self.driver.get(url)
            index_html = self.driver.page_source

        return index_html

    def quit(self):
        return self.driver.quit()
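
# Illustrative usage of the wrapper above (not part of the build pipeline);
# assumes a chromedriver binary compatible with the local Chrome is on PATH:
#
#   driver = PoshWebDriver()
#   html = driver.get_url_page("https://docs.microsoft.com/en-us/windows/win32/api/")
#   driver.quit()
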
class Configuration:

    # STATIC CONSTANTS
    docset_name = 'MSDN'

    domain = "docs.microsoft.com"
    default_theme_uri = "_themes/docs.theme/master/en-us/_themes"

    def __init__(self, args):

        # # selected powershell api version
        # self.powershell_version = args.version

        # # The modules and cmdlets pages are "versioned" using additional params in the GET request
        # self.powershell_version_param = "view=powershell-{0:s}".format(self.powershell_version)

        # build folder (must be cleaned afterwards)
        self.build_folder = os.path.join(os.getcwd(), "_build_msdn")

        # output file
        self.output_filepath = os.path.realpath(args.output)

        # win32 docs start pages
        self.api_index_url = "https://docs.microsoft.com/en-us/windows/win32/api/"
        self.docs_index_url = "https://docs.microsoft.com/en-us/windows/win32/desktop-app-technologies"

        # # powershell docs table of contents url
        # self.docs_toc_url = "https://{0:s}/psdocs/toc.json?{2:s}".format(
        #     Configuration.base_url,
        #     self.powershell_version,
        #     self.powershell_version_param
        # )

        # self.windows_toc_url = "https://{0:s}/windowsserver2019-ps/toc.json?view=windowsserver2019-ps".format(
        #     Configuration.base_url
        # )

        # selenium webdriver
        self.webdriver = PoshWebDriver()

        self.crawl_contents = True

        # selected modules
        # self.filter_modules = [module.lower() for module in args.modules]


# Global session with retries on transient server errors.
# Both schemes are mounted; the original only mounted 'http://', which meant
# the retry policy never applied to the https:// urls actually crawled.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))


def download_binary(url, output_filename):
    """ Download GET request as binary file """
    global session

    logging.debug("download_binary : %s -> %s" % (url, output_filename))

    # ensure the folder path actually exists
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)

    r = session.get(url, stream=True)
    with open(output_filename, 'wb') as f:
        for data in r.iter_content(32 * 1024):
            f.write(data)


def download_textfile(url: str, output_filename: str, params: dict = None):
    """ Download GET request as utf-8 text file """
    global session

    logging.debug("download_textfile : %s -> %s" % (url, output_filename))

    # ensure the folder path actually exists
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)

    while True:
        try:
            # params are query-string parameters (a GET request has no body)
            r = session.get(url, params=params)
        except ConnectionError:
            logging.debug("caught ConnectionError, retrying...")
            time.sleep(2)
        else:
            break

    # do not write 404 pages on disk
    if r.status_code != 200:
        return False

    r.encoding = 'utf-8'
    with open(output_filename, 'w', encoding="utf-8") as f:
        f.write(r.text)

    return True
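
# Illustrative call (hypothetical filename and query parameter): fetch a page
# with an extra parameter appended to the GET request's query string.
#
#   download_textfile(
#       "https://docs.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-createfilea",
#       "_build_msdn/createfilea.html",
#       params={"view": "sdk-api"},
#   )
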
def make_docset(source_dir, dst_filepath, filename):
    """
    Tar-gz the build directory while conserving the relative folder tree paths.
    Copied from : https://stackoverflow.com/a/17081026/1741450
    """
    dst_dir = os.path.dirname(dst_filepath)
    tar_filepath = os.path.join(dst_dir, '%s.tar' % filename)

    with tarfile.open(tar_filepath, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))

    shutil.move(tar_filepath, dst_filepath)


# NOTE: download_page_contents and download_module_contents are leftovers from
# the PowerShell docset scraper this script was derived from; they reference
# Configuration attributes that are commented out above and are never called.

def download_page_contents(configuration, uri, output_filepath):
    """ Download a page using its URI from the TOC """

    # Resolve the "absolute" url and use the appropriate version
    full_url = urllib.parse.urljoin(configuration.docs_toc_url, uri)
    versioned_url = "{0:s}?{1:s}".format(full_url, configuration.powershell_version_param)

    download_textfile(versioned_url, output_filepath)


def download_module_contents(configuration, module_name, module_uri, module_dir, cmdlets, root_dir):
    """ Download a module's contents """

    module_filepath = os.path.join(module_dir, "%s.html" % module_name)

    logging.debug("downloading %s module index page -> %s" % (module_name, module_filepath))
    if module_uri:
        download_page_contents(configuration, module_uri, module_filepath)

    cmdlets_infos = []

    # Downloading cmdlet contents
    for cmdlet in cmdlets:

        cmdlet_name = cmdlet['toc_title']
        if cmdlet_name.lower() in ("about", "functions", "providers", "provider"):  # skip special toc entries
            continue

        cmdlet_uri = cmdlet["href"]
        cmdlet_filepath = os.path.join(module_dir, "%s.html" % cmdlet_name)

        logging.debug("downloading %s cmdlet doc -> %s" % (cmdlet_name, cmdlet_filepath))
        download_page_contents(configuration, cmdlet_uri, cmdlet_filepath)

        cmdlets_infos.append({
            'name': cmdlet_name,
            'path': os.path.relpath(cmdlet_filepath, root_dir),
        })

    module_infos = {
        'name': module_name,
        'index': os.path.relpath(module_filepath, root_dir),
        'cmdlets': cmdlets_infos
    }

    return module_infos


def _findname(obj, key):
    """ Return the 'toc_title' value associated to a 'href' node """
    if obj.get('href', None) == key:
        return obj['toc_title']
    for k, v in obj.items():
        if isinstance(v, dict):
            item = _findname(v, key)
            if item is not None:
                return item
        if isinstance(v, list):
            for i in v:
                item = _findname(i, key)
                if item is not None:
                    return item
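
# _findname above walks an arbitrarily nested toc.json fragment. The shape
# below is an illustrative, simplified example of what docs.microsoft.com
# returns:
#
#   {"items": [{"toc_title": "File API",
#               "href": "/windows/win32/api/fileapi/",
#               "children": [{"toc_title": "CreateFileA",
#                             "href": "/windows/win32/api/fileapi/nf-fileapi-createfilea"}]}]}
#
# so _findname(toc["items"][0], ".../nf-fileapi-createfilea") == "CreateFileA".
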
def crawl_sdk_api_folder(configuration: Configuration, download_dir: str, source_dir: str, directory: str, api_content_toc: dict):

    for markdown_filepath in glob.glob(os.path.join(source_dir, directory, "*.md")):

        page_filename, page_ext = os.path.splitext(os.path.basename(markdown_filepath))
        realarb = os.path.relpath(os.path.dirname(markdown_filepath), source_dir)

        # already processed
        if page_filename == "index":
            continue

        url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}/{1:s}".format(realarb, page_filename)
        filepath = os.path.join(download_dir, "docs.microsoft.com/en-us/windows/win32/api/{0:s}/{1:s}.html".format(realarb, page_filename))
        logging.info("[+] download page %s -> %s " % (url, filepath))
        success = download_textfile(url, filepath)

        if not success:
            logging.info("[X] could not download page %s -> %s " % (url, filepath))
            continue

        url_relpath = "/windows/win32/api/{0:s}/{1:s}".format(realarb, page_filename)
        page_title = _findname(api_content_toc['toc'][directory]['items'][0], url_relpath)

        # sdk-api page names encode their kind in a two-letter prefix
        if page_filename.startswith("nc-"):
            category = "callbacks"
        elif page_filename.startswith("ne-"):
            category = "enums"
        elif page_filename.startswith("nf-"):
            category = "functions"
        elif page_filename.startswith("nn-"):
            category = "interfaces"
        elif page_filename.startswith("ns-"):
            category = "structures"
        elif page_filename.startswith("nl-"):
            category = "classes"
        else:
            category = "entries"

        api_content_toc[category].append({
            'name': page_title,
            'path': "docs.microsoft.com/en-us{0:s}.html".format(url_relpath),
        })

    return api_content_toc
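
# For instance, the markdown source "fileapi/nf-fileapi-createfilea.md" maps to
# the page ".../windows/win32/api/fileapi/nf-fileapi-createfilea" and is
# indexed under the "functions" category because of its "nf-" prefix.
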
def crawl_sdk_api_contents(configuration: Configuration, download_dir: str, source_dir: str):
    """ Download sdk-api entries based on TOC """

    api_content_toc = {
        'categories': [],
        'files': [],
        'callbacks': [],
        'functions': [],
        'enums': [],
        'interfaces': [],
        'structures': [],
        'classes': [],

        'entries': [],
        'toc': {}
    }

    content_dir = os.path.join(source_dir, "sdk-api-docs", "sdk-api-src", "content")

    for directory in os.listdir(content_dir):

        # download toc for directory
        toc_url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}/toc.json".format(directory)
        logging.info("[+] download toc for directory %s" % (toc_url))
        toc_r = requests.get(toc_url)
        if toc_r.status_code == 200:
            # reuse the response instead of issuing a second identical request
            api_content_toc['toc'][directory] = json.loads(toc_r.text)
        else:
            logging.warning("[!] no TOC found at %s !" % (toc_url))

        # only index folders with a toc
        if not api_content_toc['toc'].get(directory, None):
            continue

        # "meta" directory
        if directory.startswith("_"):

            url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory)
            filepath = os.path.join(download_dir, "docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html")
            logging.info("[+] download page %s -> %s " % (url, filepath))
            download_textfile(url, filepath)

            category_title = api_content_toc['toc'][directory]['items'][0]['toc_title']
            api_content_toc['categories'].append({
                'name': category_title,
                'path': os.path.join("docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html"),
            })

        # directory generated from a header file
        else:

            url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory)
            filepath = os.path.join(download_dir, "docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html")
            logging.info("[+] download page %s -> %s " % (url, filepath))
            download_textfile(url, filepath)

            category_title = directory
            if api_content_toc['toc'].get(directory, None):
                category_title = api_content_toc['toc'][directory]['items'][0]['toc_title']

            api_content_toc['files'].append({
                'name': category_title,
                'path': os.path.join("docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html"),
            })

        api_content_toc = crawl_sdk_api_folder(configuration, download_dir, content_dir, directory, api_content_toc)

    return api_content_toc
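
# The resulting api_content_toc maps each crawled page to a Dash-indexable
# record, e.g. (illustrative):
#
#   api_content_toc["functions"] == [
#       {"name": "CreateFileA",
#        "path": "docs.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-createfilea.html"},
#       ...,
#   ]
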
def crawl_msdn_contents(configuration: Configuration, download_dir: str, source_dir: str):
    """ Download MSDN modules and content pages based on TOC """

    content_toc = {
        'attributes': [],
        'classes': [],
        'entries': [],
        'guides': [],
        'toc': {},
    }

    desktop_src_dir = os.path.join(source_dir, "win32-docs", "desktop-src")
    for r, d, f in os.walk(desktop_src_dir, topdown=True):

        # copy images referenced by the documentation pages
        for image_file in filter(lambda s: os.path.splitext(s)[1] in [".png", ".jpg", ".jpeg"], f):
            realarb = os.path.relpath(r, desktop_src_dir)
            image_dir = os.path.join(download_dir, "docs.microsoft.com/win32", realarb)
            filepath = os.path.join(image_dir, image_file)

            os.makedirs(image_dir, exist_ok=True)
            shutil.copyfile(os.path.join(r, image_file), filepath)

        for markdown_file in filter(lambda s: os.path.splitext(s)[1] == ".md", f):
            page_filename, page_ext = os.path.splitext(markdown_file)

            realarb = os.path.relpath(r, desktop_src_dir)
            url = "https://docs.microsoft.com/en-us/windows/win32/{0:s}/{1:s}".format(
                realarb,
                page_filename
            )

            # retrieve html of page
            page_dir = os.path.join(download_dir, "docs.microsoft.com/win32", realarb)
            filepath = os.path.join(page_dir, "%s.html" % page_filename)
            logging.debug("[+] download page %s -> %s " % (url, filepath))
            download_textfile(url, filepath)

            # don't care about top level pages
            if realarb == '.':
                continue

            # First time navigating in this directory
            if realarb not in content_toc['toc'].keys():

                # download toc for page
                toc_url = "https://docs.microsoft.com/en-us/windows/win32/{0:s}/toc.json".format(realarb)
                logging.info("[+] download toc for page %s" % (toc_url))

                toc_r = requests.get(toc_url)
                if toc_r.status_code != 200:

                    # Could not find a toc for this folder
                    content_toc['toc'][realarb] = {
                        'toc': {'items': [{}]}
                    }

                    content_toc['guides'].append({
                        'name': page_filename,
                        'path': os.path.join(os.path.relpath(page_dir, download_dir), "%s.html" % page_filename),
                    })

                else:
                    # reuse the response instead of issuing a second identical request
                    component_toc = json.loads(toc_r.text)

                    component_title = component_toc['items'][0]['toc_title']
                    component_href = component_toc['items'][0]['href']

                    content_toc['toc'][realarb] = {
                        'toc': component_toc
                    }

                    content_toc['guides'].append({
                        'name': component_title,
                        'path': os.path.join(os.path.relpath(page_dir, download_dir), "%s.html" % component_href),
                    })

            # Adding current page to content toc

            # Class page
            if "ADSchema" in realarb and page_filename.startswith("c-"):
                logging.info("[+] new class page %s" % (page_filename))

                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['classes'].append({
                    'name': page_title,
                    'path': os.path.relpath(filepath, download_dir),
                })

            # Attribute page
            elif "ADSchema" in realarb and page_filename.startswith("a-"):
                logging.debug("[+] new attribute page %s" % (page_filename))

                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['attributes'].append({
                    'name': page_title,
                    'path': os.path.relpath(filepath, download_dir),
                })

            # Generic entry
            else:
                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['entries'].append({
                    'name': page_title,
                    'path': os.path.relpath(filepath, download_dir),
                })

    return content_toc
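
# The rewriting below turns docs.microsoft.com absolute links (e.g.
# "/en-us/windows/win32/api/...") into relative "....html" links so that
# navigation keeps working inside the offline docset; links it cannot map are
# deliberately left untouched, to keep broken urls visible.
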
def rewrite_soup(configuration: Configuration, soup, html_path: str, documents_dir: str):
    """ Rewrite html contents by fixing links and removing unnecessary cruft """

    # Fix navigation links
    links = soup.findAll("a", {"data-linktype": "relative-path"})  # for modules and cmdlet pages
    link_pattern = re.compile(r"([\w\.\/-]+)")

    for link in links:

        href = link['href']
        fixed_href = href

        # go to a relative page
        targets = link_pattern.findall(href)
        if not len(targets):  # badly formatted 'a' link
            continue

        page_target = targets[0]
        if page_target[-1] == '/':  # module index
            fixed_href = "%sindex.html" % page_target
        else:
            fixed_href = "%s.html" % page_target

        if fixed_href != href:
            logging.info("link rewrite : %s -> %s " % (href, fixed_href))
            link['href'] = fixed_href

    # rewrite or remove links to external references
    for abs_href in soup.findAll("a", {"data-linktype": "absolute-path"}):

        # some external hrefs point from win32 pages to api pages, e.g. IActivationFactory
        if abs_href['href'].startswith("/en-us/windows/win32/api/"):

            # remove the leading /
            prefix, *abs_suffix = abs_href['href'].split("/")

            # strip .html if it exists
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", *abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            if rel_href[-1] == '/':  # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"

        # some external hrefs use the legacy desktop api path, e.g. DefineDosDevice
        elif abs_href['href'].startswith("/en-us/windows/desktop/api/"):

            # rewrite /en-us/windows/desktop/api to /en-us/windows/win32/api
            prefix, abs_suffix = abs_href['href'].split("/en-us/windows/desktop/api/")

            # strip .html if it exists
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", "en-us", "windows", "win32", "api", abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            if rel_href[-1] == '/':  # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"

        # some external hrefs use the legacy desktop path, e.g. Inspect
        elif abs_href['href'].startswith("/en-us/windows/desktop/"):

            # rewrite /en-us/windows/desktop to /win32/
            prefix, abs_suffix = abs_href['href'].split("/en-us/windows/desktop/")

            # strip .html if it exists
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", "win32", abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            if rel_href[-1] == '/':  # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"

        # other docs.microsoft.com links, e.g. UISettings.TextScaleFactorChanged Event:
        # point them at the online documentation
        elif abs_href['href'].startswith("/en-us/"):
            full_url_target = "https://docs.microsoft.com" + abs_href['href']
            abs_href['href'] = full_url_target

        # leave every other absolute linktype alone since we don't know how to handle it
        else:
            # TODO : currently we don't replace it, in order to keep broken urls visible
            # abs_href.replace_with(abs_href.text)
            pass

    # remove unsupported nav elements
    nav_elements = [
        ["nav", {"class": "doc-outline", "role": "navigation"}],
        ["ul", {"class": "breadcrumbs", "role": "navigation"}],
        ["div", {"class": "sidebar", "role": "navigation"}],
        ["div", {"class": "dropdown dropdown-full mobilenavi"}],
        ["p", {"class": "api-browser-description"}],
        ["div", {"class": "api-browser-search-field-container"}],
        ["div", {"class": "pageActions"}],
        ["div", {"class": "container footerContainer"}],
        ["div", {"class": "dropdown-container"}],
        ["div", {"class": "binary-rating-buttons"}],
        ["ul", {"class": "metadata page-metadata"}],
        ["div", {"data-bi-name": "pageactions"}],
        ["div", {"class": "page-action-holder"}],
        ["div", {"class": "header-holder"}],
        ["footer", {"data-bi-name": "footer", "id": "footer"}],
        ["div", {"class": "binary-rating-holder"}],
        ["div", {"id": "left-container"}],
    ]

    for nav in nav_elements:
        nav_class, nav_attr = nav

        for nav_tag in soup.findAll(nav_class, nav_attr):
            _ = nav_tag.extract()

    # remove script elems
    for head_script in soup.head.findAll("script"):
        _ = head_script.extract()

    # Extract and rewrite additional stylesheets to download
    ThemeResourceRecord = collections.namedtuple('ThemeResourceRecord', 'url, path')

    theme_output_dir = os.path.join(documents_dir, Configuration.domain)
    theme_resources = []

    for link in soup.head.findAll("link", {"rel": "stylesheet"}):
        uri_path = link['href'].strip()

        if not uri_path.lstrip('/').startswith(Configuration.default_theme_uri):
            continue

        # Construct (url, path) tuple
        css_url = "https://%s/%s" % (Configuration.domain, uri_path)
        css_filepath = os.path.join(theme_output_dir, uri_path.lstrip('/'))

        # Convert href to a relative link
        path = os.path.relpath(css_filepath, os.path.dirname(html_path))
        rel_uri = '/'.join(path.split(os.sep))
        link['href'] = rel_uri

        theme_resources.append(ThemeResourceRecord(
            url=css_url,
            path=os.path.relpath(css_filepath, documents_dir),  # stored as relative path
        ))

    return soup, set(theme_resources)
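
# Stylesheets hosted under the shared docs theme, e.g. (illustrative path)
# "/_themes/docs.theme/master/en-us/_themes/styles/site.css", are queued for
# download by rewrite_soup above and their <link> hrefs rewritten to the local
# copy, so pages keep their styling offline.
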
def rewrite_html_contents(configuration: Configuration, html_root_dir: str):
    """ Rewrite every html file downloaded """

    additional_resources = set()

    for html_file in glob.glob("%s/**/*.html" % html_root_dir, recursive=True):

        logging.info("rewrite html_file : %s" % (html_file))

        # Read content and parse html
        with open(html_file, 'r', encoding='utf8') as i_fd:
            html_content = i_fd.read()

        soup = bs(html_content, 'html.parser')

        # rewrite html
        soup, resources = rewrite_soup(configuration, soup, html_file, html_root_dir)
        additional_resources = additional_resources.union(resources)

        # Export fixed html
        fixed_html = soup.prettify("utf-8")
        with open(html_file, 'wb') as o_fd:
            o_fd.write(fixed_html)

    return additional_resources


def download_additional_resources(configuration: Configuration, documents_dir: str, resources_to_dl: set = set()):
    """ Download optional resources for "beautification" """

    for resource in resources_to_dl:

        download_textfile(
            resource.url,
            os.path.join(documents_dir, resource.path)
        )

    # Copy the docset index start page
    src_index_filepath = os.path.join(documents_dir, Configuration.domain, "win32", "desktop-app-technologies.html")
    index_filepath = os.path.join(documents_dir, Configuration.domain, "win32", "index.html")
    shutil.copy(src_index_filepath, index_filepath)
def create_sqlite_database(configuration, content_toc, resources_dir, documents_dir):
    """ Index the html documents in a format Dash can understand """

    def insert_into_sqlite_db(cursor, name, record_type, path):
        """ Insert a new unique record in the sqlite database. """
        try:
            cursor.execute('SELECT rowid FROM searchIndex WHERE path = ?', (path,))
            dbpath = cursor.fetchone()
            cursor.execute('SELECT rowid FROM searchIndex WHERE name = ?', (name,))
            dbname = cursor.fetchone()

            if dbpath is None and dbname is None:
                cursor.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, record_type, path))
                logging.debug('DB add [%s] >> name: %s, path: %s' % (record_type, name, path))
            else:
                logging.debug('record exists')

        except sqlite3.Error as e:
            # do not abort the whole indexing run on a single bad record
            logging.debug('DB error : %s' % e)

    sqlite_filepath = os.path.join(resources_dir, "docSet.dsidx")
    if os.path.exists(sqlite_filepath):
        os.remove(sqlite_filepath)

    db = sqlite3.connect(sqlite_filepath)
    cur = db.cursor()
    cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
    cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

    # Map content toc categories to Dash entry types
    mapping = {
        # win32 content
        "guides": "Guide",
        "attributes": "Attribute",
        "classes": "Class",
        "entries": "Entry",

        # sdk-api content
        "categories": "Category",
        "files": "File",

        'callbacks': "Callback",
        'functions': "Function",
        'enums': "Enum",
        'interfaces': "Interface",
        'structures': "Structure",
    }

    for key in mapping.keys():

        for _value in content_toc.get(key, []):

            # paths should be unix compliant
            value_path = _value['path'].replace(os.sep, '/')
            insert_into_sqlite_db(cur, _value['name'], mapping[key], value_path)

    # commit and close db
    db.commit()
    db.close()
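
# A populated index can be sanity-checked from the command line (illustrative):
#
#   sqlite3 MSDN.docset/Contents/Resources/docSet.dsidx \
#       "SELECT name, type, path FROM searchIndex WHERE name LIKE 'CreateFile%' LIMIT 5"
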
def copy_folder(src_folder: str, dst_folder: str):
    """ Copy a full folder tree anew every time """

    def onerror(func, path, exc_info):
        """
        Error handler for ``shutil.rmtree``.

        If the error is due to an access error (read only file)
        it attempts to add write permission and then retries.

        If the error is for another reason it re-raises the error.

        Usage : ``shutil.rmtree(path, onerror=onerror)``
        """
        import stat

        if not os.path.exists(path):
            return

        if not os.access(path, os.W_OK):
            # Is the error an access error ?
            os.chmod(path, stat.S_IWUSR)
            func(path)
        else:
            raise

    shutil.rmtree(dst_folder, ignore_errors=False, onerror=onerror)
    shutil.copytree(src_folder, dst_folder)


def merge_folders(src, dst):
    """ Recursively merge the src tree into dst, overwriting files """

    if os.path.isdir(src):

        if not os.path.exists(dst):
            os.makedirs(dst)

        for name in os.listdir(src):
            merge_folders(
                os.path.join(src, name),
                os.path.join(dst, name)
            )
    else:
        shutil.copyfile(src, dst)


def main(configuration: Configuration):

    # """ Scheme for content toc :
    # {
    #     module_name : {
    #         'name' : str,
    #         'index' : relative path,
    #         'entries' : [
    #             {
    #                 'name' : str,
    #                 'path' : relative path,
    #             },
    #             ...
    #         ]
    #     },
    #     ...
    # }
    # """
    content_toc = {}
    resources_to_dl = set()

    """ 0. Prepare folders """
    source_dir = os.path.join(configuration.build_folder, "_0_win32_source")
    api_source_dir = os.path.join(configuration.build_folder, "_0_api_sdk_source")

    download_dir = os.path.join(configuration.build_folder, "_1_downloaded_contents")
    html_rewrite_dir = os.path.join(configuration.build_folder, "_2_html_rewrite")
    additional_resources_dir = os.path.join(configuration.build_folder, "_3_additional_resources")
    package_dir = os.path.join(configuration.build_folder, "_4_ready_to_be_packaged")

    for folder in [source_dir, api_source_dir, download_dir, html_rewrite_dir, additional_resources_dir, package_dir]:
        os.makedirs(folder, exist_ok=True)

    # _4_ready_to_be_packaged is the final build dir
    docset_dir = os.path.join(package_dir, "%s.docset" % Configuration.docset_name)
    content_dir = os.path.join(docset_dir, "Contents")
    resources_dir = os.path.join(content_dir, "Resources")
    document_dir = os.path.join(resources_dir, "Documents")

    if configuration.crawl_contents:
        # download the source repositories to drive the scraping : extremely long operation
        logging.info("Downloading win32 markdown zipped sources : %s -> %s" % ("https://github.com/MicrosoftDocs/win32/archive/refs/heads/docs.zip", os.path.join(source_dir, "docs.zip")))
        download_binary("https://github.com/MicrosoftDocs/win32/archive/refs/heads/docs.zip", os.path.join(source_dir, "docs.zip"))

        logging.info("Extracting win32 markdown zipped sources")
        with zipfile.ZipFile(os.path.join(source_dir, "docs.zip"), 'r') as zip_ref:
            zip_ref.extractall(source_dir)

        logging.info("Downloading sdk-api markdown zipped sources : %s -> %s" % ("https://github.com/MicrosoftDocs/sdk-api/archive/refs/heads/docs.zip", os.path.join(api_source_dir, "docs.zip")))
        download_binary("https://github.com/MicrosoftDocs/sdk-api/archive/refs/heads/docs.zip", os.path.join(api_source_dir, "docs.zip"))

        logging.info("Extracting sdk-api markdown zipped sources")
        with zipfile.ZipFile(os.path.join(api_source_dir, "docs.zip"), 'r') as zip_ref:
            zip_ref.extractall(api_source_dir)

        """ 1. Download html pages """
        logging.info("[1] scraping win32 web contents")
        content_toc = crawl_msdn_contents(configuration, download_dir, source_dir)

        logging.info("[1] scraping sdk-api web contents")
        api_content_toc = crawl_sdk_api_contents(configuration, download_dir, api_source_dir)

        # Merge win32 and sdk-api content tocs
        content_toc.update(api_content_toc)
        with open(os.path.join(download_dir, "toc.json"), "w") as content:
            json.dump(content_toc, content)
    else:
        with open(os.path.join(download_dir, "toc.json"), "r") as content:
            content_toc = json.load(content)

    """ 2. Parse and rewrite html contents """
    logging.info("[2] rewriting urls and hrefs")
    copy_folder(download_dir, html_rewrite_dir)
    resources_to_dl = rewrite_html_contents(configuration, html_rewrite_dir)

    """ 3. Download additional resources """
    logging.info("[3] download style contents")
    copy_folder(html_rewrite_dir, additional_resources_dir)
    download_additional_resources(configuration, additional_resources_dir, resources_to_dl)

    """ 4. Database indexing """
    logging.info("[4] indexing to database")
    copy_folder(additional_resources_dir, document_dir)
    create_sqlite_database(configuration, content_toc, resources_dir, document_dir)

    """ 5. Archive packaging """
    src_dir = os.path.dirname(__file__)
    shutil.copy(os.path.join(src_dir, "static/Info.plist"), content_dir)
    shutil.copy(os.path.join(src_dir, "static/DASH_LICENSE"), os.path.join(resources_dir, "LICENSE"))
    shutil.copy(os.path.join(src_dir, "static/icon.png"), docset_dir)
    shutil.copy(os.path.join(src_dir, "static/icon@2x.png"), docset_dir)

    output_dir = os.path.dirname(configuration.output_filepath)
    os.makedirs(output_dir, exist_ok=True)

    logging.info("[5] packaging as a dash docset")
    make_docset(
        docset_dir,
        configuration.output_filepath,
        Configuration.docset_name
    )
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description="Dash docset creation script for MSDN's Win32 API"
    )

    parser.add_argument("-vv", "--verbose",
        help="increase output verbosity",
        action="store_true"
    )

    subparsers = parser.add_subparsers(help='sub-command help', dest='command')

    parser_create = subparsers.add_parser('create_docset', help='scrape the web in order to create a docset')
    parser_create.add_argument("-t", "--temporary",
        help="Use a temporary directory for creating the docset, otherwise use the current dir.",
        default=False,
        action="store_true"
    )

    parser_create.add_argument("-o", "--output",
        help="set output filepath",
        default=os.path.join(os.getcwd(), "MSDN.tgz"),
    )

    parser_create.add_argument("-s", "--sampling",
        help="generate only a 'sample' docset, in order to test whether the rewriting rules are correct",
        default=False,
        action="store_true"
    )

    parser_rewrite = subparsers.add_parser('rewrite_html', help='rewrite an html file in order to test rules')

    parser_rewrite.add_argument("input",
        help="set input filepath"
    )

    parser_rewrite.add_argument("output",
        help="set output filepath"
    )

    parser_rewrite.add_argument("html_root_dir",
        help="set html_root_dir filepath"
    )

    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

    if args.command == "rewrite_html":

        conf = Configuration(args)

        # Read content and parse html
        with open(args.input, 'r', encoding='utf8') as i_fd:
            html_content = i_fd.read()

        soup = bs(html_content, 'html.parser')

        # rewrite html
        soup, resources = rewrite_soup(conf, soup, args.input, args.html_root_dir)

        # Export fixed html
        fixed_html = soup.prettify("utf-8")
        with open(args.output, 'wb') as o_fd:
            o_fd.write(fixed_html)

    elif args.command == "create_docset":
        conf = Configuration(args)

        if args.temporary:

            with tempfile.TemporaryDirectory() as tmp_builddir:
                conf.build_folder = tmp_builddir
                main(conf)
        else:
            main(conf)

    else:
        raise NotImplementedError("command not implemented : %s" % args.command)
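
# Typical invocations (illustrative):
#
#   python3 msdn-to-docset.py create_docset -o MSDN.tgz
#   python3 msdn-to-docset.py -vv create_docset --temporary
#   python3 msdn-to-docset.py rewrite_html page.html page.fixed.html ./_build_msdn/_2_html_rewrite
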
help="set output filepath" 1006 | ) 1007 | 1008 | parser_rewrite.add_argument("html_root_dir", 1009 | help="set html_root_dir filepath" 1010 | ) 1011 | 1012 | args = parser.parse_args() 1013 | if args.verbose: 1014 | logging.basicConfig(level=logging.DEBUG) 1015 | logging.getLogger("requests").setLevel(logging.WARNING) 1016 | logging.getLogger("urllib3").setLevel(logging.WARNING) 1017 | else: 1018 | logging.basicConfig(level=logging.INFO) 1019 | 1020 | 1021 | if args.command == "rewrite_html": 1022 | 1023 | conf = Configuration( args ) 1024 | 1025 | # Read content and parse html 1026 | with open(args.input, 'r', encoding='utf8') as i_fd: 1027 | html_content = i_fd.read() 1028 | 1029 | soup = bs(html_content, 'html.parser') 1030 | 1031 | # rewrite html 1032 | soup, resources = rewrite_soup(conf, soup, args.input, args.html_root_dir) 1033 | 1034 | # Export fixed html 1035 | fixed_html = soup.prettify("utf-8") 1036 | with open(args.output, 'wb') as o_fd: 1037 | o_fd.write(fixed_html) 1038 | 1039 | elif args.command == "create_docset": 1040 | conf = Configuration( args ) 1041 | 1042 | if args.temporary: 1043 | 1044 | with tempfile.TemporaryDirectory() as tmp_builddir: 1045 | conf.build_folder = tmp_builddir 1046 | main(conf) 1047 | else: 1048 | main(conf) 1049 | 1050 | else: 1051 | raise NotImplementedError("command not implemented %s" % args.command) 1052 | -------------------------------------------------------------------------------- /static/DASH_LICENSE: -------------------------------------------------------------------------------- 1 | You are not allowed to distribute or make use of any of the files within this folder ("Resources") without written permission from Kapeli or whilst using the Dash app developed by Kapeli. This does not apply to the files located within the "Documents" folder. -------------------------------------------------------------------------------- /static/Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CFBundleIdentifier 6 | msdn 7 | 8 | CFBundleName 9 | MSDN 10 | 11 | DashDocSetFallbackURL 12 | https://docs.microsoft.com/win32/ 13 | 14 | dashIndexFilePath 15 | docs.microsoft.com/win32/index.html 16 | 17 | DashDocSetFamily 18 | msdn 19 | 20 | DocSetPlatformFamily 21 | msdn 22 | 23 | isDashDocset 24 | 25 | 26 | isJavaScriptEnabled 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /static/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasg/msdn-docset/188d087f030a92f2c13b0dfd8df477cfef7e6876/static/icon.png -------------------------------------------------------------------------------- /static/icon@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasg/msdn-docset/188d087f030a92f2c13b0dfd8df477cfef7e6876/static/icon@2x.png --------------------------------------------------------------------------------