├── msdn-to-docset.py
└── static
├── DASH_LICENSE
├── Info.plist
├── icon.png
└── icon@2x.png
/msdn-to-docset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sqlite3
4 | import os
5 | import sys
6 | import glob
7 | import re
8 | import shutil
9 | import logging
10 | import json
11 | import tarfile
12 | import tempfile
13 | import argparse
14 | import urllib.parse
15 | import urllib
16 | import time
17 | import collections
18 | import zipfile
19 |
20 | from ruamel.yaml import YAML
21 | import requests
22 | from requests.adapters import HTTPAdapter
23 | from requests.packages.urllib3.util.retry import Retry
24 | from requests.exceptions import ConnectionError
25 | from bs4 import BeautifulSoup as bs, Tag # pip install bs4
26 | from selenium import webdriver
27 | # from selenium.webdriver import Firefox
28 | # from selenium.webdriver.firefox.options import Options
29 | # from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
30 |
31 | from selenium.webdriver import Chrome
32 | from selenium.webdriver.chrome.options import Options
33 |
class PoshWebDriver:
    """ Thin wrapper for selenium webdriver for page content retrieval """

    def __init__(self, executable_path = None):
        """ Spawn a headless Chrome driver.

        executable_path: unused, kept for backward compatibility with callers.
        """
        self.options = Options()
        self.options.add_argument("--headless")
        self.options.add_argument("--window-size=1920x1080")

        self.driver = webdriver.Chrome(options=self.options)

    def get_url_page(self, url):
        """ retrieve the full html content of a page after Javascript execution """

        index_html = None
        try:
            self.driver.get(url)
            index_html = self.driver.page_source
        except (ConnectionResetError, urllib.error.URLError):
            # we may have triggered an anti-scraping time ban.
            # Lay low for several seconds, then restart a fresh driver.
            self.driver.quit()
            time.sleep(2)

            self.driver = webdriver.Chrome(options=self.options)
            index_html = None

        # try a second time, and let the error propagate if it fails again
        if not index_html:
            self.driver.get(url)
            index_html = self.driver.page_source

        return index_html

    def quit(self):
        # BUGFIX: 'self' was missing from the signature, so every call to
        # quit() raised (unbound 'self' / argument-count error).
        return self.driver.quit()
75 |
class Configuration:
    """ Build-time settings shared by the whole scraping pipeline. """

    # STATIC CONSTANTS
    docset_name = 'MSDN'

    domain = "docs.microsoft.com"
    default_theme_uri = "_themes/docs.theme/master/en-us/_themes"

    def __init__(self, args):
        """ Derive runtime paths and start urls from parsed command-line args. """

        # build folder (must be cleaned afterwards)
        self.build_folder = os.path.join(os.getcwd(), "_build_msdn")

        # output file
        self.output_filepath = os.path.realpath(args.output)

        # win32 api reference start page
        self.api_index_url = "https://docs.microsoft.com/en-us/windows/win32/api/"

        # win32 desktop technologies start page
        self.docs_index_url = "https://docs.microsoft.com/en-us/windows/win32/desktop-app-technologies"

        # selenium webdriver, used for javascript-rendered pages
        self.webdriver = PoshWebDriver()

        # when False, a previously downloaded tree + toc.json is reused
        self.crawl_contents = True
127 | # Global session for several retries
# Global session with automatic retries on transient server errors.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
# BUGFIX: the retry adapter was only mounted for 'http://', while every URL
# this script fetches is 'https://' — mount it on both schemes.
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
131 |
132 |
def download_binary(url, output_filename):
    """ Stream a GET request to disk as a binary file. """
    global session

    logging.debug("download_binary : %s -> %s" % (url, output_filename))

    # make sure the destination folder tree exists
    os.makedirs(os.path.dirname(output_filename), exist_ok = True)

    response = session.get(url, stream=True)
    with open(output_filename, 'wb') as out_fd:
        for chunk in response.iter_content(32*1024):
            out_fd.write(chunk)
146 |
def download_textfile(url : str , output_filename : str, params : dict = None):
    """ Download a GET request as an utf-8 text file.

    Returns True on success, False when the server did not answer 200
    (404 pages are deliberately not written to disk).
    """
    global session

    logging.debug("download_textfile : %s -> %s" % (url, output_filename))

    # ensure the folder path actually exist
    os.makedirs(os.path.dirname(output_filename), exist_ok = True)

    # retry forever on connection resets: the scraper must survive
    # transient network errors and anti-scraping bans
    while True:
        try:
            # BUGFIX: the optional query parameters were sent as the request
            # *body* (data=) of a GET; they belong in the query string.
            r = session.get(url, params = params)
        except ConnectionError:
            logging.debug("caught ConnectionError, retrying...")
            time.sleep(2)
        else:
            break

    # do not write 404 pages on disk
    if r.status_code != 200:
        return False

    r.encoding = 'utf-8'
    with open(output_filename, 'w', encoding="utf-8") as f:
        f.write(r.text)

    return True
174 |
175 |
def make_docset(source_dir, dst_filepath, filename):
    """
    Tar-gz the build directory while conserving the relative folder tree paths.
    Copied from : https://stackoverflow.com/a/17081026/1741450
    """
    staging_dir = os.path.dirname(dst_filepath)
    staging_tar = os.path.join(staging_dir, '%s.tar' % filename)

    # gzip-compressed tar, rooted at the source dir's basename
    with tarfile.open(staging_tar, "w:gz") as archive:
        archive.add(source_dir, arcname=os.path.basename(source_dir))

    shutil.move(staging_tar, dst_filepath)
188 |
189 |
190 |
def download_page_contents(configuration, uri, output_filepath):
    """ Download a page using it's uri from the TOC """
    # NOTE(review): this helper reads configuration.docs_toc_url and
    # configuration.powershell_version_param, both of which are commented out
    # in Configuration.__init__ — calling it as-is would raise AttributeError.
    # Presumably leftover from the powershell docset scraper this script was
    # derived from; confirm whether it (and download_module_contents) is dead code.

    # Resolving "absolute" url et use appropriate version
    full_url = urllib.parse.urljoin(configuration.docs_toc_url, uri)
    versionned_url = "{0:s}?{1:s}".format(full_url, configuration.powershell_version_param)

    download_textfile(versionned_url, output_filepath)
199 |
200 |
def download_module_contents(configuration, module_name, module_uri, module_dir, cmdlets, root_dir):
    """ Download one module's index page and all of its cmdlet pages.

    Returns a dict {'name', 'index', 'cmdlets'} with paths relative to root_dir.
    """

    module_filepath = os.path.join(module_dir, "%s.html" % module_name)

    logging.debug("downloading %s module index page -> %s" % (module_name, module_filepath))
    if module_uri:
        download_page_contents(configuration, module_uri, module_filepath)

    # special toc entries that do not map to an actual cmdlet page
    skipped_titles = ("about", "functions", "providers", "provider")

    cmdlets_infos = []
    for cmdlet in cmdlets:

        cmdlet_name = cmdlet['toc_title']
        if cmdlet_name.lower() in skipped_titles:
            continue

        cmdlet_filepath = os.path.join(module_dir, "%s.html" % cmdlet_name)
        logging.debug("downloading %s cmdlet doc -> %s" % (cmdlet_name, cmdlet_filepath))
        download_page_contents(configuration, cmdlet["href"], cmdlet_filepath)

        cmdlets_infos.append({
            'name' : cmdlet_name,
            'path' : os.path.relpath(cmdlet_filepath, root_dir),
        })

    return {
        'name' : module_name,
        'index' : os.path.relpath(module_filepath, root_dir),
        'cmdlets' : cmdlets_infos,
    }
237 |
238 | def _findname(obj, key):
239 | """ return the 'toc_title' value associated to a 'href' node """
240 | # print("%r == %s" % (obj.get('href', None), key))
241 | if obj.get('href', None)==key:return obj['toc_title']
242 | for k, v in obj.items():
243 | if isinstance(v,dict):
244 | item = _findname(v, key)
245 | if item is not None:
246 | return item
247 | if isinstance(v,list):
248 | for i in v:
249 | item = _findname(i, key)
250 | if item is not None:
251 | return item
252 |
def crawl_sdk_api_folder(configuration: Configuration, download_dir : str, source_dir: str, directory : str, api_content_toc : dict):
    """ Download every page of one sdk-api content folder and index it.

    Each markdown file maps to one docs.microsoft.com html page; the page is
    filed into a docset category based on its filename prefix. Returns the
    updated api_content_toc.
    """

    # filename prefix -> docset category
    prefix_to_category = (
        ("nc-", "callbacks"),
        ("ne-", "enums"),
        ("nf-", "functions"),
        ("nn-", "interfaces"),
        ("ns-", "structures"),
        ("nl-", "classes"),
    )

    for markdown_filepath in glob.glob(os.path.join(source_dir, directory, "*.md")):

        page_filename, _page_ext = os.path.splitext(os.path.basename(markdown_filepath))
        realarb = os.path.relpath(os.path.dirname(markdown_filepath), source_dir)

        # index pages are already processed by the caller
        if page_filename == "index":
            continue

        url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}/{1:s}".format(realarb, page_filename)
        filepath = os.path.join(download_dir, "docs.microsoft.com/en-us/windows/win32/api/{0:s}/{1:s}.html".format(realarb, page_filename))
        logging.info("[+] download page %s -> %s " % (url, filepath))

        if not download_textfile(url, filepath):
            logging.info("[X] could not download page %s -> %s " % (url, filepath))
            continue

        # recover the human-readable title from this folder's toc
        url_relpath = "/windows/win32/api/{0:s}/{1:s}".format(realarb, page_filename)
        page_title = _findname(api_content_toc['toc'][directory]['items'][0], url_relpath)

        category = "entries"
        for prefix, prefix_category in prefix_to_category:
            if page_filename.startswith(prefix):
                category = prefix_category
                break

        api_content_toc[category].append({
            'name' : page_title,
            'path' : "docs.microsoft.com/en-us{0:s}.html".format(url_relpath),
        })

    return api_content_toc
300 |
301 |
def crawl_sdk_api_contents(configuration: Configuration, download_dir : str, source_dir : str):
    """ Download sdk-api entries based on TOC.

    Walks every directory of the sdk-api markdown sources, downloads its
    toc.json and index page, then crawls all pages of the folder.
    Returns a dict of category -> [{'name', 'path'}] plus raw per-directory
    tocs under the 'toc' key.
    """

    api_content_toc = {
        'categories' : [],
        'files' : [],
        'callbacks' : [],
        'functions' : [],
        'enums' : [],
        'interfaces' : [],
        'structures' : [],
        'classes' : [],

        'entries' : [],
        'toc' : {}
    }

    content_dir = os.path.join(source_dir, "sdk-api-docs", "sdk-api-src", "content")

    for directory in os.listdir(content_dir):

        # download toc for directory
        toc_url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}/toc.json".format(directory)
        logging.info("[+] download toc for directory %s" % (toc_url))
        toc_r = requests.get(toc_url)
        if toc_r.status_code == 200:
            # BUGFIX: reuse the response already in hand instead of issuing a
            # second GET for the same url (wasteful, and racy if the server
            # answer changes between the two requests).
            api_content_toc['toc'][directory] = json.loads(toc_r.text)
        else:
            logging.warning("[!] directory %s has no TOC !" % (toc_url))

        # only index folders with a toc
        if not api_content_toc['toc'].get(directory, None):
            continue

        # every directory gets its index page downloaded the same way;
        # only the docset category ("meta" dirs vs file-generated dirs) differs
        url = "https://docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(
            directory,
        )
        filepath = os.path.join(download_dir, "docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html")
        logging.info("[+] download page %s -> %s " % (url, filepath))
        download_textfile(url, filepath)

        category_title = api_content_toc['toc'][directory]['items'][0]['toc_title']
        index_record = {
            'name' : category_title,
            'path' : os.path.join("docs.microsoft.com/en-us/windows/win32/api/{0:s}".format(directory), "index.html"),
        }

        if directory.startswith("_"):
            # "meta" directory
            api_content_toc['categories'].append(index_record)
        else:
            # directory generated from a file
            api_content_toc['files'].append(index_record)

        api_content_toc = crawl_sdk_api_folder(configuration, download_dir, content_dir, directory, api_content_toc)

    return api_content_toc
378 |
379 |
def crawl_msdn_contents(configuration: Configuration, download_dir : str, source_dir : str):
    """ Download MSDN modules and content pages based on TOC.

    Walks the win32 markdown sources, downloads the rendered html for every
    page (plus referenced images), fetches each folder's toc.json once, and
    files every page under 'guides', 'classes', 'attributes' or 'entries'.
    Returns the content toc dict.
    """

    content_toc = {
        'attributes' : [],
        'classes' : [],
        'entries' : [],
        'guides' : [],
        'toc' : {},
    }

    desktop_src_dir = os.path.join(source_dir, "win32-docs", "desktop-src")

    for r, d, f in os.walk(desktop_src_dir, topdown=True):

        # copy images referenced by the pages, as-is
        for image_file in filter(lambda s: os.path.splitext(s)[1] in [".png", ".jpg", ".jpeg"] ,f):
            realarb = os.path.relpath(r, desktop_src_dir)
            image_dir = os.path.join(download_dir, "docs.microsoft.com/win32", realarb)
            filepath = os.path.join(image_dir, image_file)

            os.makedirs(image_dir, exist_ok=True)
            shutil.copyfile(os.path.join(r, image_file), filepath)

        for markdown_file in filter(lambda s: os.path.splitext(s)[1] == ".md" ,f):
            page_filename, page_ext = os.path.splitext(markdown_file)

            realarb = os.path.relpath(r, desktop_src_dir)
            url = "https://docs.microsoft.com/en-us/windows/win32/{0:s}/{1:s}".format(
                realarb,
                page_filename
            )

            # retrieve html of page
            page_dir = os.path.join(download_dir, "docs.microsoft.com/win32", realarb)
            filepath = os.path.join(page_dir, "%s.html" % page_filename)
            logging.debug("[+] download page %s -> %s " % (url, filepath))
            download_textfile(url, filepath)

            # don't care about top level pages
            if realarb == '.':
                continue

            # First time navigating in this directory : fetch its toc once
            if realarb not in content_toc['toc'].keys():

                toc_url = "https://docs.microsoft.com/en-us/windows/win32/{0:s}/toc.json".format(
                    realarb
                )
                logging.info("[+] download toc for page %s" % (toc_url))

                toc_r = requests.get(toc_url)
                if toc_r.status_code != 200:

                    # Could not find a toc for this folder : fall back to an
                    # empty toc and index the page itself as a guide
                    content_toc['toc'][realarb] = {
                        'toc' : {'items' : [{}]}
                    }

                    content_toc['guides'].append({
                        'name' : page_filename,
                        'path' : os.path.join(os.path.relpath(page_dir, download_dir), "%s.html" % page_filename),
                    })

                else:
                    # BUGFIX: reuse the response already fetched instead of
                    # issuing a second GET for the same toc url.
                    component_toc = json.loads(toc_r.text)

                    component_title = component_toc['items'][0]['toc_title']
                    component_href = component_toc['items'][0]['href']

                    content_toc['toc'][realarb] = {
                        'toc' : component_toc
                    }

                    content_toc['guides'].append({
                        'name' : component_title,
                        'path' : os.path.join(os.path.relpath(page_dir, download_dir), "%s.html" % component_href),
                    })

            # Adding current page to content toc

            # Class page
            if "ADSchema" in realarb and page_filename.startswith("c-"):
                logging.info("[+] new class page %s" % (page_filename))

                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['classes'].append({
                    'name' : page_title,
                    'path' : os.path.relpath(filepath, download_dir),
                })

            # Attribute page
            elif "ADSchema" in realarb and page_filename.startswith("a-"):
                logging.debug("[+] new attribute page %s" % (page_filename))

                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['attributes'].append({
                    'name' : page_title,
                    'path' : os.path.relpath(filepath, download_dir),
                })

            # Generic entry
            else:
                page_title = _findname(content_toc['toc'][realarb]['toc']['items'][0], page_filename)
                if not page_title:
                    page_title = page_filename

                content_toc['entries'].append({
                    'name' : page_title,
                    'path' : os.path.relpath(filepath, download_dir),
                })

    return content_toc
512 |
def rewrite_soup(configuration : Configuration, soup, html_path : str, documents_dir : str):
    """ rewrite html contents by fixing links and remove unnecessary cruft

    Mutates `soup` in place: relative and absolute docs.microsoft.com links
    are rewritten to local .html paths, navigation chrome and scripts are
    stripped, and theme stylesheet hrefs are made relative.
    Returns (soup, set of ThemeResourceRecord(url, path) stylesheets to download).
    """

    # Fix navigations links
    links = soup.findAll("a", { "data-linktype" : "relative-path"}) # for modules and cmdlet pages
    link_pattern = re.compile(r"([\w\.\/-]+)")

    for link in links:

        href = link['href']
        fixed_href = href

        # go back to module
        # if href == "./?view=powershell-%s" % configuration.powershell_version:
        #     fixed_href = "./%s.html" % link.text

        # go to a relative page: keep only the path portion of the href
        # (drops any ?query or #anchor suffix)
        targets = link_pattern.findall(href)
        if not len(targets): # badly formated 'a' link
            continue

        page_target = targets[0]
        if page_target[-1] == '/': # module index
            fixed_href = "%sindex.html" % page_target
        else:
            fixed_href = "%s.html" % page_target

        if fixed_href != href:
            logging.info("link rewrite : %s -> %s " % ( href, fixed_href))
            link['href'] = fixed_href

    # remove link to external references if we can't support it
    for abs_href in soup.findAll("a", { "data-linktype" : "absolute-path"}):

        # some externals hrefs are like this win32 -> api:
        # IActivationFactory
        if abs_href['href'].startswith("/en-us/windows/win32/api/"):

            # remove prefixing /
            prefix, *abs_suffix = abs_href['href'].split("/")

            # strip .html if it exists
            # html_uri: current page path relative to the domain root;
            # uri_target: the link's destination under the same root
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", *abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            #rel_href = os.path.relpath(full_url_target, full_url_html_page)
            # NOTE(review): os.path.relpath output never ends with '/', so the
            # index branch below looks unreachable — confirm before relying on it.
            if rel_href[-1] == '/': # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"

        # some externals hrefs are like this win32 -> win32 :
        # DefineDosDevice
        elif abs_href['href'].startswith("/en-us/windows/desktop/api/"):

            # rewrite /en-us/windows/desktop/api to /en-us/windows/win32/api
            prefix, abs_suffix = abs_href['href'].split("/en-us/windows/desktop/api/")


            # strip .html if it exists
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", "en-us", "windows", "win32", "api" , abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            #rel_href = os.path.relpath(full_url_target, full_url_html_page)
            if rel_href[-1] == '/': # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"


        # some externals hrefs are like this win32 -> win32 :
        # Inspect
        elif abs_href['href'].startswith("/en-us/windows/desktop/"):

            # rewrite /en-us/windows/desktop to /win32/
            prefix, abs_suffix = abs_href['href'].split("/en-us/windows/desktop/")


            # strip .html if it exists
            html_uri, ext = os.path.splitext(os.path.relpath(html_path, os.path.join(documents_dir, "docs.microsoft.com")))
            uri_target, ext = os.path.splitext(os.path.join("docs.microsoft.com", "win32", abs_suffix))

            rel_href = os.path.relpath(uri_target, html_uri)

            #rel_href = os.path.relpath(full_url_target, full_url_html_page)
            if rel_href[-1] == '/': # module index
                rel_href = "%sindex.html" % rel_href
            else:
                rel_href = "%s.html" % rel_href

            logging.info("link rewrite : %s -> %s " % (abs_href['href'], rel_href))
            abs_href['href'] = rel_href
            abs_href['data-linktype'] = "relative-path"

        # some externals hrefs are like this :
        # UISettings.TextScaleFactorChanged Event
        # -> keep them, but point at the live site
        elif abs_href['href'].startswith("/en-us/"):
            full_url_target = "https://docs.microsoft.com" + abs_href['href']
            abs_href['href'] = full_url_target

        # Remove every other linktype absolute since we don't know how to handle it
        else:
            # TODO : currently we don't replace it in order to show the broken urls
            # abs_href.replace_with(abs_href.text)
            pass

    # remove unsupported nav elements
    # each entry is [tag_name, attribute filter] fed to soup.findAll
    nav_elements = [
        ["nav" , { "class" : "doc-outline", "role" : "navigation"}],
        ["ul" , { "class" : "breadcrumbs", "role" : "navigation"}],
        ["div" , { "class" : "sidebar", "role" : "navigation"}],
        ["div" , { "class" : "dropdown dropdown-full mobilenavi"}],
        ["p" , { "class" : "api-browser-description"}],
        ["div" , { "class" : "api-browser-search-field-container"}],
        ["div" , { "class" : "pageActions"}],
        ["div" , { "class" : "container footerContainer"}],
        ["div" , { "class" : "dropdown-container"}],
        ["div" , { "class" : "binary-rating-buttons"}],
        ["ul" , { "class" : "metadata page-metadata"}],
        ["div" , { "data-bi-name" : "pageactions"}],
        ["div" , { "class" : "page-action-holder"}],
        ["div" , { "class" : "header-holder"}],
        ["footer" , { "data-bi-name" : "footer", "id" : "footer"}],
        ["div" , { "class" : "binary-rating-holder"}],
        ["div" , { "id" : "left-container"}],
    ]

    for nav in nav_elements:
        nav_class, nav_attr = nav

        for nav_tag in soup.findAll(nav_class, nav_attr):
            _ = nav_tag.extract()

    # remove script elems
    for head_script in soup.head.findAll("script"):
        _ = head_script.extract()

    # Extract and rewrite additionnal stylesheets to download
    ThemeResourceRecord = collections.namedtuple('ThemeResourceRecord', 'url, path')

    theme_output_dir = os.path.join(documents_dir, Configuration.domain)
    theme_resources = []

    for link in soup.head.findAll("link", { "rel" : "stylesheet"}):
        uri_path = link['href'].strip()

        # only mirror stylesheets served from the docs theme
        if not uri_path.lstrip('/').startswith(Configuration.default_theme_uri):
            continue

        # Construct (url, path) tuple
        css_url = "https://%s/%s" % (Configuration.domain, uri_path)
        css_filepath = os.path.join(theme_output_dir, uri_path.lstrip('/'))

        # Converting href to a relative link (always with '/' separators)
        path = os.path.relpath(css_filepath, os.path.dirname(html_path))
        rel_uri = '/'.join(path.split(os.sep))
        link['href'] = rel_uri

        theme_resources.append( ThemeResourceRecord(
            url = css_url,
            path = os.path.relpath(css_filepath, documents_dir), # stored as relative path
        ))

    return soup, set(theme_resources)
689 |
690 |
691 |
def rewrite_html_contents(configuration : Configuration, html_root_dir : str):
    """ rewrite every html file downloaded """

    additional_resources = set()

    for html_file in glob.glob("%s/**/*.html" % html_root_dir, recursive = True):

        logging.info("rewrite html_file : %s" % (html_file))

        # parse the downloaded page
        with open(html_file, 'r', encoding='utf8') as in_fd:
            soup = bs(in_fd.read(), 'html.parser')

        # fix links, strip cruft, and collect stylesheets to fetch later
        soup, resources = rewrite_soup(configuration, soup, html_file, html_root_dir)
        additional_resources = additional_resources.union(resources)

        # overwrite the page with the fixed markup
        with open(html_file, 'wb') as out_fd:
            out_fd.write(soup.prettify("utf-8"))

    return additional_resources
717 |
718 |
def download_additional_resources(configuration : Configuration, documents_dir : str, resources_to_dl : set = None):
    """ Download optional resources for "beautification" and install the
    docset start page.

    resources_to_dl : set of records exposing 'url' and 'path' attributes
    (see ThemeResourceRecord in rewrite_soup); defaults to none.
    """
    # BUGFIX: mutable default argument (set()) replaced by a None sentinel.
    if resources_to_dl is None:
        resources_to_dl = set()

    for resource in resources_to_dl:

        download_textfile(
            resource.url,
            os.path.join(documents_dir, resource.path)
        )

    # Promote the desktop-app-technologies page as the docset index page
    src_index_filepath = os.path.join(documents_dir, Configuration.domain, "win32", "desktop-app-technologies.html")
    index_filepath = os.path.join(documents_dir, Configuration.domain, "win32", "index.html")
    shutil.copy(src_index_filepath, index_filepath)
745 |
746 |
def create_sqlite_database(configuration, content_toc, resources_dir, documents_dir):
    """ Indexing the html document in a format Dash can understand

    Creates resources_dir/docSet.dsidx (any pre-existing index is removed)
    and fills the searchIndex table from the content_toc categories.
    """

    def insert_into_sqlite_db(cursor, name, record_type, path):
        """ Insert a new unique record in the sqlite database. """
        try:
            cursor.execute('SELECT rowid FROM searchIndex WHERE path = ?', (path,))
            dbpath = cursor.fetchone()
            cursor.execute('SELECT rowid FROM searchIndex WHERE name = ?', (name,))
            dbname = cursor.fetchone()

            if dbpath is None and dbname is None:
                cursor.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, record_type, path))
                logging.debug('DB add [%s] >> name: %s, path: %s' % (record_type, name, path))
            else:
                logging.debug('record exists')

        except sqlite3.Error:
            # BUGFIX: was a bare 'except: pass' which silently swallowed every
            # error (including KeyboardInterrupt); narrowed to sqlite errors
            # and logged so indexing failures are visible.
            logging.exception('could not insert record (%s, %s, %s)', name, record_type, path)

    sqlite_filepath = os.path.join(resources_dir, "docSet.dsidx")
    if os.path.exists(sqlite_filepath):
        os.remove(sqlite_filepath)

    db = sqlite3.connect(sqlite_filepath)
    cur = db.cursor()
    cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
    cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

    # content_toc category -> Dash record type
    mapping = {
        # win32 content
        "guides" : "Guide",
        "attributes" : "Attribute",
        "classes" : "Class",
        "entries" : "Entry",

        # api-sdk content
        "categories" : "Category",
        "files" : "File",

        'callbacks' : "Callback",
        'functions' : "Function",
        'enums' : "Enum",
        'interfaces' : "Interface",
        'structures' : "Structure",
    }

    for key, record_type in mapping.items():

        for _value in content_toc.get(key, []):

            # path stored in the index should be unix compliant
            value_path = _value['path'].replace(os.sep, '/')
            insert_into_sqlite_db(cur, _value['name'], record_type, value_path)

    # commit and close db
    db.commit()
    db.close()
811 |
def copy_folder(src_folder : str, dst_folder : str):
    """ Copy a full folder tree anew every time """

    def _on_rmtree_error(func, path, exc_info):
        """
        Error handler for ``shutil.rmtree``.

        If the error is due to an access error (read only file)
        it attempts to add write permission and then retries.

        If the error is for another reason it re-raises the error.
        """
        import stat

        # path already gone: nothing to clean up
        if not os.path.exists(path):
            return

        if os.access(path, os.W_OK):
            # not a permission problem: propagate the original error
            raise

        # read-only entry: make it writable and retry the failed operation
        os.chmod(path, stat.S_IWUSR)
        func(path)

    # wipe the destination, then clone the source tree
    shutil.rmtree(dst_folder, ignore_errors=False, onerror=_on_rmtree_error)
    shutil.copytree(src_folder, dst_folder)
841 |
def merge_folders(src, dst):
    """ Recursively overlay the *src* tree onto *dst*.

    Existing files in *dst* that are not present in *src* are kept;
    files present in both are overwritten.
    """

    if not os.path.isdir(src):
        # leaf node: plain file copy
        shutil.copyfile(src, dst)
        return

    if not os.path.exists(dst):
        os.makedirs(dst)

    for entry in os.listdir(src):
        merge_folders(
            os.path.join(src, entry),
            os.path.join(dst, entry)
        )
856 |
857 | def main(configuration : Configuration):
858 |
859 | # """ Scheme for content toc :
860 | # {
861 | # module_name : {
862 | # 'name' : str,
863 | # 'index' : relative path,
864 | # 'entries' : [
865 | # {
866 | # 'name' : str,
867 | # 'path' : relative path,
868 | # },
869 | # ...
870 | # ]
871 | # },
872 | # ...
873 | # }
874 | # """
875 | content_toc = {}
876 | resources_to_dl = set()
877 |
878 | """ 0. Prepare folders """
879 | source_dir = os.path.join(configuration.build_folder, "_0_win32_source")
880 | api_source_dir = os.path.join(configuration.build_folder, "_0_api_sdk_source")
881 |
882 | download_dir = os.path.join(configuration.build_folder, "_1_downloaded_contents")
883 | html_rewrite_dir = os.path.join(configuration.build_folder, "_2_html_rewrite")
884 | additional_resources_dir = os.path.join(configuration.build_folder, "_3_additional_resources")
885 | package_dir = os.path.join(configuration.build_folder, "_4_ready_to_be_packaged")
886 |
887 | for folder in [source_dir, api_source_dir, download_dir, html_rewrite_dir, additional_resources_dir, package_dir]:
888 | os.makedirs(folder, exist_ok=True)
889 |
890 | # _4_ready_to_be_packaged is the final build dir
891 | docset_dir = os.path.join(package_dir, "%s.docset" % Configuration.docset_name)
892 | content_dir = os.path.join(docset_dir , "Contents")
893 | resources_dir = os.path.join(content_dir, "Resources")
894 | document_dir = os.path.join(resources_dir, "Documents")
895 |
896 | if conf.crawl_contents:
897 | # cloning source directories for scraping contents, extremely long operation
898 | logging.info("Downloading win32 markdown zipped sources : %s -> %s" % ("https://github.com/MicrosoftDocs/win32/archive/refs/heads/docs.zip", os.path.join(source_dir, "docs.zip")))
899 | download_binary("https://github.com/MicrosoftDocs/win32/archive/refs/heads/docs.zip", os.path.join(source_dir, "docs.zip"))
900 |
901 | logging.info("Extracting win32 markdown zipped sources : ")
902 | with zipfile.ZipFile(os.path.join(source_dir, "docs.zip"), 'r') as zip_ref:
903 | zip_ref.extractall(source_dir)
904 |
905 | logging.info("Downloading sdk-api markdown zipped sources : %s -> %s" % ("https://github.com/MicrosoftDocs/win32/archive/refs/heads/docs.zip", os.path.join(source_dir, "docs.zip")))
906 | download_binary("https://github.com/MicrosoftDocs/sdk-api/archive/refs/heads/docs.zip", os.path.join(api_source_dir, "docs.zip"))
907 |
908 | logging.info("Extracting api-sdk markdown zipped sources : ")
909 | with zipfile.ZipFile(os.path.join(api_source_dir, "docs.zip"), 'r') as zip_ref:
910 | zip_ref.extractall(api_source_dir)
911 |
912 |
913 | """ 1. Download html pages """
914 | logging.info("[1] scraping win32 web contents")
915 | content_toc = {}
916 | content_toc = crawl_msdn_contents(configuration, download_dir, source_dir)
917 |
918 | logging.info("[1] scraping sdk-api web contents")
919 | api_content_toc = crawl_sdk_api_contents(configuration, download_dir, api_source_dir)
920 |
921 | # Merge win32 api content
922 | content_toc.update(api_content_toc)
923 | with open(os.path.join(download_dir, "toc.json"), "w") as content:
924 | json.dump(content_toc, content)
925 | else:
926 | # print(os.path.join(download_dir, "toc.json"))
927 | with open(os.path.join(download_dir, "toc.json"), "r") as content:
928 | content_toc = json.load(content)
929 |
    """ 2. Parse and rewrite html contents """
    # Each stage copies the previous stage's folder forward, then mutates the copy.
    logging.info("[2] rewriting urls and hrefs")
    copy_folder(download_dir, html_rewrite_dir)
    resources_to_dl = rewrite_html_contents(configuration, html_rewrite_dir)

    """ 3. Download additionnal resources """
    # Fetch the external assets (styles, scripts, images) collected during rewriting.
    logging.info("[3] download style contents")
    copy_folder(html_rewrite_dir, additional_resources_dir )
    download_additional_resources(configuration, additional_resources_dir, resources_to_dl)

    """ 4. Database indexing """
    # Build the docSet.dsidx sqlite index Dash uses for search.
    logging.info("[4] indexing to database")
    copy_folder(additional_resources_dir, document_dir )
    create_sqlite_database(configuration, content_toc, resources_dir, document_dir)

    """ 5. Archive packaging """
    # Static docset metadata (plist, license, icons) ship alongside this script.
    src_dir = os.path.dirname(__file__)
    shutil.copy(os.path.join(src_dir, "static/Info.plist"), content_dir)
    shutil.copy(os.path.join(src_dir, "static/DASH_LICENSE"), os.path.join(resources_dir, "LICENSE"))
    shutil.copy(os.path.join(src_dir, "static/icon.png"), docset_dir)
    shutil.copy(os.path.join(src_dir, "static/icon@2x.png"), docset_dir)

    # Ensure the output directory exists before make_docset writes the archive.
    output_dir = os.path.dirname(configuration.output_filepath)
    os.makedirs(output_dir, exist_ok=True)

    logging.info("[5] packaging as a dash docset")
    make_docset(
        docset_dir,
        configuration.output_filepath,
        Configuration.docset_name
    )
962 |
if __name__ == '__main__':

    # Command-line front-end: 'create_docset' runs the full pipeline,
    # 'rewrite_html' applies the rewrite rules to one file (rule debugging).
    parser = argparse.ArgumentParser(
        description="Dash docset creation script for MSDN's Win32 API"
    )

    parser.add_argument("-vv", "--verbose",
        help="increase output verbosity",
        action="store_true"
    )

    subparsers = parser.add_subparsers(help='sub-command help', dest='command')

    parser_create = subparsers.add_parser('create_docset', help='scrape the internet in order to create a docset')
    parser_create.add_argument("-t", "--temporary",
        help="Use a temporary directory for creating docset, otherwise use current dir.",
        default=False,
        action="store_true"
    )

    parser_create.add_argument("-o", "--output",
        help="set output filepath",
        default=os.path.join(os.getcwd(), "MSDN.tgz"),
    )

    parser_create.add_argument("-s", "--sampling",
        help="generate only a 'sample' docset, in order to test if the rewriting rules are correct",
        default=False,
        action="store_true"
    )

    parser_rewrite = subparsers.add_parser('rewrite_html', help='rewrite html file in order to test rules')
    parser_rewrite.add_argument("input",
        help="set input filepath"
    )
    parser_rewrite.add_argument("output",
        help="set output filepath"
    )
    parser_rewrite.add_argument("html_root_dir",
        help="set html_root_dir filepath"
    )

    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
        # requests/urllib3 are extremely chatty at DEBUG; keep them quiet.
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

    if args.command == "rewrite_html":

        conf = Configuration( args )

        # Read content and parse html
        with open(args.input, 'r', encoding='utf8') as i_fd:
            html_content = i_fd.read()

        soup = bs(html_content, 'html.parser')

        # rewrite html
        soup, resources = rewrite_soup(conf, soup, args.input, args.html_root_dir)

        # Export fixed html
        fixed_html = soup.prettify("utf-8")
        with open(args.output, 'wb') as o_fd:
            o_fd.write(fixed_html)

    elif args.command == "create_docset":
        conf = Configuration( args )

        if args.temporary:
            # Build in a throwaway directory; everything but the final archive
            # is deleted when the context manager exits.
            with tempfile.TemporaryDirectory() as tmp_builddir:
                conf.build_folder = tmp_builddir
                main(conf)
        else:
            main(conf)

    else:
        # argparse rejects unknown sub-commands by itself, so reaching this
        # branch means no sub-command was given at all. Print usage and exit(2)
        # instead of the original confusing NotImplementedError.
        parser.error("a sub-command is required (create_docset or rewrite_html)")
1052 |
--------------------------------------------------------------------------------
/static/DASH_LICENSE:
--------------------------------------------------------------------------------
1 | You are not allowed to distribute or make use of any of the files within this folder ("Resources") without written permission from Kapeli or whilst using the Dash app developed by Kapeli. This does not apply to the files located within the "Documents" folder.
--------------------------------------------------------------------------------
/static/Info.plist:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
	<key>CFBundleIdentifier</key>
	<string>msdn</string>

	<key>CFBundleName</key>
	<string>MSDN</string>

	<key>DashDocSetFallbackURL</key>
	<string>https://docs.microsoft.com/win32/</string>

	<key>dashIndexFilePath</key>
	<string>docs.microsoft.com/win32/index.html</string>

	<key>DashDocSetFamily</key>
	<string>msdn</string>

	<key>DocSetPlatformFamily</key>
	<string>msdn</string>

	<key>isDashDocset</key>
	<true/>

	<key>isJavaScriptEnabled</key>
	<true/>

</dict>
</plist>
--------------------------------------------------------------------------------
/static/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasg/msdn-docset/188d087f030a92f2c13b0dfd8df477cfef7e6876/static/icon.png
--------------------------------------------------------------------------------
/static/icon@2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasg/msdn-docset/188d087f030a92f2c13b0dfd8df477cfef7e6876/static/icon@2x.png
--------------------------------------------------------------------------------