├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── posh-to-dash.py ├── requirements.txt ├── screenshots └── posh-docset.PNG └── static ├── DASH_LICENSE ├── Info.plist ├── docset-template ├── README.md └── create-versioned-docset-json.py ├── icon.png └── icon@2x.png /.gitignore: -------------------------------------------------------------------------------- 1 | ghostdriver.log 2 | _win10_downloaded_contents 3 | geckodriver.log 4 | Powershell.tgz 5 | _build* 6 | geckodriver.log 7 | Pipfile* -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.6' 4 | 5 | # Used to properly name build artifacts 6 | env: 7 | global: 8 | - ARTIFACT_VER="`git describe --tags`" 9 | - ARTIFACT_NAME="posh-docsets-$ARTIFACT_VER.zip" 10 | - MOZ_HEADLESS=1 11 | - FIREFOX_PATH="`which firefox`" 12 | 13 | addons: 14 | firefox: latest 15 | chrome: stable 16 | 17 | deploy: 18 | provider: releases 19 | api_key: 20 | secure: ZkANjBeq7LNuteAD9LsjOaOXIAHI8Wof6Vn+CkWq+kd7sW2ymjPrucPLEoHYeWrcJcuL67C7C2LXvl30IDaBPrBQbcoAJ2vAcPsK2iIRZxqL2FSp8CpCO1BB64gYqJ+6FZ9ZHrYNfAQNhBGiuaT7YzxBF6qP1ODy8ofrSpRauQMssPV28CRQQ9vek3um0QsAth/FtcUP7je5/8IImZDsxzFYDsSTn2MjnrtCc7x9EEuIel59b+1Cw5k6oLzHQP1IIXmyt2AvMI2v/Qvr/FiByg49vM0Kb8rdqsFlx37MaORY5jolpuL0iND5SuTLNsdC4r6yfyp4bLg9kG0VaevU9QK0mYqD8VQIikE8mMsIVLc1jC6tzrK8A5rIZwRo8Ug7We05TEUssidqzXImMy1AYTPSBvoM1iuAYdEewncOCRqeFrZpsD52YD9gp9LqsTVWJ/iV0UnXLg6owgrRrE8Os/vvb3rK4c7ev2UcT0//lJutmg4E0WAtOtI0d4FhGvaPFh8GVmdTwt38cJgsVcaDD1ATUB03vlafT2LTnbaSCmP9BYB+2Sc3Ml3nRcCTjguUaNX6goGr0G7uiCqo3Eyf9NRaKPFd5IYHMKqDHf1z1JSDp3/hPrzO7RtbW6iB96SjCnBXK8ddg42oG7d4dRlbAmQidS1a3cWy34ddwnlxlaE= 21 | file: $ARTIFACT_NAME 22 | skip_cleanup: true 23 | all_branches: true 24 | on: 25 | repo: lucasg/powershell-docset 26 | tags: true 27 | 28 | addons: 29 | artifacts: 30 | # ⋮ 31 | paths: 32 | - $(ls *.zip | tr "\n" ":") 33 | 34 | 
notifications: 35 | email: 36 | recipients: 37 | - lucas.georges@outlook.com 38 | on_success: never # default: change 39 | on_failure: always # default: always 40 | 41 | install: 42 | - pip install selenium requests bs4 43 | 44 | before_install: 45 | # dynamically resolve the correct chromedriver version to install from chrome's version 46 | - CHROME_MAIN_VERSION=`google-chrome-stable --version | sed -E 's/(^Google Chrome |\.[0-9]+ )//g'` 47 | - CHROMEDRIVER_VERSION=`curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROME_MAIN_VERSION"` 48 | - wget "https://chromedriver.storage.googleapis.com/${CHROMEDRIVER_VERSION}/chromedriver_linux64.zip" 49 | - unzip chromedriver_linux64.zip 50 | - chmod +x chromedriver 51 | - export PATH=$PATH:$PWD/ 52 | 53 | script: 54 | - mkdir -p Powershell 55 | 56 | - python posh-to-dash.py --temporary --output=Powershell/versions/7.1/Powershell.tgz --version=7.1 57 | # - python posh-to-dash.py --temporary --output=Powershell/versions/5.1/Powershell.tgz --version=5.1 58 | # - python posh-to-dash.py --temporary --output=Powershell/versions/5.0/Powershell.tgz --version=5.0 59 | # - python posh-to-dash.py --temporary --output=Powershell/versions/4.0/Powershell.tgz --version=4.0 60 | # - python posh-to-dash.py --temporary --output=Powershell/versions/3.0/Powershell.tgz --version=3.0 61 | 62 | - cp static/icon.png Powershell/icon.png 63 | - cp static/icon@2x.png Powershell/icon@2x.png 64 | - cp Powershell/versions/7.1/Powershell.tgz Powershell/Powershell.tgz 65 | 66 | - cp static/docset-template/README.md Powershell/README.md 67 | - python static/docset-template/create-versioned-docset-json.py --output=Powershell/docset.json --version=$ARTIFACT_VER 68 | 69 | - zip -r $ARTIFACT_NAME Powershell 70 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 lucasg 4 | 5 | 
Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # powershell-docset : A dash docset for powershell modules 2 | 3 | ### Status 4 | 5 | [![Build Status](https://travis-ci.org/lucasg/powershell-docset.svg?branch=master)](https://travis-ci.org/lucasg/powershell-docset) 6 | 7 | `posh-to-dash.py` scrapes the newly announced `https://docs.microsoft.com/en-us/powershell/module/` website in order to create an offline dash-compatible archive to be viewed in `Dash`, `Zeal` or `Velocity` : 8 | 9 |

10 | Powershell docset in Velocity 11 |

12 | 13 | ## Releases 14 | 15 | - [v0.1 -- Minimal working version](https://github.com/lucasg/powershell-docset/releases/tag/v0.1) 16 | - [v0.2 -- Offline mode supported](https://github.com/lucasg/powershell-docset/releases/tag/v0.2) 17 | - [v0.3 -- travis setup](https://github.com/lucasg/powershell-docset/releases/tag/v0.3) 18 | - [v0.4 -- user contributed docset](https://github.com/lucasg/powershell-docset/releases/tag/v0.4) 19 | - [v0.5 -- versionned docsets](https://github.com/lucasg/powershell-docset/releases/tag/v0.5) 20 | - [v0.6 -- windows 10 modules documentation](https://github.com/lucasg/powershell-docset/releases/tag/v0.6) 21 | - [v0.7.2 -- powershell 7.1 documentation](https://github.com/lucasg/powershell-docset/releases/tag/v0.7.2) 22 | 23 | ## Installation & Execution 24 | 25 | `posh-to-dash.py` relies on : 26 | 27 | - `requests` for http(s) downloads 28 | - `selenium` and `phantomjs` for webscraping 29 | - `bs4` for html parsing and rewriting 30 | 31 | 1. Copy the repository 32 | 2. Install the dependencies from requirements.txt, use a virtualenv to avoid problems with dependencies and versions. 33 | 3. Download the geckodriver from [Mozilla's Repo](https://github.com/mozilla/geckodriver/releases), download the version that matches your OS. 34 | 4. Place the geckodriver in your path 35 | 36 | - If Windows, grab the executable and place it in `%USERPROFILE%\AppData\Local\Microsoft\WindowsApps` 37 | 38 | - If Linux, move it to your `~/.local/bin` or wherever you have your path 39 | 40 | 5. Start scraping by typing : `posh-to-dash.py --output=$outputfile --version=7.1 --temporary` 41 | 42 | - if `--output` is not provided, `posh-to-dash.py` will output "Powershell.tgz" into the working directory 43 | - the `--version` switch supports only Powershell API versions `5.1`, `7.0` and `7.1` (default), the rest are obsolete by Microsoft. 
44 | - `--temporary` specify to download the web scraping resources in a temporary folder instead of clobbering the current directory. However if the download fail, the results will be thrown out. 45 | 46 | **NOTE: The process takes 15+ minutes to run. The more versions you download increases the time.** 47 | 48 | ## Add your docset to Zeal 49 | 50 | With the Powershell.tar file, unzip it and place it in `C:\Users\\AppData\Local\Zeal\Zeal\docsets` 51 | 52 | ## Limitations 53 | 54 | The powershell modules API endpoint is quite new, so it may be subject to breakage by the `docs.microsoft.com` people. 55 | -------------------------------------------------------------------------------- /posh-to-dash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sqlite3 4 | import os 5 | import sys 6 | import glob 7 | import re 8 | import shutil 9 | import logging 10 | import json 11 | import tarfile 12 | import tempfile 13 | import argparse 14 | import urllib.parse 15 | import urllib 16 | import time 17 | import collections 18 | 19 | import requests 20 | from requests.adapters import HTTPAdapter 21 | from requests.packages.urllib3.util.retry import Retry 22 | from requests.exceptions import ConnectionError 23 | from bs4 import BeautifulSoup as bs, Tag # pip install bs4 24 | from selenium import webdriver 25 | # from selenium.webdriver import Firefox 26 | # from selenium.webdriver.firefox.options import Options 27 | # from selenium.webdriver.firefox.firefox_binary import FirefoxBinary 28 | 29 | from selenium.webdriver import Chrome 30 | from selenium.webdriver.chrome.options import Options 31 | 32 | class PoshWebDriver: 33 | """ Thin wrapper for selenium webdriver for page content retrieval """ 34 | 35 | def __init__(self, executable_path = None): 36 | 37 | self.options = Options() 38 | self.options.add_argument("--headless") 39 | self.options.add_argument("--window-size=1920x1080") 40 | 41 | self.driver = 
webdriver.Chrome(options=self.options) 42 | 43 | # self.driver_exe_path = executable_path 44 | 45 | # if self.driver_exe_path: 46 | # binary = FirefoxBinary(executable_path) 47 | # self.driver = webdriver.Firefox( 48 | # firefox_binary=binary, 49 | # options=options, 50 | # ) 51 | # else: 52 | # self.driver = webdriver.Firefox( 53 | # options=options 54 | # ) 55 | 56 | def get_url_page(self, url): 57 | """ retrieve the full html content of a page after Javascript execution """ 58 | 59 | index_html = None 60 | try: 61 | self.driver.get(url) 62 | index_html = self.driver.page_source 63 | except (ConnectionResetError, urllib.error.URLError) as e: 64 | # we may have a triggered a anti-scraping time ban 65 | # Lay low for several seconds and get back to it. 66 | 67 | self.driver.quit() 68 | time.sleep(2) 69 | 70 | # if self.driver_exe_path: 71 | # self.driver = webdriver.PhantomJS(executable_path = self.driver_exe_path) 72 | # else: 73 | # self.driver = webdriver.PhantomJS() 74 | 75 | self.driver = webdriver.Chrome(options=self.options) 76 | 77 | index_html = None 78 | 79 | # try a second time, and raise error if fail 80 | if not index_html: 81 | self.driver.get(url) 82 | index_html = self.driver.page_source 83 | 84 | return index_html 85 | 86 | def quit(): 87 | return self.driver.quit() 88 | 89 | 90 | class Configuration: 91 | 92 | # STATIC CONSTANTS 93 | posh_doc_api_version = '0.2' # powershell doc api version, not this docset one. 
94 | posh_version = '6' 95 | docset_name = 'Powershell' 96 | 97 | domain = "docs.microsoft.com" 98 | base_url = "%s/en-us/powershell/module" % domain 99 | default_url = "https://%s/?view=powershell-%%s" % (base_url) 100 | default_theme_uri = "_themes/docs.theme/master/en-us/_themes" 101 | 102 | def __init__(self, args): 103 | 104 | 105 | # selected powershell api version 106 | self.powershell_version = args.version 107 | 108 | # The modules and cmdlets pages are "versionned" using additional params in the GET request 109 | self.powershell_version_param = "view=powershell-{0:s}".format(self.powershell_version) 110 | 111 | # build folder (must be cleaned afterwards) 112 | self.build_folder = os.path.join(os.getcwd(), "_build_{0:s}".format(self.powershell_version)) 113 | 114 | # output file 115 | self.output_filepath = os.path.realpath(args.output) 116 | 117 | # powershell docs start page 118 | self.docs_index_url = Configuration.default_url % self.powershell_version 119 | 120 | # powershell docs table of contents url 121 | self.docs_toc_url = "https://{0:s}/psdocs/toc.json?{2:s}".format( 122 | Configuration.base_url, 123 | self.powershell_version, 124 | self.powershell_version_param 125 | ) 126 | 127 | self.windows_toc_url = "https://{0:s}/windowsserver2019-ps/toc.json?view=windowsserver2019-ps".format( 128 | Configuration.base_url 129 | ) 130 | 131 | # selenium webdriver 132 | self.webdriver = PoshWebDriver(args.phantom) 133 | 134 | # selected module 135 | self.filter_modules = [module.lower() for module in args.modules] 136 | 137 | 138 | # Global session for several retries 139 | session = requests.Session() 140 | retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ]) 141 | session.mount('http://', HTTPAdapter(max_retries=retries)) 142 | 143 | 144 | def download_binary(url, output_filename): 145 | """ Download GET request as binary file """ 146 | global session 147 | 148 | logging.debug("download_binary : %s -> %s" % (url, output_filename)) 
149 | 150 | # ensure the folder path actually exist 151 | os.makedirs(os.path.dirname(output_filename), exist_ok = True) 152 | 153 | r = session.get(url, stream=True) 154 | with open(output_filename, 'wb') as f: 155 | for data in r.iter_content(32*1024): 156 | f.write(data) 157 | 158 | def download_textfile(url : str , output_filename : str, params : dict = None): 159 | """ Download GET request as utf-8 text file """ 160 | global session 161 | 162 | logging.debug("download_textfile : %s -> %s" % (url, output_filename)) 163 | 164 | # ensure the folder path actually exist 165 | os.makedirs(os.path.dirname(output_filename), exist_ok = True) 166 | 167 | while True: 168 | try: 169 | r = session.get(url, data = params) 170 | except ConnectionError: 171 | logging.debug("caught ConnectionError, retrying...") 172 | time.sleep(2) 173 | else: 174 | break 175 | 176 | with open(output_filename, 'w', encoding="utf8") as f: 177 | f.write(r.text) 178 | 179 | 180 | def make_docset(source_dir, dst_filepath, filename): 181 | """ 182 | Tar-gz the build directory while conserving the relative folder tree paths. 
183 | Copied from : https://stackoverflow.com/a/17081026/1741450 184 | """ 185 | dst_dir = os.path.dirname(dst_filepath) 186 | tar_filepath = os.path.join(dst_dir, '%s.tar' % filename) 187 | 188 | with tarfile.open(tar_filepath, "w:gz") as tar: 189 | tar.add(source_dir, arcname=os.path.basename(source_dir)) 190 | 191 | shutil.move(tar_filepath, dst_filepath) 192 | 193 | 194 | 195 | def download_page_contents(configuration, uri, output_filepath): 196 | """ Download a page using it's uri from the TOC """ 197 | 198 | # Resolving "absolute" url et use appropriate version 199 | full_url = urllib.parse.urljoin(configuration.docs_toc_url, uri) 200 | versionned_url = "{0:s}?{1:s}".format(full_url, configuration.powershell_version_param) 201 | 202 | download_textfile(versionned_url, output_filepath) 203 | 204 | 205 | def download_module_contents(configuration, module_name, module_uri, module_dir, cmdlets, root_dir): 206 | """ Download a modules contents """ 207 | 208 | module_filepath = os.path.join(module_dir, "%s.html" % module_name) 209 | 210 | logging.debug("downloading %s module index page -> %s" % (module_name, module_filepath)) 211 | if module_uri: 212 | download_page_contents(configuration, module_uri, module_filepath) 213 | 214 | cmdlets_infos = [] 215 | 216 | # Downloading cmdlet contents 217 | for cmdlet in cmdlets: 218 | 219 | cmdlet_name = cmdlet['toc_title'] 220 | if cmdlet_name.lower() in ("about", "functions", "providers", "provider"): # skip special toc 221 | continue 222 | 223 | cmdlet_uri = cmdlet["href"] 224 | cmdlet_filepath = os.path.join(module_dir, "%s.html" % cmdlet_name) 225 | 226 | logging.debug("downloading %s cmdlet doc -> %s" % (cmdlet_name, cmdlet_filepath)) 227 | download_page_contents(configuration, cmdlet_uri, cmdlet_filepath) 228 | 229 | cmdlets_infos.append({ 230 | 'name' : cmdlet_name, 231 | 'path' : os.path.relpath(cmdlet_filepath, root_dir), 232 | }) 233 | 234 | module_infos = { 235 | 'name' : module_name, 236 | 'index' : 
os.path.relpath(module_filepath, root_dir), 237 | 'cmdlets' : cmdlets_infos 238 | } 239 | 240 | return module_infos 241 | 242 | def crawl_posh_contents(configuration: Configuration, toc_url : str, download_dir : str, ): 243 | """ Download Powershell modules and cmdlets content pages based on TOC """ 244 | 245 | # Download toc 246 | logging.debug("Downloading powershell toc : %s" % (toc_url)) 247 | r = requests.get(toc_url) 248 | modules_toc = json.loads(r.text) 249 | 250 | # modules_toc is a web based TOC, where as content_toc is file based 251 | content_toc = {} 252 | 253 | logging.debug("raw modules : %s" % [m['toc_title'] for m in modules_toc['items'][0]['children']]) 254 | 255 | # optional filter on selected module 256 | modules = modules_toc['items'][0]['children'] 257 | if len(configuration.filter_modules): 258 | modules = list(filter(lambda m: m['toc_title'].lower() in configuration.filter_modules, modules)) 259 | logging.debug("filtered modules : %s" % [m['toc_title'] for m in modules]) 260 | 261 | # Downloading modules contents 262 | for module in modules: 263 | 264 | module_name = module['toc_title'] 265 | module_uri = module.get("href") 266 | module_cmdlets = module['children'] 267 | module_dir = os.path.join(download_dir, Configuration.base_url, module_name) 268 | 269 | logging.info("[+] download module %s" % (module_name)) 270 | module_infos = download_module_contents(configuration, module_name, module_uri, module_dir, module_cmdlets, download_dir) 271 | content_toc[module_name] = module_infos 272 | 273 | return content_toc 274 | 275 | def rewrite_soup(configuration : Configuration, soup, html_path : str, documents_dir : str): 276 | """ rewrite html contents by fixing links and remove unnecessary cruft """ 277 | 278 | # Fix navigations links 279 | links = soup.findAll("a", { "data-linktype" : "relative-path"}) # for modules and cmdlet pages 280 | link_pattern = re.compile(r"([\w\.\/-]+)\?view=[powershell-|windowsserver2019-ps]") 281 | 282 | for link in 
links: 283 | 284 | href = link['href'] 285 | fixed_href = href 286 | 287 | # go back to module 288 | if href == "./?view=powershell-%s" % configuration.powershell_version: 289 | fixed_href = "./%s.html" % link.text 290 | elif href == "./?view=windowsserver2019-ps": 291 | fixed_href = "./%s.html" % link.text 292 | 293 | # go to a cmdlet page 294 | else: 295 | targets = link_pattern.findall(href) 296 | if not len(targets): # badly formated 'a' link 297 | continue 298 | 299 | module_name = targets[0] 300 | fixed_href = "%s.html" % module_name 301 | 302 | if fixed_href != href: 303 | logging.debug("link rewrite : %s -> %s " % ( href, fixed_href)) 304 | link['href'] = fixed_href 305 | 306 | 307 | # remove link to external references since we can't support it 308 | for abs_href in soup.findAll("a", { "data-linktype" : "absolute-path"}): 309 | abs_href.replace_with(abs_href.text) 310 | 311 | # remove unsupported nav elements 312 | nav_elements = [ 313 | ["nav" , { "class" : "doc-outline", "role" : "navigation"}], 314 | ["ul" , { "class" : "breadcrumbs", "role" : "navigation"}], 315 | ["div" , { "class" : "sidebar", "role" : "navigation"}], 316 | ["div" , { "class" : "dropdown dropdown-full mobilenavi"}], 317 | ["p" , { "class" : "api-browser-description"}], 318 | ["div" , { "class" : "api-browser-search-field-container"}], 319 | ["div" , { "class" : "pageActions"}], 320 | ["div" , { "class" : "container footerContainer"}], 321 | ["div" , { "class" : "dropdown-container"}], 322 | ["div" , { "class" : "page-action-holder"}], 323 | ["div" , { "aria-label" : "Breadcrumb", "role" : "navigation"}], 324 | ["div" , { "data-bi-name" : "rating"}], 325 | ["div" , { "data-bi-name" : "feedback-section"}], 326 | ["section" , { "class" : "feedback-section", "data-bi-name" : "feedback-section"}], 327 | ["footer" , { "data-bi-name" : "footer", "id" : "footer"}], 328 | ] 329 | 330 | for nav in nav_elements: 331 | nav_class, nav_attr = nav 332 | 333 | for nav_tag in soup.findAll(nav_class, 
nav_attr): 334 | _ = nav_tag.extract() 335 | 336 | # remove script elems 337 | for head_script in soup.head.findAll("script"): 338 | _ = head_script.extract() 339 | 340 | # Extract and rewrite additionnal stylesheets to download 341 | ThemeResourceRecord = collections.namedtuple('ThemeResourceRecord', 'url, path') 342 | 343 | theme_output_dir = os.path.join(documents_dir, Configuration.domain) 344 | theme_resources = [] 345 | 346 | for link in soup.head.findAll("link", { "rel" : "stylesheet"}): 347 | uri_path = link['href'].strip() 348 | 349 | if not uri_path.lstrip('/').startswith(Configuration.default_theme_uri): 350 | continue 351 | 352 | # Construct (url, path) tuple 353 | css_url = "https://%s/%s" % (Configuration.domain, uri_path) 354 | css_filepath = os.path.join(theme_output_dir, uri_path.lstrip('/')) 355 | 356 | # Converting href to a relative link 357 | path = os.path.relpath(css_filepath, os.path.dirname(html_path)) 358 | rel_uri = '/'.join(path.split(os.sep)) 359 | link['href'] = rel_uri 360 | 361 | theme_resources.append( ThemeResourceRecord( 362 | url = css_url, 363 | path = os.path.relpath(css_filepath, documents_dir), # stored as relative path 364 | )) 365 | 366 | return soup, set(theme_resources) 367 | 368 | def rewrite_index_soup(configuration : Configuration, soup, index_html_path : str, documents_dir : str): 369 | """ rewrite html contents by fixing links and remove unnecessary cruft """ 370 | 371 | # Fix navigations links 372 | content_tables = soup.findAll("table", { 373 | "class" : "api-search-results" 374 | }) 375 | 376 | for content_table in content_tables: 377 | 378 | links = content_table.findAll(lambda tag: tag.name == 'a') 379 | link_pattern = re.compile(r"/powershell/module/([\w\.\-]+)/\?view=powershell-") 380 | 381 | for link in links: 382 | 383 | href = link['href'] 384 | fixed_href = href 385 | 386 | 387 | targets = link_pattern.findall(href) 388 | if not len(targets): 389 | continue # badly formated 'a' link 390 | 391 | module_name 
= targets[0].lstrip('/').rstrip('/') 392 | fixed_href = "powershell/module/%s/%s.html" % (module_name, module_name) 393 | 394 | if fixed_href != href: 395 | logging.debug("link rewrite : %s -> %s " % ( href, fixed_href)) 396 | link['href'] = fixed_href 397 | 398 | # Fix link to module.svg 399 | module_svg_path = os.path.join(documents_dir, Configuration.domain, "en-us", "media", "toolbars", "module.svg") 400 | images = content_table.findAll("img" , {'alt' : "Module"}) 401 | for image in images: 402 | image['src'] = os.path.relpath(module_svg_path, os.path.dirname(index_html_path)) 403 | 404 | # remove unsupported nav elements 405 | nav_elements = [ 406 | ["nav" , { "class" : "doc-outline", "role" : "navigation"}], 407 | ["ul" , { "class" : "breadcrumbs", "role" : "navigation"}], 408 | ["div" , { "class" : "sidebar", "role" : "navigation"}], 409 | ["div" , { "class" : "dropdown dropdown-full mobilenavi"}], 410 | ["p" , { "class" : "api-browser-description"}], 411 | ["div" , { "class" : "api-browser-search-field-container"}], 412 | ["div" , { "class" : "pageActions"}], 413 | ["div" , { "class" : "dropdown-container"}], 414 | ["div" , { "class" : "container footerContainer"}], 415 | ["div" , { "data-bi-name" : "header", "id" : "headerAreaHolder"}], 416 | ["div" , { "class" : "header-holder"}], 417 | ["div" , { "id" : "action-panel"}], 418 | ["div" , { "id" : "api-browser-search-field-container"}], 419 | ] 420 | 421 | for nav in nav_elements: 422 | nav_class, nav_attr = nav 423 | 424 | for nav_tag in soup.findAll(nav_class, nav_attr): 425 | _ = nav_tag.extract() 426 | 427 | # remove script elems 428 | for head_script in soup.head.findAll("script"): 429 | _ = head_script.extract() 430 | for body_async_script in soup.body.findAll("script", { "async" : "", "defer" : ""}): 431 | _ = head_script.extract() 432 | 433 | # Fixing and downloading css stylesheets 434 | theme_output_dir = os.path.join(documents_dir, Configuration.domain) 435 | for link in soup.head.findAll("link", 
{ "rel" : "stylesheet"}): 436 | uri_path = link['href'].strip() 437 | 438 | if not uri_path.lstrip('/').startswith(Configuration.default_theme_uri): 439 | continue 440 | 441 | # Construct (url, path) tuple 442 | css_url = "https://%s/%s" % (Configuration.domain, uri_path) 443 | css_filepath = os.path.join(theme_output_dir, uri_path.lstrip('/')) 444 | 445 | # Converting href to a relative link 446 | path = os.path.relpath(css_filepath, os.path.dirname(index_html_path)) 447 | rel_uri = '/'.join(path.split(os.sep)) 448 | link['href'] = rel_uri 449 | 450 | download_textfile(css_url, css_filepath) 451 | 452 | return soup 453 | 454 | 455 | def rewrite_html_contents(configuration : Configuration, html_root_dir : str): 456 | """ rewrite every html file downloaded """ 457 | 458 | additional_resources = set() 459 | 460 | for html_file in glob.glob("%s/**/*.html" % html_root_dir, recursive = True): 461 | 462 | logging.debug("rewrite html_file : %s" % (html_file)) 463 | 464 | # Read content and parse html 465 | with open(html_file, 'r', encoding='utf8') as i_fd: 466 | html_content = i_fd.read() 467 | 468 | soup = bs(html_content, 'html.parser') 469 | 470 | # rewrite html 471 | soup, resources = rewrite_soup(configuration, soup, html_file, html_root_dir) 472 | additional_resources = additional_resources.union(resources) 473 | 474 | # Export fixed html 475 | fixed_html = soup.prettify("utf-8") 476 | with open(html_file, 'wb') as o_fd: 477 | o_fd.write(fixed_html) 478 | 479 | return additional_resources 480 | 481 | 482 | def download_additional_resources(configuration : Configuration, documents_dir : str, resources_to_dl : set = set()): 483 | """ Download optional resources for "beautification """ 484 | 485 | for resource in resources_to_dl: 486 | 487 | download_textfile( 488 | resource.url, 489 | os.path.join(documents_dir, resource.path) 490 | ) 491 | 492 | # Download index start page 493 | index_url = Configuration.default_url % configuration.powershell_version 494 | 
index_filepath = os.path.join(documents_dir, Configuration.domain, "en-us", "index.html") 495 | 496 | soup = bs( configuration.webdriver.get_url_page(index_url), 'html.parser') 497 | soup = rewrite_index_soup(configuration, soup, index_filepath, documents_dir) 498 | fixed_html = soup.prettify("utf-8") 499 | with open(index_filepath, 'wb') as o_fd: 500 | o_fd.write(fixed_html) 501 | 502 | 503 | # Download module.svg icon for start page 504 | icon_module_url = '/'.join(["https:/" , Configuration.domain, "en-us", "media", "toolbars", "module.svg"]) 505 | icon_module_path = os.path.join(documents_dir, Configuration.domain, "en-us", "media", "toolbars", "module.svg") 506 | download_binary(icon_module_url, icon_module_path) 507 | 508 | 509 | def create_sqlite_database(configuration, content_toc, resources_dir, documents_dir): 510 | """ Indexing the html document in a format Dash can understand """ 511 | 512 | def insert_into_sqlite_db(cursor, name, record_type, path): 513 | """ Insert a new unique record in the sqlite database. 
""" 514 | try: 515 | cursor.execute('SELECT rowid FROM searchIndex WHERE path = ?', (path,)) 516 | dbpath = cursor.fetchone() 517 | cursor.execute('SELECT rowid FROM searchIndex WHERE name = ?', (name,)) 518 | dbname = cursor.fetchone() 519 | 520 | if dbpath is None and dbname is None: 521 | cursor.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, record_type, path)) 522 | logging.debug('DB add [%s] >> name: %s, path: %s' % (record_type, name, path)) 523 | else: 524 | logging.debug('record exists') 525 | 526 | except: 527 | pass 528 | 529 | sqlite_filepath = os.path.join(resources_dir, "docSet.dsidx") 530 | if os.path.exists(sqlite_filepath): 531 | os.remove(sqlite_filepath) 532 | 533 | db = sqlite3.connect(sqlite_filepath) 534 | cur = db.cursor() 535 | cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);') 536 | cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);') 537 | 538 | 539 | for module_name, module in content_toc.items(): 540 | 541 | # path should be unix compliant 542 | module_path = module['index'].replace(os.sep, '/') 543 | insert_into_sqlite_db(cur, module_name, "Module", module_path) 544 | 545 | for cmdlet in module['cmdlets']: 546 | 547 | cmdlet_name = cmdlet['name'] 548 | if cmdlet_name == module_name: 549 | continue 550 | 551 | # path should be unix compliant 552 | cmdlet_path = cmdlet['path'].replace(os.sep, '/') 553 | 554 | insert_into_sqlite_db(cur, cmdlet_name, "Command", cmdlet_path) 555 | 556 | 557 | # commit and close db 558 | db.commit() 559 | db.close() 560 | 561 | def copy_folder(src_folder : str, dst_folder : str): 562 | """ Copy a full folder tree anew every time """ 563 | 564 | def onerror(func, path, exc_info): 565 | """ 566 | Error handler for ``shutil.rmtree``. 567 | 568 | If the error is due to an access error (read only file) 569 | it attempts to add write permission and then retries. 

        If the error is for another reason it re-raises the error.

        Usage : ``shutil.rmtree(path, onerror=onerror)``
        """
        # Local import: `stat` is only needed by this rmtree error hook.
        import stat

        # Nothing to do if the offending path has already vanished.
        if not os.path.exists(path):
            return

        if not os.access(path, os.W_OK):
            # Is the error an access error ?
            # Read-only entry (typical on Windows): make it user-writable
            # and retry the failed operation (`func` is the rmtree callable
            # that originally failed, e.g. os.unlink).
            os.chmod(path, stat.S_IWUSR)
            func(path)
        else:
            # Not a permission problem: re-raise the exception currently
            # being handled by shutil.rmtree (bare raise is valid here
            # because onerror runs inside rmtree's except block).
            raise

    # Replace dst_folder wholesale with a fresh copy of src_folder.
    shutil.rmtree(dst_folder,ignore_errors=False,onerror=onerror)
    shutil.copytree(src_folder, dst_folder)

def merge_folders(src: str, dst: str) -> None:
    """Recursively merge the tree rooted at ``src`` into ``dst``.

    Unlike ``shutil.copytree``, this tolerates ``dst`` (or subfolders of it)
    already existing; individual files are copied with ``shutil.copyfile``,
    overwriting any same-named file already present in ``dst``.
    """
    if os.path.isdir(src):

        if not os.path.exists(dst):
            os.makedirs(dst)

        # Recurse entry by entry so existing destination folders are merged
        # instead of rejected.
        for name in os.listdir(src):
            merge_folders(
                os.path.join(src, name),
                os.path.join(dst, name)
            )
    else:
        shutil.copyfile(src, dst)

def main(configuration : Configuration) -> None:
    """Run the full docset build pipeline: scrape, rewrite, index, package.

    The pipeline stages write into numbered sub-folders of
    ``configuration.build_folder`` so each step's output can be inspected.
    Stage order matters: each stage consumes the previous stage's folder.
    """
    # """ Scheme for content toc :
    # {
    #     module_name : {
    #         'name' : str,
    #         'index' : relative path,
    #         'cmdlets' : [
    #             {
    #                 'name' : str,
    #                 'path' : relative path,
    #             },
    #             ...
    #         ]
    #     },
    #     ...
    # }
    # """
    content_toc = {}
    resources_to_dl = set()

    """ 0. Prepare folders """
    download_dir = os.path.join(configuration.build_folder, "_1_downloaded_contents")
    # NOTE: win10 content is cached under the *current* directory on purpose,
    # so it survives --temporary builds (see the "do not download twice"
    # comment below); it is also listed in .gitignore.
    win10_download_dir = os.path.join(os.getcwd(), "_win10_downloaded_contents")
    html_rewrite_dir = os.path.join(configuration.build_folder, "_2_html_rewrite")
    additional_resources_dir = os.path.join(configuration.build_folder, "_3_additional_resources")
    package_dir = os.path.join(configuration.build_folder, "_4_ready_to_be_packaged")

    for folder in [download_dir, html_rewrite_dir, additional_resources_dir, package_dir]:
        os.makedirs(folder, exist_ok=True)

    # _4_ready_to_be_packaged is the final build dir
    # Standard Dash docset layout: <name>.docset/Contents/Resources/Documents
    docset_dir = os.path.join(package_dir, "%s.docset" % Configuration.docset_name)
    content_dir = os.path.join(docset_dir , "Contents")
    resources_dir = os.path.join(content_dir, "Resources")
    document_dir = os.path.join(resources_dir, "Documents")

    """ 1. Download html pages """
    logging.info("[1] scraping web contents")
    content_toc = crawl_posh_contents(configuration, configuration.docs_toc_url, download_dir)

    # do not download twice the win10 api since it's quite a handful
    if os.path.exists(os.path.join(win10_download_dir, "toc.json")):
        with open(os.path.join(win10_download_dir, "toc.json"), "r") as content:
            windows_toc = json.load(content)
    else:
        windows_toc = crawl_posh_contents(configuration, configuration.windows_toc_url, win10_download_dir)
        with open(os.path.join(win10_download_dir, "toc.json"), "w") as content:
            json.dump(windows_toc, content)

    # Merge win10 api content
    merge_folders(win10_download_dir, download_dir)
    # NOTE(review): dict.update lets win10 entries overwrite same-named
    # powershell modules — presumably module names never collide; confirm.
    content_toc.update(windows_toc)
    with open(os.path.join(download_dir, "toc.json"), "w") as content:
        json.dump(content_toc, content)

    """ 2. Parse and rewrite html contents """
    logging.info("[2] rewriting urls and hrefs")
    copy_folder(download_dir, html_rewrite_dir)
    resources_to_dl = rewrite_html_contents(configuration, html_rewrite_dir)

    """ 3. Download additionnal resources """
    logging.info("[3] download style contents")
    copy_folder(html_rewrite_dir, additional_resources_dir )
    download_additional_resources(configuration, additional_resources_dir, resources_to_dl)

    """ 4. Database indexing """
    logging.info("[4] indexing to database")
    copy_folder(additional_resources_dir, document_dir )
    create_sqlite_database(configuration, content_toc, resources_dir, document_dir)

    """ 5. Archive packaging """
    # Static assets ship next to this script; resolve them relative to it
    # so the build works from any working directory.
    src_dir = os.path.dirname(__file__)
    shutil.copy(os.path.join(src_dir, "static/Info.plist"), content_dir)
    shutil.copy(os.path.join(src_dir, "static/DASH_LICENSE"), os.path.join(resources_dir, "LICENSE"))
    shutil.copy(os.path.join(src_dir, "static/icon.png"), docset_dir)
    shutil.copy(os.path.join(src_dir, "static/icon@2x.png"), docset_dir)

    output_dir = os.path.dirname(configuration.output_filepath)
    os.makedirs(output_dir, exist_ok=True)

    logging.info("[5] packaging as a dash docset")
    make_docset(
        docset_dir,
        configuration.output_filepath,
        Configuration.docset_name
    )


if __name__ == '__main__':



    parser = argparse.ArgumentParser(
        description='Dash docset creation script for Powershell modules and Cmdlets'
    )

    # NOTE: -v is taken by --version below, hence -vv for verbosity.
    parser.add_argument("-vv", "--verbose",
        help="increase output verbosity",
        action="store_true"
    )

    parser.add_argument("-v", "--version",
        help="select powershell API versions",
        default = "7.1",
        choices = [
            "5.1",
            "7.0", # LTS
            "7.1"  # current
        ]
    )

    parser.add_argument("-t", "--temporary",
        help="Use a temporary directory for creating docset, otherwise use current dir.",
        default=False,
        action="store_true"
    )

    parser.add_argument("-l", "--local",
        help="Do not download content. Only for development use.\n" +
             "Incompatible with --temporary option",
        default=False,
        action="store_true"
    )

    parser.add_argument("-o", "--output",
        help="set output filepath",
        default = os.path.join(os.getcwd(), "Powershell.tgz"),
    )

    parser.add_argument("-p", "--phantom",
        help="path to phantomjs executable",
        default = None,
    )

    # nargs='+' yields a list of module-name strings; empty list == no filter.
    parser.add_argument("-m", "--modules",
        help="filter on selected modules",
        default = [],
        type=str,
        nargs='+'
    )

    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
        # Keep the HTTP stack quiet even in debug mode.
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

    conf = Configuration( args )

    if args.temporary:

        # Build in a throwaway directory that is deleted on exit; only the
        # packaged output written to conf.output_filepath survives.
        with tempfile.TemporaryDirectory() as tmp_builddir:
            conf.build_folder = tmp_builddir
            main(conf)
    else:
        main(conf)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasg/powershell-docset/d9becce8783a572525b94c2660f480569262ab92/requirements.txt
--------------------------------------------------------------------------------
/screenshots/posh-docset.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasg/powershell-docset/d9becce8783a572525b94c2660f480569262ab92/screenshots/posh-docset.PNG
--------------------------------------------------------------------------------
/static/DASH_LICENSE: -------------------------------------------------------------------------------- 1 | You are not allowed to distribute or make use of any of the files within this folder ("Resources") without written permission from Kapeli or whilst using the Dash app developed by Kapeli. This does not apply to the files located within the "Documents" folder. -------------------------------------------------------------------------------- /static/Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CFBundleIdentifier 6 | posh 7 | 8 | CFBundleName 9 | Powershell 10 | 11 | DashDocSetFallbackURL 12 | https://docs.microsoft.com/en-US/powershell/module/ 13 | 14 | dashIndexFilePath 15 | docs.microsoft.com/en-US/index.html 16 | 17 | DashDocSetFamily 18 | posh 19 | 20 | DocSetPlatformFamily 21 | posh 22 | 23 | isDashDocset 24 | 25 | 26 | isJavaScriptEnabled 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /static/docset-template/README.md: -------------------------------------------------------------------------------- 1 | [Powershell modules][1] Docset 2 | ================ 3 | 4 | Author: [lucasg][2] 5 | 6 | #### Generation steps: 7 | 8 | `posh-to-dash.py` is written for Python 3, and has been tested on Windows and Linux. 
9 | 10 | ``` 11 | pip install selenium requests bs4 12 | 13 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/6/Powershell.tgz --version=6 14 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/5.1/Powershell.tgz --version=5.1 15 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/5.0/Powershell.tgz --version=5.0 16 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/4.0/Powershell.tgz --version=4.0 17 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/3.0/Powershell.tgz --version=3.0 18 | 19 | cp static/icon.PNG Powershell/icon.png 20 | cp static/icon@2x.PNG Powershell/icon@2x.png 21 | cp Powershell/versions/6/Powershell.tgz Powershell/Powershell.tgz 22 | 23 | ``` 24 | 25 | Otherwise, look at the `.travis` generation script for an up to date build recipe : `https://github.com/lucasg/powershell-docset/blob/master/.travis.yml` 26 | 27 | 28 | [1]: https://docs.microsoft.com/en-us/powershell/module/ 29 | [2]: https://github.com/lucasg -------------------------------------------------------------------------------- /static/docset-template/create-versioned-docset-json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import argparse 4 | from datetime import datetime 5 | 6 | docset_json = """{ 7 | "name": "Powershell", 8 | "version": "%s/%s", 9 | "archive": "Powershell.tgz", 10 | "author": { 11 | "name": "lucasg", 12 | "link": "https://github.com/lucasg" 13 | }, 14 | "aliases": ["Windows shell", 15 | "posh", 16 | "Cmdlets", 17 | "Windows automation"], 18 | 19 | "specific_versions": [ 20 | 21 | { 22 | "version": "6", 23 | "archive": "versions/6/Powershell.tgz", 24 | }, 25 | { 26 | "version": "5.1", 27 | "archive": "versions/5.1/Powershell.tgz", 28 | }, 29 | { 30 | "version": "5.0", 31 | "archive": "versions/5.0/Powershell.tgz", 32 | }, 33 | { 34 | "version": "4.0", 35 | "archive": 
"versions/4.0/Powershell.tgz", 36 | }, 37 | { 38 | "version": "3.0", 39 | "archive": "versions/3.0/Powershell.tgz", 40 | }, 41 | ] 42 | } 43 | """ 44 | 45 | if __name__ == '__main__': 46 | 47 | parser = argparse.ArgumentParser( 48 | description='Create a timestamped versioned docset.json file' 49 | ) 50 | 51 | parser.add_argument("-v", "--version", 52 | help="set powershell docset version", 53 | required=True 54 | ) 55 | 56 | parser.add_argument("-o", "--output", 57 | help="set output filepath", 58 | default = os.path.join(os.getcwd(), "Powershell", "docset.json"), 59 | ) 60 | 61 | args = parser.parse_args() 62 | 63 | with open(args.output, "w") as out: 64 | date = datetime.strftime(datetime.utcnow(), "%y-%m-%d") 65 | version = args.version.lstrip("v") 66 | out.write(docset_json % (version, date)) -------------------------------------------------------------------------------- /static/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasg/powershell-docset/d9becce8783a572525b94c2660f480569262ab92/static/icon.png -------------------------------------------------------------------------------- /static/icon@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucasg/powershell-docset/d9becce8783a572525b94c2660f480569262ab92/static/icon@2x.png --------------------------------------------------------------------------------