├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── posh-to-dash.py
├── requirements.txt
├── screenshots
│   └── posh-docset.PNG
└── static
    ├── DASH_LICENSE
    ├── Info.plist
    ├── docset-template
    │   ├── README.md
    │   └── create-versioned-docset-json.py
    ├── icon.png
    └── icon@2x.png
/.gitignore:
--------------------------------------------------------------------------------
1 | ghostdriver.log
2 | _win10_downloaded_contents
3 | geckodriver.log
4 | Powershell.tgz
5 | _build*
6 | Pipfile*
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - '3.6'
4 |
5 | # Used to properly name build artifacts
6 | env:
7 | global:
8 | - ARTIFACT_VER="`git describe --tags`"
9 | - ARTIFACT_NAME="posh-docsets-$ARTIFACT_VER.zip"
10 | - MOZ_HEADLESS=1
11 | - FIREFOX_PATH="`which firefox`"
12 |
13 | addons:
14 | firefox: latest
15 | chrome: stable
16 |
17 | deploy:
18 | provider: releases
19 | api_key:
20 | secure: ZkANjBeq7LNuteAD9LsjOaOXIAHI8Wof6Vn+CkWq+kd7sW2ymjPrucPLEoHYeWrcJcuL67C7C2LXvl30IDaBPrBQbcoAJ2vAcPsK2iIRZxqL2FSp8CpCO1BB64gYqJ+6FZ9ZHrYNfAQNhBGiuaT7YzxBF6qP1ODy8ofrSpRauQMssPV28CRQQ9vek3um0QsAth/FtcUP7je5/8IImZDsxzFYDsSTn2MjnrtCc7x9EEuIel59b+1Cw5k6oLzHQP1IIXmyt2AvMI2v/Qvr/FiByg49vM0Kb8rdqsFlx37MaORY5jolpuL0iND5SuTLNsdC4r6yfyp4bLg9kG0VaevU9QK0mYqD8VQIikE8mMsIVLc1jC6tzrK8A5rIZwRo8Ug7We05TEUssidqzXImMy1AYTPSBvoM1iuAYdEewncOCRqeFrZpsD52YD9gp9LqsTVWJ/iV0UnXLg6owgrRrE8Os/vvb3rK4c7ev2UcT0//lJutmg4E0WAtOtI0d4FhGvaPFh8GVmdTwt38cJgsVcaDD1ATUB03vlafT2LTnbaSCmP9BYB+2Sc3Ml3nRcCTjguUaNX6goGr0G7uiCqo3Eyf9NRaKPFd5IYHMKqDHf1z1JSDp3/hPrzO7RtbW6iB96SjCnBXK8ddg42oG7d4dRlbAmQidS1a3cWy34ddwnlxlaE=
21 | file: $ARTIFACT_NAME
22 | skip_cleanup: true
23 | all_branches: true
24 | on:
25 | repo: lucasg/powershell-docset
26 | tags: true
27 |
28 | addons:
29 | artifacts:
30 | # ⋮
31 | paths:
32 | - $(ls *.zip | tr "\n" ":")
33 |
34 | notifications:
35 | email:
36 | recipients:
37 | - lucas.georges@outlook.com
38 | on_success: never # default: change
39 | on_failure: always # default: always
40 |
41 | install:
42 | - pip install selenium requests bs4
43 |
44 | before_install:
45 | # dynamically resolve the correct chromedriver version to install from chrome's version
46 | - CHROME_MAIN_VERSION=`google-chrome-stable --version | sed -E 's/(^Google Chrome |\.[0-9]+ )//g'`
47 | - CHROMEDRIVER_VERSION=`curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROME_MAIN_VERSION"`
48 | - wget "https://chromedriver.storage.googleapis.com/${CHROMEDRIVER_VERSION}/chromedriver_linux64.zip"
49 | - unzip chromedriver_linux64.zip
50 | - chmod +x chromedriver
51 | - export PATH=$PATH:$PWD/
52 |
53 | script:
54 | - mkdir -p Powershell
55 |
56 | - python posh-to-dash.py --temporary --output=Powershell/versions/7.1/Powershell.tgz --version=7.1
57 | # - python posh-to-dash.py --temporary --output=Powershell/versions/5.1/Powershell.tgz --version=5.1
58 | # - python posh-to-dash.py --temporary --output=Powershell/versions/5.0/Powershell.tgz --version=5.0
59 | # - python posh-to-dash.py --temporary --output=Powershell/versions/4.0/Powershell.tgz --version=4.0
60 | # - python posh-to-dash.py --temporary --output=Powershell/versions/3.0/Powershell.tgz --version=3.0
61 |
62 | - cp static/icon.png Powershell/icon.png
63 | - cp static/icon@2x.png Powershell/icon@2x.png
64 | - cp Powershell/versions/7.1/Powershell.tgz Powershell/Powershell.tgz
65 |
66 | - cp static/docset-template/README.md Powershell/README.md
67 | - python static/docset-template/create-versioned-docset-json.py --output=Powershell/docset.json --version=$ARTIFACT_VER
68 |
69 | - zip -r $ARTIFACT_NAME Powershell
70 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 lucasg
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # powershell-docset: a Dash docset for Powershell modules
2 |
3 | ### Status
4 |
5 | [](https://travis-ci.org/lucasg/powershell-docset)
6 |
7 | `posh-to-dash.py` scrapes the `https://docs.microsoft.com/en-us/powershell/module/` website in order to create an offline, Dash-compatible archive that can be viewed in `Dash`, `Zeal` or `Velocity`:
8 |
9 |
10 | ![Powershell docset in Zeal](screenshots/posh-docset.PNG)
11 |
12 |
13 | ## Releases
14 |
15 | - [v0.1 -- Minimal working version](https://github.com/lucasg/powershell-docset/releases/tag/v0.1)
16 | - [v0.2 -- Offline mode supported](https://github.com/lucasg/powershell-docset/releases/tag/v0.2)
17 | - [v0.3 -- travis setup](https://github.com/lucasg/powershell-docset/releases/tag/v0.3)
18 | - [v0.4 -- user contributed docset](https://github.com/lucasg/powershell-docset/releases/tag/v0.4)
19 | - [v0.5 -- versioned docsets](https://github.com/lucasg/powershell-docset/releases/tag/v0.5)
20 | - [v0.6 -- windows 10 modules documentation](https://github.com/lucasg/powershell-docset/releases/tag/v0.6)
21 | - [v0.7.2 -- powershell 7.1 documentation](https://github.com/lucasg/powershell-docset/releases/tag/v0.7.2)
22 |
23 | ## Installation & Execution
24 |
25 | `posh-to-dash.py` relies on:
26 |
27 | - `requests` for HTTP(S) downloads
28 | - `selenium`, driving a headless browser (Chrome by default), for web scraping
29 | - `bs4` for HTML parsing and rewriting
30 |
31 | 1. Clone the repository
32 | 2. Install the dependencies from `requirements.txt`; use a virtualenv to avoid dependency and version conflicts.
33 | 3. Download a webdriver for your browser: the script currently drives headless Chrome, so grab the [chromedriver](https://chromedriver.chromium.org/downloads) matching your Chrome version and your OS (earlier revisions drove Firefox via the geckodriver from [Mozilla's Repo](https://github.com/mozilla/geckodriver/releases)).
34 | 4. Place the webdriver in your path
35 |
36 | - On Windows, grab the executable and place it in `%USERPROFILE%\AppData\Local\Microsoft\WindowsApps`
37 |
38 | - On Linux, move it to `~/.local/bin` or any other directory on your `PATH`
39 |
40 | 5. Start scraping by typing: `posh-to-dash.py --output=$outputfile --version=7.1 --temporary` (a full example run is shown below)
41 |
42 | - if `--output` is not provided, `posh-to-dash.py` writes `Powershell.tgz` into the working directory
43 | - the `--version` switch only supports the Powershell API versions `5.1`, `7.0` and `7.1` (the default); the others have been retired by Microsoft.
44 | - `--temporary` downloads the web scraping resources into a temporary folder instead of cluttering the current directory. Note, however, that if the download fails the partial results are thrown away.
45 |
46 | **NOTE: The process takes 15+ minutes to run, and each additional version you download increases the time.**
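
A complete run might look like this (a sketch assuming a Unix shell and a chromedriver already on your `PATH`; the virtualenv name and output path are arbitrary):

```
git clone https://github.com/lucasg/powershell-docset.git
cd powershell-docset

# isolate the dependencies in a virtualenv
python3 -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt

# scrape the 7.1 API docs into a Dash-compatible archive
python posh-to-dash.py --temporary --output=Powershell.tgz --version=7.1
```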
47 |
48 | ## Add your docset to Zeal
49 |
50 | Unzip the `Powershell.tgz` archive and place the resulting `Powershell.docset` folder in `C:\Users\<username>\AppData\Local\Zeal\Zeal\docsets`
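
For example, on Windows 10 and later (which ship a bsdtar-based `tar.exe`), extracting straight into Zeal's docset folder should work from a command prompt:

```
tar -xzf Powershell.tgz -C "%LOCALAPPDATA%\Zeal\Zeal\docsets"
```

Zeal should pick up the new docset after a restart.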
51 |
52 | ## Limitations
53 |
54 | The powershell modules API endpoint is quite new, so the `docs.microsoft.com` team may change it at any time and break this scraper.
55 |
--------------------------------------------------------------------------------
/posh-to-dash.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sqlite3
4 | import os
5 | import sys
6 | import glob
7 | import re
8 | import shutil
9 | import logging
10 | import json
11 | import tarfile
12 | import tempfile
13 | import argparse
14 | import urllib.parse
15 | import urllib.error
16 | import time
17 | import collections
18 |
19 | import requests
20 | from requests.adapters import HTTPAdapter
21 | from requests.packages.urllib3.util.retry import Retry
22 | from requests.exceptions import ConnectionError
23 | from bs4 import BeautifulSoup as bs, Tag # pip install bs4
24 | from selenium import webdriver
25 | # from selenium.webdriver import Firefox
26 | # from selenium.webdriver.firefox.options import Options
27 | # from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
28 |
29 | from selenium.webdriver import Chrome
30 | from selenium.webdriver.chrome.options import Options
31 |
32 | class PoshWebDriver:
33 | """ Thin wrapper for selenium webdriver for page content retrieval """
34 |
35 | def __init__(self, executable_path = None):
36 |
37 | self.options = Options()
38 | self.options.add_argument("--headless")
39 | self.options.add_argument("--window-size=1920x1080")
40 |
41 | self.driver = webdriver.Chrome(options=self.options)
42 |
43 | # self.driver_exe_path = executable_path
44 |
45 | # if self.driver_exe_path:
46 | # binary = FirefoxBinary(executable_path)
47 | # self.driver = webdriver.Firefox(
48 | # firefox_binary=binary,
49 | # options=options,
50 | # )
51 | # else:
52 | # self.driver = webdriver.Firefox(
53 | # options=options
54 | # )
55 |
56 | def get_url_page(self, url):
57 | """ retrieve the full html content of a page after Javascript execution """
58 |
59 | index_html = None
60 | try:
61 | self.driver.get(url)
62 | index_html = self.driver.page_source
63 | except (ConnectionResetError, urllib.error.URLError) as e:
64 |             # we may have triggered an anti-scraping time ban.
65 |             # Lay low for several seconds and get back to it.
66 |
67 | self.driver.quit()
68 | time.sleep(2)
69 |
70 | # if self.driver_exe_path:
71 | # self.driver = webdriver.PhantomJS(executable_path = self.driver_exe_path)
72 | # else:
73 | # self.driver = webdriver.PhantomJS()
74 |
75 | self.driver = webdriver.Chrome(options=self.options)
76 |
77 | index_html = None
78 |
79 |         # try a second time, and let the error propagate if it fails again
80 | if not index_html:
81 | self.driver.get(url)
82 | index_html = self.driver.page_source
83 |
84 | return index_html
85 |
86 |     def quit(self):
87 |         return self.driver.quit()
88 |
89 |
90 | class Configuration:
91 |
92 | # STATIC CONSTANTS
93 | posh_doc_api_version = '0.2' # powershell doc api version, not this docset one.
94 | posh_version = '6'
95 | docset_name = 'Powershell'
96 |
97 | domain = "docs.microsoft.com"
98 | base_url = "%s/en-us/powershell/module" % domain
99 | default_url = "https://%s/?view=powershell-%%s" % (base_url)
100 | default_theme_uri = "_themes/docs.theme/master/en-us/_themes"
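    # with e.g. posh version "7.1", default_url resolves to:
    #   "https://docs.microsoft.com/en-us/powershell/module/?view=powershell-7.1"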
101 |
102 | def __init__(self, args):
103 |
104 |
105 | # selected powershell api version
106 | self.powershell_version = args.version
107 |
108 |         # The modules and cmdlets pages are "versioned" using an additional param in the GET request
109 | self.powershell_version_param = "view=powershell-{0:s}".format(self.powershell_version)
110 |
111 | # build folder (must be cleaned afterwards)
112 | self.build_folder = os.path.join(os.getcwd(), "_build_{0:s}".format(self.powershell_version))
113 |
114 | # output file
115 | self.output_filepath = os.path.realpath(args.output)
116 |
117 | # powershell docs start page
118 | self.docs_index_url = Configuration.default_url % self.powershell_version
119 |
120 | # powershell docs table of contents url
121 |         self.docs_toc_url = "https://{0:s}/psdocs/toc.json?{1:s}".format(
122 |             Configuration.base_url,
123 |             self.powershell_version_param
124 |         )
126 |
127 | self.windows_toc_url = "https://{0:s}/windowsserver2019-ps/toc.json?view=windowsserver2019-ps".format(
128 | Configuration.base_url
129 | )
130 |
131 | # selenium webdriver
132 | self.webdriver = PoshWebDriver(args.phantom)
133 |
134 | # selected module
135 | self.filter_modules = [module.lower() for module in args.modules]
136 |
137 |
138 | # Global session with retries for transient HTTP errors
139 | session = requests.Session()
140 | retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
141 | session.mount('http://', HTTPAdapter(max_retries=retries))
142 | session.mount('https://', HTTPAdapter(max_retries=retries)) # all the scraped urls are https
143 |
144 | def download_binary(url, output_filename):
145 | """ Download GET request as binary file """
146 | global session
147 |
148 | logging.debug("download_binary : %s -> %s" % (url, output_filename))
149 |
150 | # ensure the folder path actually exist
151 | os.makedirs(os.path.dirname(output_filename), exist_ok = True)
152 |
153 | r = session.get(url, stream=True)
154 | with open(output_filename, 'wb') as f:
155 | for data in r.iter_content(32*1024):
156 | f.write(data)
157 |
158 | def download_textfile(url : str , output_filename : str, params : dict = None):
159 | """ Download GET request as utf-8 text file """
160 | global session
161 |
162 | logging.debug("download_textfile : %s -> %s" % (url, output_filename))
163 |
164 | # ensure the folder path actually exist
165 | os.makedirs(os.path.dirname(output_filename), exist_ok = True)
166 |
167 | while True:
168 | try:
169 |             r = session.get(url, params = params)
170 | except ConnectionError:
171 | logging.debug("caught ConnectionError, retrying...")
172 | time.sleep(2)
173 | else:
174 | break
175 |
176 | with open(output_filename, 'w', encoding="utf8") as f:
177 | f.write(r.text)
178 |
179 |
180 | def make_docset(source_dir, dst_filepath, filename):
181 | """
182 |     Tar-gz the build directory while preserving the relative folder tree paths.
183 | Copied from : https://stackoverflow.com/a/17081026/1741450
184 | """
185 | dst_dir = os.path.dirname(dst_filepath)
186 | tar_filepath = os.path.join(dst_dir, '%s.tar' % filename)
187 |
188 | with tarfile.open(tar_filepath, "w:gz") as tar:
189 | tar.add(source_dir, arcname=os.path.basename(source_dir))
190 |
191 | shutil.move(tar_filepath, dst_filepath)
192 |
193 |
194 |
195 | def download_page_contents(configuration, uri, output_filepath):
196 |     """ Download a page using its URI from the TOC """
197 |
198 |     # Resolve the absolute url and append the appropriate version parameter
199 |     full_url = urllib.parse.urljoin(configuration.docs_toc_url, uri)
200 |     versioned_url = "{0:s}?{1:s}".format(full_url, configuration.powershell_version_param)
201 |
202 |     download_textfile(versioned_url, output_filepath)
203 |
204 |
205 | def download_module_contents(configuration, module_name, module_uri, module_dir, cmdlets, root_dir):
206 |     """ Download a module's contents """
207 |
208 | module_filepath = os.path.join(module_dir, "%s.html" % module_name)
209 |
210 | logging.debug("downloading %s module index page -> %s" % (module_name, module_filepath))
211 | if module_uri:
212 | download_page_contents(configuration, module_uri, module_filepath)
213 |
214 | cmdlets_infos = []
215 |
216 | # Downloading cmdlet contents
217 | for cmdlet in cmdlets:
218 |
219 | cmdlet_name = cmdlet['toc_title']
220 | if cmdlet_name.lower() in ("about", "functions", "providers", "provider"): # skip special toc
221 | continue
222 |
223 | cmdlet_uri = cmdlet["href"]
224 | cmdlet_filepath = os.path.join(module_dir, "%s.html" % cmdlet_name)
225 |
226 | logging.debug("downloading %s cmdlet doc -> %s" % (cmdlet_name, cmdlet_filepath))
227 | download_page_contents(configuration, cmdlet_uri, cmdlet_filepath)
228 |
229 | cmdlets_infos.append({
230 | 'name' : cmdlet_name,
231 | 'path' : os.path.relpath(cmdlet_filepath, root_dir),
232 | })
233 |
234 | module_infos = {
235 | 'name' : module_name,
236 | 'index' : os.path.relpath(module_filepath, root_dir),
237 | 'cmdlets' : cmdlets_infos
238 | }
239 |
240 | return module_infos
241 |
242 | def crawl_posh_contents(configuration: Configuration, toc_url : str, download_dir : str):
243 | """ Download Powershell modules and cmdlets content pages based on TOC """
244 |
245 | # Download toc
246 | logging.debug("Downloading powershell toc : %s" % (toc_url))
247 | r = requests.get(toc_url)
248 | modules_toc = json.loads(r.text)
249 |
250 |     # modules_toc is a web-based TOC, whereas content_toc is file-based
251 | content_toc = {}
252 |
253 | logging.debug("raw modules : %s" % [m['toc_title'] for m in modules_toc['items'][0]['children']])
254 |
255 | # optional filter on selected module
256 | modules = modules_toc['items'][0]['children']
257 | if len(configuration.filter_modules):
258 | modules = list(filter(lambda m: m['toc_title'].lower() in configuration.filter_modules, modules))
259 | logging.debug("filtered modules : %s" % [m['toc_title'] for m in modules])
260 |
261 | # Downloading modules contents
262 | for module in modules:
263 |
264 | module_name = module['toc_title']
265 | module_uri = module.get("href")
266 | module_cmdlets = module['children']
267 | module_dir = os.path.join(download_dir, Configuration.base_url, module_name)
268 |
269 | logging.info("[+] download module %s" % (module_name))
270 | module_infos = download_module_contents(configuration, module_name, module_uri, module_dir, module_cmdlets, download_dir)
271 | content_toc[module_name] = module_infos
272 |
273 | return content_toc
274 |
275 | def rewrite_soup(configuration : Configuration, soup, html_path : str, documents_dir : str):
276 |     """ rewrite html contents by fixing links and removing unnecessary cruft """
277 |
278 | # Fix navigations links
279 | links = soup.findAll("a", { "data-linktype" : "relative-path"}) # for modules and cmdlet pages
280 |     link_pattern = re.compile(r"([\w\.\/-]+)\?view=(?:powershell-|windowsserver2019-ps)")
281 |
282 | for link in links:
283 |
284 | href = link['href']
285 | fixed_href = href
286 |
287 | # go back to module
288 | if href == "./?view=powershell-%s" % configuration.powershell_version:
289 | fixed_href = "./%s.html" % link.text
290 | elif href == "./?view=windowsserver2019-ps":
291 | fixed_href = "./%s.html" % link.text
292 |
293 | # go to a cmdlet page
294 | else:
295 | targets = link_pattern.findall(href)
296 |             if not len(targets): # badly formatted 'a' link
297 | continue
298 |
299 | module_name = targets[0]
300 | fixed_href = "%s.html" % module_name
301 |
302 | if fixed_href != href:
303 | logging.debug("link rewrite : %s -> %s " % ( href, fixed_href))
304 | link['href'] = fixed_href
305 |
306 |
307 | # remove link to external references since we can't support it
308 | for abs_href in soup.findAll("a", { "data-linktype" : "absolute-path"}):
309 | abs_href.replace_with(abs_href.text)
310 |
311 | # remove unsupported nav elements
312 | nav_elements = [
313 | ["nav" , { "class" : "doc-outline", "role" : "navigation"}],
314 | ["ul" , { "class" : "breadcrumbs", "role" : "navigation"}],
315 | ["div" , { "class" : "sidebar", "role" : "navigation"}],
316 | ["div" , { "class" : "dropdown dropdown-full mobilenavi"}],
317 | ["p" , { "class" : "api-browser-description"}],
318 | ["div" , { "class" : "api-browser-search-field-container"}],
319 | ["div" , { "class" : "pageActions"}],
320 | ["div" , { "class" : "container footerContainer"}],
321 | ["div" , { "class" : "dropdown-container"}],
322 | ["div" , { "class" : "page-action-holder"}],
323 | ["div" , { "aria-label" : "Breadcrumb", "role" : "navigation"}],
324 | ["div" , { "data-bi-name" : "rating"}],
325 | ["div" , { "data-bi-name" : "feedback-section"}],
326 | ["section" , { "class" : "feedback-section", "data-bi-name" : "feedback-section"}],
327 | ["footer" , { "data-bi-name" : "footer", "id" : "footer"}],
328 | ]
329 |
330 | for nav in nav_elements:
331 | nav_class, nav_attr = nav
332 |
333 | for nav_tag in soup.findAll(nav_class, nav_attr):
334 | _ = nav_tag.extract()
335 |
336 | # remove script elems
337 | for head_script in soup.head.findAll("script"):
338 | _ = head_script.extract()
339 |
340 |     # Extract and rewrite additional stylesheets to download
341 | ThemeResourceRecord = collections.namedtuple('ThemeResourceRecord', 'url, path')
342 |
343 | theme_output_dir = os.path.join(documents_dir, Configuration.domain)
344 | theme_resources = []
345 |
346 | for link in soup.head.findAll("link", { "rel" : "stylesheet"}):
347 | uri_path = link['href'].strip()
348 |
349 | if not uri_path.lstrip('/').startswith(Configuration.default_theme_uri):
350 | continue
351 |
352 | # Construct (url, path) tuple
353 | css_url = "https://%s/%s" % (Configuration.domain, uri_path)
354 | css_filepath = os.path.join(theme_output_dir, uri_path.lstrip('/'))
355 |
356 | # Converting href to a relative link
357 | path = os.path.relpath(css_filepath, os.path.dirname(html_path))
358 | rel_uri = '/'.join(path.split(os.sep))
359 | link['href'] = rel_uri
360 |
361 | theme_resources.append( ThemeResourceRecord(
362 | url = css_url,
363 | path = os.path.relpath(css_filepath, documents_dir), # stored as relative path
364 | ))
365 |
366 | return soup, set(theme_resources)
367 |
368 | def rewrite_index_soup(configuration : Configuration, soup, index_html_path : str, documents_dir : str):
369 |     """ rewrite html contents by fixing links and removing unnecessary cruft """
370 |
371 | # Fix navigations links
372 | content_tables = soup.findAll("table", {
373 | "class" : "api-search-results"
374 | })
375 |
376 | for content_table in content_tables:
377 |
378 | links = content_table.findAll(lambda tag: tag.name == 'a')
379 | link_pattern = re.compile(r"/powershell/module/([\w\.\-]+)/\?view=powershell-")
380 |
381 | for link in links:
382 |
383 | href = link['href']
384 | fixed_href = href
385 |
386 |
387 | targets = link_pattern.findall(href)
388 | if not len(targets):
389 |                 continue # badly formatted 'a' link
390 |
391 | module_name = targets[0].lstrip('/').rstrip('/')
392 | fixed_href = "powershell/module/%s/%s.html" % (module_name, module_name)
393 |
394 | if fixed_href != href:
395 | logging.debug("link rewrite : %s -> %s " % ( href, fixed_href))
396 | link['href'] = fixed_href
397 |
398 | # Fix link to module.svg
399 | module_svg_path = os.path.join(documents_dir, Configuration.domain, "en-us", "media", "toolbars", "module.svg")
400 | images = content_table.findAll("img" , {'alt' : "Module"})
401 | for image in images:
402 | image['src'] = os.path.relpath(module_svg_path, os.path.dirname(index_html_path))
403 |
404 | # remove unsupported nav elements
405 | nav_elements = [
406 | ["nav" , { "class" : "doc-outline", "role" : "navigation"}],
407 | ["ul" , { "class" : "breadcrumbs", "role" : "navigation"}],
408 | ["div" , { "class" : "sidebar", "role" : "navigation"}],
409 | ["div" , { "class" : "dropdown dropdown-full mobilenavi"}],
410 | ["p" , { "class" : "api-browser-description"}],
411 | ["div" , { "class" : "api-browser-search-field-container"}],
412 | ["div" , { "class" : "pageActions"}],
413 | ["div" , { "class" : "dropdown-container"}],
414 | ["div" , { "class" : "container footerContainer"}],
415 | ["div" , { "data-bi-name" : "header", "id" : "headerAreaHolder"}],
416 | ["div" , { "class" : "header-holder"}],
417 | ["div" , { "id" : "action-panel"}],
418 | ["div" , { "id" : "api-browser-search-field-container"}],
419 | ]
420 |
421 | for nav in nav_elements:
422 | nav_class, nav_attr = nav
423 |
424 | for nav_tag in soup.findAll(nav_class, nav_attr):
425 | _ = nav_tag.extract()
426 |
427 | # remove script elems
428 | for head_script in soup.head.findAll("script"):
429 | _ = head_script.extract()
430 | for body_async_script in soup.body.findAll("script", { "async" : "", "defer" : ""}):
431 |         _ = body_async_script.extract()
432 |
433 | # Fixing and downloading css stylesheets
434 | theme_output_dir = os.path.join(documents_dir, Configuration.domain)
435 | for link in soup.head.findAll("link", { "rel" : "stylesheet"}):
436 | uri_path = link['href'].strip()
437 |
438 | if not uri_path.lstrip('/').startswith(Configuration.default_theme_uri):
439 | continue
440 |
441 | # Construct (url, path) tuple
442 | css_url = "https://%s/%s" % (Configuration.domain, uri_path)
443 | css_filepath = os.path.join(theme_output_dir, uri_path.lstrip('/'))
444 |
445 | # Converting href to a relative link
446 | path = os.path.relpath(css_filepath, os.path.dirname(index_html_path))
447 | rel_uri = '/'.join(path.split(os.sep))
448 | link['href'] = rel_uri
449 |
450 | download_textfile(css_url, css_filepath)
451 |
452 | return soup
453 |
454 |
455 | def rewrite_html_contents(configuration : Configuration, html_root_dir : str):
456 | """ rewrite every html file downloaded """
457 |
458 | additional_resources = set()
459 |
460 | for html_file in glob.glob("%s/**/*.html" % html_root_dir, recursive = True):
461 |
462 | logging.debug("rewrite html_file : %s" % (html_file))
463 |
464 | # Read content and parse html
465 | with open(html_file, 'r', encoding='utf8') as i_fd:
466 | html_content = i_fd.read()
467 |
468 | soup = bs(html_content, 'html.parser')
469 |
470 | # rewrite html
471 | soup, resources = rewrite_soup(configuration, soup, html_file, html_root_dir)
472 | additional_resources = additional_resources.union(resources)
473 |
474 | # Export fixed html
475 | fixed_html = soup.prettify("utf-8")
476 | with open(html_file, 'wb') as o_fd:
477 | o_fd.write(fixed_html)
478 |
479 | return additional_resources
480 |
481 |
482 | def download_additional_resources(configuration : Configuration, documents_dir : str, resources_to_dl : set = set()):
483 |     """ Download optional resources for "beautification" """
484 |
485 | for resource in resources_to_dl:
486 |
487 | download_textfile(
488 | resource.url,
489 | os.path.join(documents_dir, resource.path)
490 | )
491 |
492 | # Download index start page
493 | index_url = Configuration.default_url % configuration.powershell_version
494 | index_filepath = os.path.join(documents_dir, Configuration.domain, "en-us", "index.html")
495 |
496 | soup = bs( configuration.webdriver.get_url_page(index_url), 'html.parser')
497 | soup = rewrite_index_soup(configuration, soup, index_filepath, documents_dir)
498 | fixed_html = soup.prettify("utf-8")
499 | with open(index_filepath, 'wb') as o_fd:
500 | o_fd.write(fixed_html)
501 |
502 |
503 | # Download module.svg icon for start page
504 | icon_module_url = '/'.join(["https:/" , Configuration.domain, "en-us", "media", "toolbars", "module.svg"])
505 | icon_module_path = os.path.join(documents_dir, Configuration.domain, "en-us", "media", "toolbars", "module.svg")
506 | download_binary(icon_module_url, icon_module_path)
507 |
508 |
509 | def create_sqlite_database(configuration, content_toc, resources_dir, documents_dir):
510 | """ Indexing the html document in a format Dash can understand """
511 |
512 | def insert_into_sqlite_db(cursor, name, record_type, path):
513 | """ Insert a new unique record in the sqlite database. """
514 | try:
515 | cursor.execute('SELECT rowid FROM searchIndex WHERE path = ?', (path,))
516 | dbpath = cursor.fetchone()
517 | cursor.execute('SELECT rowid FROM searchIndex WHERE name = ?', (name,))
518 | dbname = cursor.fetchone()
519 |
520 | if dbpath is None and dbname is None:
521 | cursor.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, record_type, path))
522 | logging.debug('DB add [%s] >> name: %s, path: %s' % (record_type, name, path))
523 | else:
524 | logging.debug('record exists')
525 |
526 |         except sqlite3.Error:
527 |             logging.exception('sqlite insertion error')
528 |
529 | sqlite_filepath = os.path.join(resources_dir, "docSet.dsidx")
530 | if os.path.exists(sqlite_filepath):
531 | os.remove(sqlite_filepath)
532 |
533 | db = sqlite3.connect(sqlite_filepath)
534 | cur = db.cursor()
535 | cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
536 | cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')
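
    # Dash (and Zeal) expect exactly this searchIndex(name, type, path) layout:
    # 'type' must be a known Dash entry type (here "Module" and "Command") and
    # 'path' must be relative to the docset's Documents folder.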
537 |
538 |
539 | for module_name, module in content_toc.items():
540 |
541 | # path should be unix compliant
542 | module_path = module['index'].replace(os.sep, '/')
543 | insert_into_sqlite_db(cur, module_name, "Module", module_path)
544 |
545 | for cmdlet in module['cmdlets']:
546 |
547 | cmdlet_name = cmdlet['name']
548 | if cmdlet_name == module_name:
549 | continue
550 |
551 | # path should be unix compliant
552 | cmdlet_path = cmdlet['path'].replace(os.sep, '/')
553 |
554 | insert_into_sqlite_db(cur, cmdlet_name, "Command", cmdlet_path)
555 |
556 |
557 | # commit and close db
558 | db.commit()
559 | db.close()
560 |
561 | def copy_folder(src_folder : str, dst_folder : str):
562 | """ Copy a full folder tree anew every time """
563 |
564 | def onerror(func, path, exc_info):
565 | """
566 | Error handler for ``shutil.rmtree``.
567 |
568 | If the error is due to an access error (read only file)
569 | it attempts to add write permission and then retries.
570 |
571 | If the error is for another reason it re-raises the error.
572 |
573 | Usage : ``shutil.rmtree(path, onerror=onerror)``
574 | """
575 | import stat
576 |
577 | if not os.path.exists(path):
578 | return
579 |
580 | if not os.access(path, os.W_OK):
581 | # Is the error an access error ?
582 | os.chmod(path, stat.S_IWUSR)
583 | func(path)
584 | else:
585 | raise
586 |
587 |     shutil.rmtree(dst_folder, ignore_errors=False, onerror=onerror)
588 | shutil.copytree(src_folder, dst_folder)
589 |
590 | def merge_folders(src, dst):
591 |     """ Recursively copy src into dst, without deleting files already present in dst """
592 | if os.path.isdir(src):
593 |
594 | if not os.path.exists(dst):
595 | os.makedirs(dst)
596 |
597 | for name in os.listdir(src):
598 | merge_folders(
599 | os.path.join(src, name),
600 | os.path.join(dst, name)
601 | )
602 | else:
603 | shutil.copyfile(src, dst)
604 |
605 | def main(configuration : Configuration):
606 |
607 | # """ Scheme for content toc :
608 | # {
609 | # module_name : {
610 | # 'name' : str,
611 | # 'index' : relative path,
612 | # 'cmdlets' : [
613 | # {
614 | # 'name' : str,
615 | # 'path' : relative path,
616 | # },
617 | # ...
618 | # ]
619 | # },
620 | # ...
621 | # }
622 | # """
623 | content_toc = {}
624 | resources_to_dl = set()
625 |
626 | """ 0. Prepare folders """
627 | download_dir = os.path.join(configuration.build_folder, "_1_downloaded_contents")
628 | win10_download_dir = os.path.join(os.getcwd(), "_win10_downloaded_contents")
629 | html_rewrite_dir = os.path.join(configuration.build_folder, "_2_html_rewrite")
630 | additional_resources_dir = os.path.join(configuration.build_folder, "_3_additional_resources")
631 | package_dir = os.path.join(configuration.build_folder, "_4_ready_to_be_packaged")
632 |
633 | for folder in [download_dir, html_rewrite_dir, additional_resources_dir, package_dir]:
634 | os.makedirs(folder, exist_ok=True)
635 |
636 | # _4_ready_to_be_packaged is the final build dir
637 | docset_dir = os.path.join(package_dir, "%s.docset" % Configuration.docset_name)
638 | content_dir = os.path.join(docset_dir , "Contents")
639 | resources_dir = os.path.join(content_dir, "Resources")
640 | document_dir = os.path.join(resources_dir, "Documents")
641 |
642 | """ 1. Download html pages """
643 | logging.info("[1] scraping web contents")
644 | content_toc = crawl_posh_contents(configuration, configuration.docs_toc_url, download_dir)
645 |
646 |     # do not download the win10 api twice, since it is quite large
647 | if os.path.exists(os.path.join(win10_download_dir, "toc.json")):
648 | with open(os.path.join(win10_download_dir, "toc.json"), "r") as content:
649 | windows_toc = json.load(content)
650 | else:
651 | windows_toc = crawl_posh_contents(configuration, configuration.windows_toc_url, win10_download_dir)
652 | with open(os.path.join(win10_download_dir, "toc.json"), "w") as content:
653 | json.dump(windows_toc, content)
654 |
655 | # Merge win10 api content
656 | merge_folders(win10_download_dir, download_dir)
657 | content_toc.update(windows_toc)
658 | with open(os.path.join(download_dir, "toc.json"), "w") as content:
659 | json.dump(content_toc, content)
660 |
661 | """ 2. Parse and rewrite html contents """
662 | logging.info("[2] rewriting urls and hrefs")
663 | copy_folder(download_dir, html_rewrite_dir)
664 | resources_to_dl = rewrite_html_contents(configuration, html_rewrite_dir)
665 |
666 |     """ 3. Download additional resources """
667 | logging.info("[3] download style contents")
668 | copy_folder(html_rewrite_dir, additional_resources_dir )
669 | download_additional_resources(configuration, additional_resources_dir, resources_to_dl)
670 |
671 | """ 4. Database indexing """
672 | logging.info("[4] indexing to database")
673 | copy_folder(additional_resources_dir, document_dir )
674 | create_sqlite_database(configuration, content_toc, resources_dir, document_dir)
675 |
676 | """ 5. Archive packaging """
677 | src_dir = os.path.dirname(__file__)
678 | shutil.copy(os.path.join(src_dir, "static/Info.plist"), content_dir)
679 | shutil.copy(os.path.join(src_dir, "static/DASH_LICENSE"), os.path.join(resources_dir, "LICENSE"))
680 | shutil.copy(os.path.join(src_dir, "static/icon.png"), docset_dir)
681 | shutil.copy(os.path.join(src_dir, "static/icon@2x.png"), docset_dir)
682 |
683 | output_dir = os.path.dirname(configuration.output_filepath)
684 | os.makedirs(output_dir, exist_ok=True)
685 |
686 | logging.info("[5] packaging as a dash docset")
687 | make_docset(
688 | docset_dir,
689 | configuration.output_filepath,
690 | Configuration.docset_name
691 | )
692 |
693 |
694 | if __name__ == '__main__':
695 |
696 |
697 |
698 | parser = argparse.ArgumentParser(
699 | description='Dash docset creation script for Powershell modules and Cmdlets'
700 | )
701 |
702 | parser.add_argument("-vv", "--verbose",
703 | help="increase output verbosity",
704 | action="store_true"
705 | )
706 |
707 | parser.add_argument("-v", "--version",
708 |         help="select the powershell API version",
709 | default = "7.1",
710 | choices = [
711 | "5.1",
712 | "7.0", # LTS
713 | "7.1" # current
714 | ]
715 | )
716 |
717 | parser.add_argument("-t", "--temporary",
718 | help="Use a temporary directory for creating docset, otherwise use current dir.",
719 | default=False,
720 | action="store_true"
721 | )
722 |
723 | parser.add_argument("-l", "--local",
724 | help="Do not download content. Only for development use.\n" +
725 | "Incompatible with --temporary option",
726 | default=False,
727 | action="store_true"
728 | )
729 |
730 | parser.add_argument("-o", "--output",
731 | help="set output filepath",
732 | default = os.path.join(os.getcwd(), "Powershell.tgz"),
733 | )
734 |
735 | parser.add_argument("-p", "--phantom",
736 | help="path to phantomjs executable",
737 | default = None,
738 | )
739 |
740 | parser.add_argument("-m", "--modules",
741 | help="filter on selected modules",
742 | default = [],
743 | type=str,
744 | nargs='+'
745 | )
746 |
747 | args = parser.parse_args()
748 | if args.verbose:
749 | logging.basicConfig(level=logging.DEBUG)
750 | logging.getLogger("requests").setLevel(logging.WARNING)
751 | logging.getLogger("urllib3").setLevel(logging.WARNING)
752 | else:
753 | logging.basicConfig(level=logging.INFO)
754 |
755 | conf = Configuration( args )
756 |
757 | if args.temporary:
758 |
759 | with tempfile.TemporaryDirectory() as tmp_builddir:
760 | conf.build_folder = tmp_builddir
761 | main(conf)
762 | else:
763 | main(conf)
764 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasg/powershell-docset/d9becce8783a572525b94c2660f480569262ab92/requirements.txt
--------------------------------------------------------------------------------
/screenshots/posh-docset.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasg/powershell-docset/d9becce8783a572525b94c2660f480569262ab92/screenshots/posh-docset.PNG
--------------------------------------------------------------------------------
/static/DASH_LICENSE:
--------------------------------------------------------------------------------
1 | You are not allowed to distribute or make use of any of the files within this folder ("Resources") without written permission from Kapeli or whilst using the Dash app developed by Kapeli. This does not apply to the files located within the "Documents" folder.
--------------------------------------------------------------------------------
/static/Info.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 |     <key>CFBundleIdentifier</key>
6 |     <string>posh</string>
7 |     <key>CFBundleName</key>
8 |     <string>Powershell</string>
9 |     <key>DashDocSetFallbackURL</key>
10 |     <string>https://docs.microsoft.com/en-US/powershell/module/</string>
11 |     <key>dashIndexFilePath</key>
12 |     <string>docs.microsoft.com/en-US/index.html</string>
13 |     <key>DashDocSetFamily</key>
14 |     <string>posh</string>
15 |     <key>DocSetPlatformFamily</key>
16 |     <string>posh</string>
17 |     <key>isDashDocset</key>
18 |     <true/>
19 |     <key>isJavaScriptEnabled</key>
20 |     <true/>
21 | </dict>
22 | </plist>
--------------------------------------------------------------------------------
/static/docset-template/README.md:
--------------------------------------------------------------------------------
1 | [Powershell modules][1] Docset
2 | ================
3 |
4 | Author: [lucasg][2]
5 |
6 | #### Generation steps:
7 |
8 | `posh-to-dash.py` is written for Python 3, and has been tested on Windows and Linux.
9 |
10 | ```
11 | pip install selenium requests bs4
12 |
13 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/6/Powershell.tgz --version=6
14 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/5.1/Powershell.tgz --version=5.1
15 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/5.0/Powershell.tgz --version=5.0
16 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/4.0/Powershell.tgz --version=4.0
17 | python posh-to-dash.py --verbose --temporary --output=Powershell/versions/3.0/Powershell.tgz --version=3.0
18 |
19 | cp static/icon.png Powershell/icon.png
20 | cp static/icon@2x.png Powershell/icon@2x.png
21 | cp Powershell/versions/6/Powershell.tgz Powershell/Powershell.tgz
22 |
23 | ```
24 |
25 | Otherwise, look at the `.travis.yml` generation script for an up-to-date build recipe: `https://github.com/lucasg/powershell-docset/blob/master/.travis.yml`
26 |
27 |
28 | [1]: https://docs.microsoft.com/en-us/powershell/module/
29 | [2]: https://github.com/lucasg
--------------------------------------------------------------------------------
/static/docset-template/create-versioned-docset-json.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import argparse
4 | from datetime import datetime
5 |
6 | docset_json = """{
7 | "name": "Powershell",
8 | "version": "%s/%s",
9 | "archive": "Powershell.tgz",
10 | "author": {
11 | "name": "lucasg",
12 | "link": "https://github.com/lucasg"
13 | },
14 | "aliases": ["Windows shell",
15 | "posh",
16 | "Cmdlets",
17 | "Windows automation"],
18 |
19 |     "specific_versions": [
20 |
21 |         {
22 |             "version": "6",
23 |             "archive": "versions/6/Powershell.tgz"
24 |         },
25 |         {
26 |             "version": "5.1",
27 |             "archive": "versions/5.1/Powershell.tgz"
28 |         },
29 |         {
30 |             "version": "5.0",
31 |             "archive": "versions/5.0/Powershell.tgz"
32 |         },
33 |         {
34 |             "version": "4.0",
35 |             "archive": "versions/4.0/Powershell.tgz"
36 |         },
37 |         {
38 |             "version": "3.0",
39 |             "archive": "versions/3.0/Powershell.tgz"
40 |         }
41 |     ]
42 | }
43 | """
44 |
45 | if __name__ == '__main__':
46 |
47 | parser = argparse.ArgumentParser(
48 | description='Create a timestamped versioned docset.json file'
49 | )
50 |
51 | parser.add_argument("-v", "--version",
52 | help="set powershell docset version",
53 | required=True
54 | )
55 |
56 | parser.add_argument("-o", "--output",
57 | help="set output filepath",
58 | default = os.path.join(os.getcwd(), "Powershell", "docset.json"),
59 | )
60 |
61 | args = parser.parse_args()
62 |
63 | with open(args.output, "w") as out:
64 | date = datetime.strftime(datetime.utcnow(), "%y-%m-%d")
65 | version = args.version.lstrip("v")
66 | out.write(docset_json % (version, date))
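
# Example invocation, mirroring the .travis.yml recipe (where ARTIFACT_VER comes
# from `git describe --tags`):
#
#   python static/docset-template/create-versioned-docset-json.py \
#       --output=Powershell/docset.json --version=$ARTIFACT_VER
#
# e.g. --version=v0.7.2 writes a docset.json whose "version" field is "0.7.2/<utc date>".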
--------------------------------------------------------------------------------
/static/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasg/powershell-docset/d9becce8783a572525b94c2660f480569262ab92/static/icon.png
--------------------------------------------------------------------------------
/static/icon@2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucasg/powershell-docset/d9becce8783a572525b94c2660f480569262ab92/static/icon@2x.png
--------------------------------------------------------------------------------