├── .dockerignore ├── .gitignore ├── .pre-commit-config.yaml ├── BaseDockerfile ├── CODEOWNERS ├── Dockerfile ├── LICENSE ├── README.md ├── app.py ├── img ├── AAPL.jpg └── MSFT.jpg ├── nbhtml.py └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | README.md 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | __pycache__ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb 2 | *.html 3 | .vscode 4 | *.pyc 5 | *.pyo 6 | *.pyd 7 | __pycache__ 8 | Dockerfile_Full -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-added-large-files 7 | - id: check-ast 8 | - id: check-byte-order-marker 9 | - id: check-case-conflict 10 | - id: check-docstring-first 11 | - id: check-json 12 | - id: check-merge-conflict 13 | - id: check-symlinks 14 | - id: check-toml 15 | - id: check-xml 16 | - id: detect-aws-credentials 17 | - id: detect-private-key 18 | - id: pretty-format-json 19 | - repo: https://github.com/psf/black 20 | rev: 19.10b0 21 | hooks: 22 | - id: black -------------------------------------------------------------------------------- /BaseDockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Python image. 
2 | # https://hub.docker.com/_/python 3 | FROM python:3.7 4 | 5 | COPY requirements.txt ./ 6 | 7 | RUN pip install -r requirements.txt -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @orcaman -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Python image. 2 | # https://hub.docker.com/_/python 3 | FROM python:3.7 4 | 5 | COPY requirements.txt ./ 6 | 7 | RUN pip install -r requirements.txt 8 | 9 | # Copy local code to the container image. 10 | ENV APP_HOME /app 11 | WORKDIR $APP_HOME 12 | COPY . ./ 13 | 14 | # Run the web service on container startup. Here we use the gunicorn 15 | # webserver, with one worker process and 8 threads. 16 | # For environments with multiple CPU cores, increase the number of workers 17 | # to be equal to the cores available. 18 | CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 app:app -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 skyline-ai 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nb_to_html 2 | 3 | nb_to_html is a web application that creates on-demand, dynamic HTML reports powered by Jupyter Notebooks. 4 | 5 | This is a powerful concept, as it allows a data scientist to quickly generate HTML reports that can be shared as simple URLs. 6 | 7 | It works by receiving a path to the notebook to be run (which can live in a public or private GitHub repo) and a collection of optional runtime parameters for the notebook. Once the notebook finishes executing, its contents are converted to HTML and rendered to the client. 8 | 9 | ## example 10 | 11 | Suppose you have a simple notebook that plots data for a stock symbol. 
In its cleared state, the notebook looks like this: https://github.com/orcaman/stocks_demo/blob/master/stocks_demo.ipynb 12 | 13 | This is how you would get an HTML report of the execution result for different stock symbols: 14 | 15 | AAPL (Apple): 16 | http://localhost:8080/nb_to_html?org=orcaman&repo=stocks_demo&nb_path=stocks_demo.ipynb&params={"target_stock_symbol":"AAPL"} 17 | 18 | ![Alt text](/img/AAPL.jpg?raw=true "AAPL") 19 | 20 | MSFT (Microsoft): 21 | http://localhost:8080/nb_to_html?org=orcaman&repo=stocks_demo&nb_path=stocks_demo.ipynb&params={"target_stock_symbol":"MSFT"} 22 | 23 | ![Alt text](/img/MSFT.jpg?raw=true "MSFT") 24 | 25 | For each request, nb_to_html downloads the notebook, sets its parameters, executes it, converts the result to HTML and renders it to the client. 26 | 27 | Note that the parameters in this case were: 28 | - org = orcaman (the GitHub organization/user) 29 | - repo = stocks_demo (repo name on GitHub) 30 | - nb_path = stocks_demo.ipynb (the full path to the notebook on GitHub) 31 | - params={"target_stock_symbol":"MSFT"} (the optional params argument containing the parameters to inject into the notebook). For this to work, you must tag your notebook cell with "parameters" (see the [papermill](https://github.com/nteract/papermill) docs) 32 | 33 | ## under-the-hood 34 | 35 | Under the hood, nb_to_html uses [papermill](https://github.com/nteract/papermill) to execute the notebook and nbconvert to convert the result to HTML. 36 | 37 | ## development 38 | 39 | In dev, run the app with docker: 40 | 41 | ``` 42 | docker build -t nb_to_html . 43 | ``` 44 | 45 | ``` 46 | docker run --rm -ti -p 8080:8080 -e GITHUB_TOKEN -e GOOGLE_APPLICATION_CREDENTIALS_JSON -e PORT -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e S3_BUCKET -e REDIS_PASSWORD -e REDIS_HOST -e REDIS_PORT nb_to_html 47 | ``` 48 | 49 | ## adding dependencies 50 | To add dependencies (e.g. 
a python library you need in your notebook), either pip install it via a notebook cell like so: 51 | 52 | ``` 53 | !pip install bokeh 54 | ``` 55 | 56 | Or, you could add it to the `requirements.txt` file, which already contains a few commonly used libraries (pandas, numpy, boto3, seaborn, etc.). 57 | 58 | 59 | ## server-side caching 60 | 61 | The server supports caching the results of notebook execution on Redis. If you set up the Redis configuration as explained below, the server will hash the contents of the notebook in combination with the runtime parameters and use this hash as a cache key to reuse the result of a previous execution. 62 | 63 | ### Env vars / server configuration 64 | 65 | All configuration values are *optional*: 66 | - GITHUB_TOKEN (optional, a GitHub API access token to access private repos) 67 | - GOOGLE_APPLICATION_CREDENTIALS_JSON (optional, if your notebook talks to a GCP service like Google BigQuery). Setting this variable with the contents of the GOOGLE_APPLICATION_CREDENTIALS file will allow you to read those credentials from notebooks without having to store the credentials anywhere else 68 | - S3_BUCKET (optional, an S3 bucket name). If supplied, every report will be saved to the bucket under the following format: s3://{S3_BUCKET}/nb_to_html/{year}/{month}/{report_name}.html 69 | - AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (optional, AWS credentials if S3 storage is desired) 70 | - REDIS_PASSWORD, REDIS_HOST and REDIS_PORT (optional, Redis connection details if Redis caching is desired) 71 | 72 | Then run the app using the `docker run` command above. 73 | 74 | ## deployment 75 | 76 | The application is deployed to AWS Fargate. To create a new revision: 77 | 1. Push to ECR: 78 | ``` 79 | aws ecr get-login-password --region {REGION} | docker login --username AWS --password-stdin {REPO}.dkr.ecr.{REGION}.amazonaws.com/nb_to_html 80 | 81 | docker build -t nb_to_html . 
82 | 83 | docker tag nb_to_html:latest {REPO}.dkr.ecr.{REGION}.amazonaws.com/nb_to_html:latest 84 | 85 | docker push {REPO}.dkr.ecr.{REGION}.amazonaws.com/nb_to_html:latest 86 | ``` 87 | 88 | 2. Create a new revision on Fargate: 89 | https://console.aws.amazon.com/ecs/home?region={REGION}#/clusters/default-nb-to-html-server/services (select the service and click on update) 90 | 91 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import nbhtml 3 | import redis 4 | 5 | from flask import Flask 6 | from flask import request 7 | 8 | redis_host = os.environ.get("REDIS_HOST", "localhost") 9 | redis_password = os.environ.get("REDIS_PASSWORD") 10 | redis_port = int(os.environ.get("REDIS_PORT", 6379)) 11 | cache_time = int(os.environ.get("CACHE_TIME", 259200)) 12 | redis_client = redis.StrictRedis( 13 | host=redis_host, port=redis_port, password=redis_password 14 | ) 15 | 16 | print(f"redis_host: {redis_host}") 17 | 18 | app = Flask(__name__) 19 | 20 | 21 | @app.route("/nb_to_html") 22 | def nb_to_html_request(): 23 | print(f"request url: {request.url}") 24 | 25 | # download the notebook first 26 | downloaded_nb_path = nbhtml.download_notebook_and_return_path(request) 27 | print(f"notebook downloaded to: {downloaded_nb_path}") 28 | 29 | # hash the notebook's content to see if we have run this one before 30 | nb_content_hash = nbhtml.hash_file(downloaded_nb_path) 31 | print(f"notebook content hash: {nb_content_hash}") 32 | 33 | # use a combination of the notebook content and the runtime parameters as the cache key 34 | key = nbhtml.hash_string(f"{request.url}-{nb_content_hash}") 35 | 36 | try: 37 | print(f"try to get cache key: {key}") 38 | cached = redis_client.get(key) 39 | except Exception: 40 | print(f"cannot get cache key: {key}") 41 | cached = None 42 | 43 | # cache miss (or cache unavailable): execute the notebook 44 | if cached is None: 45 | data = 
nbhtml.execute_notebook(downloaded_nb_path, request) 46 | try: 47 | redis_client.setex(key, cache_time, data) 48 | print(f"cache key set successfully: {key}") 49 | except Exception: 50 | print(f"cannot set cache key: {key}") 51 | else: 52 | # data was read from cache 53 | print(f"data was read from cache for key: {key}") 54 | data = cached 55 | 56 | return data 57 | 58 | 59 | @app.route("/health") 60 | def health(): 61 | return "OK" 62 | 63 | 64 | if __name__ == "__main__": 65 | app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) 66 | -------------------------------------------------------------------------------- /img/AAPL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/orcaman/nb_to_html/c35349b7ccd32fe8be1342cd91b4814a063560c7/img/AAPL.jpg -------------------------------------------------------------------------------- /img/MSFT.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/orcaman/nb_to_html/c35349b7ccd32fe8be1342cd91b4814a063560c7/img/MSFT.jpg -------------------------------------------------------------------------------- /nbhtml.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import subprocess 5 | import sys 6 | import shlex 7 | import boto3 8 | import hashlib 9 | from datetime import datetime 10 | 11 | 12 | class cd: 13 | """Context manager for changing the current working directory""" 14 | 15 | def __init__(self, newPath): 16 | self.newPath = newPath 17 | 18 | def __enter__(self): 19 | self.savedPath = os.getcwd() 20 | os.chdir(self.newPath) 21 | 22 | def __exit__(self, etype, value, traceback): 23 | os.chdir(self.savedPath) 24 | 25 | 26 | def download( 27 | execution_id: float, 28 | org: str, 29 | repo_name: str, 30 | file_path: str, 31 | target_dir: str, 32 | out_file: str, 33 | commit: str, 34 | ): 35 | 
print(repo_name, file_path, target_dir, commit) 36 | token = os.getenv("GITHUB_TOKEN") 37 | if token is None: 38 | print( 39 | "Warning: Environment Var GITHUB_TOKEN does not exist, set this variable to access private GitHub repos" 40 | ) 41 | 42 | cmd = "curl" 43 | if token is not None: 44 | cmd = cmd + f" -H 'Authorization: token {token}'" 45 | cmd = ( 46 | cmd + " -H 'Accept: application/vnd.github.v3.raw' " 47 | "-L https://raw.githubusercontent.com/{org}/" 48 | "{repo_name}/{commit}/{file_path} -o {out_file}".format( 49 | out_file=out_file, 50 | org=org, 51 | token=token, 52 | repo_name=repo_name, 53 | commit=commit, 54 | file_path=file_path, 55 | ) 56 | ) 57 | 58 | if not os.path.isdir(target_dir): 59 | os.makedirs(target_dir) 60 | 61 | with cd(target_dir): 62 | shcmd(cmd) 63 | 64 | 65 | def shcmd(cmd, ignore_error=False): 66 | print("Doing:", cmd) 67 | try: 68 | output = subprocess.check_output( 69 | cmd, 70 | stderr=subprocess.STDOUT, 71 | shell=True, 72 | timeout=900, 73 | universal_newlines=True, 74 | ) 75 | except subprocess.CalledProcessError as exc: 76 | print("Status : FAIL", exc.returncode, exc.output) 77 | else: 78 | print("Output: \n{}\n".format(output)) 79 | 80 | 81 | def download_notebook_and_return_path(request) -> str: 82 | org = request.args.get("org") 83 | repo = request.args.get("repo") 84 | nb_path = request.args.get("nb_path") 85 | params = request.args.get("params") 86 | 87 | execution_id = time.time() * 1000 88 | base_file_name = f"{org}.{repo}.{nb_path}".replace("/", "_") 89 | 90 | download( 91 | execution_id, 92 | org, 93 | repo, 94 | nb_path, 95 | "/tmp", 96 | f"{base_file_name}_{execution_id}.ipynb", 97 | "master", 98 | ) 99 | 100 | out_format = "ipynb" 101 | input = f"/tmp/{base_file_name}_{execution_id}.{out_format}" 102 | return input 103 | 104 | 105 | def execute_notebook(downloaded_nb_path: str, request): 106 | org = request.args.get("org") 107 | repo = request.args.get("repo") 108 | nb_path = request.args.get("nb_path") 109 | 
params = request.args.get("params") 110 | 111 | execution_id = time.time() * 1000 112 | base_file_name = f"{org}.{repo}.{nb_path}".replace("/", "_") 113 | 114 | out_format = "ipynb" 115 | input = downloaded_nb_path 116 | output = f"/tmp/{base_file_name}_{execution_id}_out.{out_format}" 117 | 118 | papermill(input, output, params) 119 | nbconvert(output) 120 | 121 | output = output[: -len("ipynb")] + "html"  # swap the ipynb extension for nbconvert's html output 122 | 123 | store(output) 124 | return readHTML(output) 125 | 126 | 127 | def readHTML(file: str): 128 | with open(file, "r") as content_file: 129 | return content_file.read() 130 | 131 | 132 | def nbconvert(report: str): 133 | shcmd(f"jupyter nbconvert --to html {report}") 134 | 135 | 136 | def papermill(input: str, output: str, params: str): 137 | params_s = "" 138 | if params is not None: 139 | d = json.loads(params) 140 | for param in d: 141 | val = d[param] 142 | if is_number(val): 143 | params_s = f"{params_s} -p {param} {val}" 144 | else: 145 | params_s = f'{params_s} -p {param} "{val}"' 146 | cmd = f"papermill {input} {output} {params_s}".strip() 147 | shcmd(cmd) 148 | 149 | 150 | def is_number(s): 151 | try: 152 | float(s) 153 | return True 154 | except (TypeError, ValueError): 155 | return False 156 | 157 | 158 | def store(output_filename: str): 159 | bucket_name = os.getenv("S3_BUCKET") 160 | if bucket_name is None: 161 | print( 162 | "Warning: Environment Var S3_BUCKET does not exist, set this variable to save generated reports on S3" 163 | ) 164 | return 165 | 166 | d = datetime.now() 167 | obj_name = f'nb_to_html/{d.year}/{d.month}/{output_filename.split("/tmp/")[1]}' 168 | s3 = boto3.client("s3") 169 | with open(output_filename, "rb") as f: 170 | s3.upload_fileobj(f, bucket_name, obj_name) 171 | 172 | 173 | def hash_file(file_path: str) -> str: 174 | """ 175 | return sha1 hash of the file 176 | """ 177 | with open(file_path, "r") as content_file: 178 | hash_object = hashlib.sha1(content_file.read().encode("utf-8")) 179 | return hash_object.hexdigest() 180 | 181 | 182 | 
def hash_string(str_to_hash: str) -> str: 183 | hash_object = hashlib.sha1(str_to_hash.encode("utf-8")) 184 | return hash_object.hexdigest() 185 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | dask==2.1.0 3 | gcsfs 4 | matplotlib 5 | papermill 6 | numpy 7 | pandas-datareader 8 | google-cloud-bigquery 9 | google-cloud-storage 10 | pandas 11 | seaborn 12 | Flask 13 | gunicorn 14 | boto3 15 | redis --------------------------------------------------------------------------------
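As an aside on the server-side caching described in the README: `app.py` derives the Redis cache key by sha1-hashing the request URL together with the sha1 of the downloaded notebook's contents (via `nbhtml.hash_string` and `nbhtml.hash_file`). A minimal standalone sketch of that derivation (the URL and notebook body below are illustrative stand-ins, not real values):

```python
import hashlib


def hash_string(s: str) -> str:
    # sha1 hex digest, mirroring nbhtml.hash_string
    return hashlib.sha1(s.encode("utf-8")).hexdigest()


# stand-in for the raw contents of the downloaded .ipynb file
nb_contents = '{"cells": [], "nbformat": 4}'
nb_content_hash = hash_string(nb_contents)

# stand-in for flask's request.url
url = "http://localhost:8080/nb_to_html?org=orcaman&repo=stocks_demo"
cache_key = hash_string(f"{url}-{nb_content_hash}")

print(cache_key)  # a 40-character sha1 hex digest
```

Because the key covers both the notebook text and the full request URL (including params), pushing a new notebook revision or changing a runtime parameter naturally produces a fresh key instead of serving a stale report.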