├── .github └── workflows │ ├── build-docs.yml │ └── publish-docs.yml ├── .gitignore ├── LICENSE ├── README.md └── docs ├── .nojekyll ├── Makefile ├── _static ├── cuda-logo.png ├── dask-logo.png ├── delta-logo.png ├── jupyter-logo.png ├── open-datastudio-logo.png ├── open-datastudio-logo.svg ├── overview.png ├── ray-logo.png ├── ray-usecase.png ├── spark-logo.png ├── spark-serverless-client-mode.png ├── spark-serverless-cluster-mode.png ├── spark-usecase.png └── zeppelin-logo.svg ├── _templates └── layout.html ├── about ├── index.rst └── overview.rst ├── business-intelligence ├── index.rst ├── metabase.rst └── superset.rst ├── computing ├── dask.rst ├── flink.rst ├── index.rst ├── ray │ ├── from_cluster_launcher.rst │ ├── from_staroid_management_console.rst │ └── index.rst └── spark │ ├── from_ods_zeppelin.rst │ ├── from_python_environment.rst │ ├── index.rst │ ├── instances.rst │ └── spark_ui.rst ├── conf.py ├── data-lake ├── delta.rst ├── hive-metastore.rst ├── index.rst └── spark-thriftserver.rst ├── index.rst ├── machine-learning ├── index.rst ├── mlflow-model-serving.rst └── mlflow-server.rst ├── notebook ├── index.rst ├── jupyter.rst └── zeppelin.rst ├── ref.rst ├── requirements.txt └── support └── index.rst /.github/workflows/build-docs.yml: -------------------------------------------------------------------------------- 1 | name: build-docs 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: ammaraskar/sphinx-action@master 14 | with: 15 | docs-folder: "./docs" 16 | -------------------------------------------------------------------------------- /.github/workflows/publish-docs.yml: -------------------------------------------------------------------------------- 1 | name: publish-docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: checkout master 13 | uses: actions/checkout@v2 14 | - name: build docs 15 | uses: ammaraskar/sphinx-action@master 16 | with: 17 | docs-folder: "./docs" 18 | - name: checkout gh-pages branch 19 | uses: actions/checkout@v2 20 | with: 21 | ref: gh-pages 22 | path: gh-pages 23 | - name: Commit documentation changes 24 | run: | 25 | cp -r docs/_build/html/* gh-pages/ 26 | cd gh-pages 27 | git config --local user.email "action@github.com" 28 | git config --local user.name "GitHub Action" 29 | git add . 30 | git commit -m "Update documentation" -a || true 31 | # The above command will fail if no changes were present, so we ignore 32 | # the return code. 
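      # Push the committed changes back to the gh-pages branch; GITHUB_TOKEN
      # is provided automatically to the workflow by GitHub Actions.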
33 | - name: Push changes 34 | uses: ad-m/github-push-action@master 35 | with: 36 | branch: gh-pages 37 | directory: gh-pages 38 | github_token: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | docs/_build 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright 2020 The Open-datastudio Authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
(project logo image markup stripped during extraction)
7 | # Open data studio
8 | 
9 | Open data studio is an open initiative that puts machine learning and large-scale data processing open-source software a click away for everyone.
10 | 
11 | ## Documentation
12 | 
13 | Please visit [open-datastudio.io](https://open-datastudio.io)
14 | 
15 | ## Projects
16 | 
17 | | Component | Project | Description | Integration Status |
18 | | ------- | --------- | ----------- | ------- |
19 | | Notebook | [jupyter](https://github.com/open-datastudio/jupyter) | Jupyter Lab | Integrated |
20 | | | [zeppelin](https://github.com/open-datastudio/zeppelin) | Integrates Apache Zeppelin with Apache Spark on Kubernetes | Integrated |
21 | | Data Lake | [hive-metastore](https://github.com/open-datastudio/hive-metastore) | Provides a Hive metastore server with a PostgreSQL database | Integrated |
22 | | | [spark-thriftserver](https://github.com/open-datastudio/spark-thriftserver) | Spark cluster on Kubernetes for ODBC/JDBC connections | Integrated |
23 | | Computing | [ray-cluster](https://github.com/open-datastudio/ray-cluster) | [Ray](https://ray.io/) cluster | Integrated |
24 | | | [spark-serverless](https://github.com/open-datastudio/spark-serverless) | On-demand [Spark](https://spark.apache.org) cluster, accessible from everywhere | Integrated |
25 | | Machine learning | [mlflow-server](https://github.com/open-datastudio/mlflow-server) | [MLflow](https://mlflow.org/) remote model tracking server and UI | Integrated
26 | | | [mlflow-model-serving](https://github.com/open-datastudio/mlflow-model-serving) | Deploy models from mlflow-server and get an endpoint | Integrated
27 | | Business Intelligence | [metabase](https://github.com/open-datastudio/metabase) | Metabase Business Intelligence | Integrated |
28 | | | [superset](https://github.com/open-datastudio/superset) | Apache Superset Business Intelligence | Integrated |
29 | | Misc | [spark](https://github.com/open-datastudio/spark) | Does not integrate with Staroid itself; publishes a docker image for the other projects | - |
30 | 
31 | 
32 | 
33 | ## How to contribute?
34 | 
35 | You can create issues or pull requests to contribute to individual repositories under [open-datastudio](https://github.com/open-datastudio).
36 | 
37 | If you'd like to create a new integration project here, please create an [issue](https://github.com/open-datastudio/datastudio/issues) in this repository.
38 | 
39 | We need your help!
40 | 
41 | ## Community
42 | 
43 | * Open data studio slack channel - [Join](https://join.slack.com/t/opendatastudio/shared_invite/zt-jq449y9j-DIPBteeWC15xBbQAqi4J4g)
44 | 
45 | ## License
46 | 
47 | Open data studio is an open source project.
48 | A LICENSE file is included in each repository.
49 | 
-------------------------------------------------------------------------------- /docs/.nojekyll: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/.nojekyll
-------------------------------------------------------------------------------- /docs/Makefile: --------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    =
6 | SPHINXBUILD   = sphinx-build
7 | SOURCEDIR    = .
8 | BUILDDIR      = _build
9 | 
10 | # Put it first so that "make" without argument is like "make help".
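# For example, "make html" builds the HTML documentation into "_build/html".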
11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | -------------------------------------------------------------------------------- /docs/_static/cuda-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/cuda-logo.png -------------------------------------------------------------------------------- /docs/_static/dask-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/dask-logo.png -------------------------------------------------------------------------------- /docs/_static/delta-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/delta-logo.png -------------------------------------------------------------------------------- /docs/_static/jupyter-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/jupyter-logo.png -------------------------------------------------------------------------------- /docs/_static/open-datastudio-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/open-datastudio-logo.png -------------------------------------------------------------------------------- /docs/_static/open-datastudio-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/_static/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/overview.png -------------------------------------------------------------------------------- /docs/_static/ray-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/ray-logo.png -------------------------------------------------------------------------------- /docs/_static/ray-usecase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/ray-usecase.png -------------------------------------------------------------------------------- /docs/_static/spark-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/spark-logo.png 
-------------------------------------------------------------------------------- /docs/_static/spark-serverless-client-mode.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/spark-serverless-client-mode.png
-------------------------------------------------------------------------------- /docs/_static/spark-serverless-cluster-mode.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/spark-serverless-cluster-mode.png
-------------------------------------------------------------------------------- /docs/_static/spark-usecase.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/open-datastudio/datastudio/5055579adf969ad6d7491454b30ab2fedbaaa067/docs/_static/spark-usecase.png
-------------------------------------------------------------------------------- /docs/_static/zeppelin-logo.svg: --------------------------------------------------------------------------------
(SVG image markup stripped during extraction)
-------------------------------------------------------------------------------- /docs/_templates/layout.html: --------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 | 
3 | {% block footer %}
4 | {{ super() }}
5 | (embedded script markup stripped during extraction)
26 | {% endblock %}
27 | 
-------------------------------------------------------------------------------- /docs/about/index.rst: --------------------------------------------------------------------------------
1 | ===============
2 | About
3 | ===============
4 | 
5 | .. toctree::
6 |    :maxdepth: 2
7 | 
8 |    Overview <overview>
9 | 
10 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/about/overview.rst: --------------------------------------------------------------------------------
1 | Overview
2 | ==================================
3 | 
4 | Open data studio is an open initiative that puts machine learning and large-scale data processing open-source software a click away for everyone.
5 | 
6 | Why open data studio?
7 | ------------------------
8 | 
9 | The ability to use data is becoming the key differentiator of businesses.
10 | 
11 | To use data, we need tools. Thankfully, machine learning, data analytics, and large-scale data processing are areas where open-source software dominates. Many great tools are available to build your ML/data pipeline.
12 | 
13 | However, architecting, installing, integrating, and maintaining such a pipeline has become a challenge of its own: getting the proper setup, configuration, experience, and best practices takes a lot of investment.
14 | We know that not every organization can invest as much as the top few trillion-dollar companies to build an ML/data pipeline.
15 | 
16 | Open data studio's goal is to minimize this gap so any organization can have the
17 | same ability from day one.
18 | 
19 | 
20 | |overview-img|
21 | 
22 | .. include:: ../ref.rst
23 | 
24 | .. |overview-img| image:: ../_static/overview.png
25 |    :width: 700px
26 |    :alt: Open data studio overview
27 | 
-------------------------------------------------------------------------------- /docs/business-intelligence/index.rst: --------------------------------------------------------------------------------
1 | =====================
2 | Business Intelligence
3 | =====================
4 | 
5 | Business Intelligence tools are designed to make sense of the huge quantities of data that organizations accumulate over time.
6 | 
7 | .. toctree::
8 |    :maxdepth: 1
9 | 
10 |    Metabase <metabase>
11 |    Superset <superset>
12 | 
13 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/business-intelligence/metabase.rst: --------------------------------------------------------------------------------
1 | =================
2 | Metabase
3 | =================
4 | 
5 | Metabase is the easy, open-source way for everyone in your company to ask questions and learn from data.
6 | Open data studio makes it easy to deploy on the cloud.
7 | 
8 | Key features
9 | 
10 | - Click to deploy. No setup required.
11 | - Postgres is pre-configured.
12 | - Connect to databases on a private network using a `secure tunnel `__
13 | 
14 | .. _metabase.Quickstart:
15 | 
16 | Metabase Quickstart
17 | -------------------
18 | 
19 | .. image:: https://staroid.com/api/run/button.svg
20 |    :target: https://staroid.com/g/open-datastudio/metabase
21 | 
22 | **Screenshots**
23 | 
24 | |metabase-screenshot|
25 | 
26 | =============================== ===================================================================
27 | Launch page                     https://staroid.com/g/open-datastudio/metabase
28 | Open data studio repository     https://github.com/open-datastudio/metabase
29 | Original repository             https://github.com/metabase/metabase
30 | Documentation                   https://www.metabase.com/docs/latest/
31 | =============================== ===================================================================
32 | 
33 | .. include:: ../ref.rst
34 | 
35 | .. |metabase-screenshot| image:: https://github.com/metabase/metabase/raw/master/docs/metabase-product-screenshot.png
36 |    :width: 700px
37 |    :alt: Metabase
-------------------------------------------------------------------------------- /docs/business-intelligence/superset.rst: --------------------------------------------------------------------------------
1 | ===============
2 | Superset
3 | ===============
4 | 
5 | .. raw:: html
6 | 
7 |    (embedded HTML stripped during extraction)
8 | 
9 | |
10 | 
11 | Apache Superset is a modern, enterprise-ready business intelligence web application.
12 | Open data studio makes it easy to deploy on the cloud.
13 | 
14 | Key features
15 | 
16 | - Click to deploy. No setup required.
17 | - Postgres and Redis are pre-configured.
18 | - Connect to databases on a private network using a `secure tunnel `__
19 | 
20 | 
21 | Superset Quickstart
22 | -------------------
23 | 
24 | .. image:: https://staroid.com/api/run/button.svg
25 |    :target: https://staroid.com/g/open-datastudio/superset
26 | 
27 | **Screenshots**
28 | 
29 | |superset-screenshot|
30 | 
31 | 
32 | =============================== ===================================================================
33 | Launch page                     https://staroid.com/g/open-datastudio/superset
34 | Open data studio repository     https://github.com/open-datastudio/superset
35 | Original repository             https://github.com/apache/superset
36 | Documentation                   https://superset.apache.org/
37 | =============================== ===================================================================
38 | 
39 | .. include:: ../ref.rst
40 | 
41 | .. |superset-screenshot| image:: https://raw.githubusercontent.com/apache/incubator-superset/master/superset-frontend/images/screenshots/bank_dash.png
42 |    :width: 700px
43 |    :alt: Apache superset
44 | 
-------------------------------------------------------------------------------- /docs/computing/dask.rst: --------------------------------------------------------------------------------
1 | ==============
2 | Dask
3 | ==============
4 | 
5 | https://github.com/open-datastudio/dask-cluster
6 | 
7 | Dask support is planned.
8 | 
9 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/computing/flink.rst: --------------------------------------------------------------------------------
1 | ==============
2 | Apache Flink
3 | ==============
4 | 
5 | Apache Flink support is planned.
6 | 
7 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/computing/index.rst: --------------------------------------------------------------------------------
1 | ===============
2 | Computing
3 | ===============
4 | 
5 | Large-scale, parallel/distributed computing.
6 | 
7 | .. toctree::
8 |    :maxdepth: 1
9 | 
10 |    Spark serverless <spark/index>
11 |    Ray cluster <ray/index>
12 | 
13 |    Dask (planned) <dask>
14 |    Flink (planned) <flink>
15 | 
16 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/computing/ray/from_cluster_launcher.rst: --------------------------------------------------------------------------------
1 | =========================================
2 | Ray cluster from Ray Cluster Launcher CLI
3 | =========================================
4 | 
5 | The Ray master branch includes a `Ray cluster launcher for Staroid `_.
6 | This allows creating a ray cluster using the standard ``ray up <cluster-config.yaml>`` CLI command.
7 | 
8 | Install Ray and dependency libraries
9 | ------------------------------------
10 | 
11 | First, install ray (1.1.0 or newer) and its python dependency packages.
12 | 
13 | .. code-block:: bash
14 | 
15 |    $ pip install ray staroid kubernetes
16 | 
17 | Configure Staroid access token
18 | ------------------------------
19 | 
20 | Then, configure the staroid access token. `Get an access token `_ and set
21 | the ``STAROID_ACCESS_TOKEN`` environment variable.
22 | 
23 | .. code-block:: bash
24 | 
25 |    $ export STAROID_ACCESS_TOKEN=[your access token]
26 | 
27 | Cluster configuration file
28 | --------------------------
29 | 
30 | You can get example Ray cluster launcher configuration files for Staroid from the Ray source tree.
31 | 
32 | .. code-block:: bash
33 | 
34 |    $ git clone https://github.com/ray-project/ray.git
35 |    $ ls ray/python/ray/autoscaler/staroid/example-*.yaml
36 | 
37 | Open the example configurations and modify them as you need.
38 | 
39 | Start a Ray cluster
40 | -------------------
41 | 
42 | Now you can create a Ray cluster using the ``ray up`` command.
43 | 
44 | .. code-block:: bash
45 | 
46 |    $ ray up ray/python/ray/autoscaler/staroid/example-full.yaml
47 | 
48 | Once the cluster is up and running, you can attach your shell to the Ray head node.
49 | 
50 | .. code-block:: bash
51 | 
52 |    $ ray attach ray/python/ray/autoscaler/staroid/example-full.yaml
53 | 
54 | Ray instance management menu
55 | ----------------------------
56 | 
57 | Check the `Instance management menu `_.
58 | You'll see your Ray cluster instances.
59 | 
60 | .. image:: https://user-images.githubusercontent.com/1540981/101430734-71d83780-38ba-11eb-94d4-f7b20f0135ae.png
61 |    :width: 600
62 | 
63 | You'll find links to the Ray dashboard and the Jupyter notebook there.
64 | 
65 | 
66 | Shutdown Ray cluster
67 | --------------------
68 | 
69 | To shut down the cluster,
70 | 
71 | .. code-block:: bash
72 | 
73 |    $ ray down ray/python/ray/autoscaler/staroid/example-full.yaml
74 | 
-------------------------------------------------------------------------------- /docs/computing/ray/from_staroid_management_console.rst: --------------------------------------------------------------------------------
1 | ===============================================
2 | Ray cluster from Staroid management console GUI
3 | ===============================================
4 | 
5 | A Ray cluster can be managed from the `Instance management menu `_
6 | without using the Ray CLI (command line interface).
7 | 
8 | Start a Ray cluster from GUI
9 | ----------------------------
10 | 
11 | Click the ``Launch`` button in the `Instance management menu `_.
12 | 
13 | .. image:: https://user-images.githubusercontent.com/1540981/101434974-65ef7400-38c0-11eb-8647-22a4a11ca2e1.png
14 |    :width: 500
15 |    :alt: Ray cluster launch dialog
16 | 
17 | In the launch dialog, you can configure the name of your Ray cluster instance, the maximum number of workers, and so on.
18 | Once launched, you can see the status of your Ray cluster instance.
19 | 
20 | .. note::
21 | 
22 |    A Ray cluster takes a few seconds to a couple of minutes to fully initialize.
23 |    During initialization, it provisions nodes, downloads the Ray container image, and executes bootstrap commands.
24 | 
25 | Access Ray dashboard and Jupyter notebook
26 | -----------------------------------------
27 | 
28 | Once your Ray cluster instance is fully initialized,
29 | you'll see links to the Ray dashboard and the Jupyter notebook.
30 | 
31 | .. image:: https://user-images.githubusercontent.com/1540981/101435650-8f5ccf80-38c1-11eb-8619-ea448c33a50e.png
32 |    :width: 600
33 | 
34 | In the Jupyter notebook, the ray environment is pre-configured, so you can just run
35 | 
36 | .. code-block:: python
37 | 
38 |    import ray
39 |    ray.init()  # no 'address' parameter required :)
40 | 
41 | and use the Ray cluster environment.
42 | 
43 | 
44 | Stop Ray cluster instance
45 | --------------------------
46 | 
47 | In the `Instance management menu `_,
48 | you can find ``Stop`` (``Start``) and ``Terminate`` buttons.
49 | 
50 | Stop
51 |    Stops the Ray head and all workers. Can be (re)started later. Data stored in the persistent volume is not removed.
52 | 
53 | Terminate
54 |    Stops the Ray head and all workers permanently. Cannot be restarted. Data stored in the persistent volume is also removed.
-------------------------------------------------------------------------------- /docs/computing/ray/index.rst: --------------------------------------------------------------------------------
1 | ==============
2 | Ray cluster
3 | ==============
4 | 
5 | .. image:: https://staroid.com/api/run/button.svg
6 |    :target: https://staroid.com/g/open-datastudio/ray-cluster
7 | 
8 | Ray is a distributed execution framework that makes it easy to scale your applications and to leverage state-of-the-art machine learning libraries.
9 | 
10 | Key features
11 | 
12 | - Fully managed
13 | - Manage the ray cluster using the `Ray Cluster Launcher `_ (``ray up ...``)
14 | - Comes with a Jupyter notebook
15 | - Ray Dashboard link to the cluster
16 | - GPU workers
17 | 
18 | |
19 | 
20 | |ray-logo|
21 | 
22 | Getting started with a Ray cluster
23 | ----------------------------------
24 | 
25 | .. toctree::
26 |    :maxdepth: 1
27 | 
28 |    Ray cluster from Ray Cluster Launcher CLI <from_cluster_launcher>
29 |    Ray cluster from Staroid management console GUI <from_staroid_management_console>
30 | 
31 | |
32 | |
33 | 
34 | =============================== ===================================================================
35 | Open data studio Ray Cluster    https://github.com/open-datastudio/ray-cluster
36 | Original repository             https://github.com/ray-project/ray
37 | =============================== ===================================================================
38 | 
39 | 
40 | .. |ray-logo| image:: ../../_static/ray-logo.png
41 |    :width: 150px
42 |    :alt: Ray
43 | 
44 | .. include:: ../../ref.rst
-------------------------------------------------------------------------------- /docs/computing/spark/from_ods_zeppelin.rst: --------------------------------------------------------------------------------
1 | ---------------------------------------------------
2 | Spark cluster from Open data studio Zeppelin
3 | ---------------------------------------------------
4 | 
5 | .. raw:: html
6 | 
7 |    (embedded HTML stripped during extraction)
8 | 
9 | |
10 | 
11 | Open data studio :ref:`Apache Zeppelin` integrates Spark 3.x out of the box.
12 | No extra installation or initialization steps are required.
13 | 
14 | .. image:: https://user-images.githubusercontent.com/1540981/80290438-cf3bc180-86f9-11ea-8c1f-d2dedcd48a86.png
15 |    :width: 600
16 | 
17 | Launch and use the spark interpreter. A Spark cluster will be created automatically.
18 | 
19 | .. code-block:: bash
20 |    :caption: configure spark executors
21 | 
22 |    %spark.conf
23 |    spark.executor.instances 3
24 | 
25 | 
26 | .. code-block:: scala
27 |    :caption: run spark api
28 | 
29 |    %spark
30 |    // 'sc' and 'spark' are automatically created
31 |    spark.read.json(...)
32 | 
33 | 
34 | Check :ref:`Apache Zeppelin` for more details.
-------------------------------------------------------------------------------- /docs/computing/spark/from_python_environment.rst: --------------------------------------------------------------------------------
1 | ---------------------------------------------------
2 | Spark cluster from your python environment
3 | ---------------------------------------------------
4 | 
5 | .. raw:: html
6 | 
7 |    (embedded HTML stripped during extraction)
8 | 
9 | |
10 | 
11 | Try in Google Colab
12 |    .. image:: https://colab.research.google.com/assets/colab-badge.svg
13 |       :target: https://colab.research.google.com/github/open-datastudio/ods/blob/master/notebook/open-data-studio.ipynb
14 | 
15 | 
16 | |
17 | 
18 | Install
19 | --------------------------
20 | 
21 | Install the `ods `_ package using the pip command.
22 | 
23 | .. code-block:: bash
24 | 
25 |    $ pip install ods
26 | 
27 | Then get an `access token `_ and set the ``STAROID_ACCESS_TOKEN`` environment variable.
28 | 
29 | .. code-block:: bash
30 | 
31 |    $ export STAROID_ACCESS_TOKEN="<your access token>"
32 | 
33 | For alternative ways to configure the access token, check `staroid-python `_.
34 | 
35 | Create Kubernetes cluster
36 | --------------------------
37 | 
38 | `staroid.com `_ -> Products -> Kubernetes (SKE) -> New Kubernetes cluster.
39 | 
40 | .. image:: https://user-images.githubusercontent.com/1540981/87723637-ede8ac00-c76e-11ea-98d3-b6f8d972453d.png
41 |    :width: 400
42 | 
43 | Then configure the kubernetes cluster name after importing the python library.
44 | 
45 | .. code-block:: python
46 | 
47 |    import ods
48 |    # 'ske' is the name of the kubernetes cluster created from staroid.com.
49 |    # Alternatively, you can set the 'STAROID_SKE' environment variable.
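   # ("data-team1" below is an example cluster name - use your own.)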
50 |    ods.init(ske="data-team1")
51 | 
52 | 
53 | Create PySpark session
54 | -----------------------
55 | 
56 | Spark-serverless enables you to create interactive PySpark sessions with executors running remotely on the cloud.
57 | 
58 | .. code-block:: python
59 | 
60 |    import ods
61 |    # 'ske' is the name of the kubernetes cluster created from staroid.com.
62 |    # Alternatively, you can set the 'STAROID_SKE' environment variable.
63 |    ods.init(ske="data-team1")
64 | 
65 |    # get a spark session with 3 initial worker nodes and delta lake enabled
66 |    spark = ods.spark("my-cluster", worker_num=3, delta=True).session()
67 | 
68 |    # Do your work with the Spark session
69 |    df = spark.read.load(...)
70 | 
71 | Now you can use a Spark session with 3 remotely running executors.
72 | 
73 | .. note::
74 | 
75 |    There's no application packaging or job submission step required. Everything runs interactively.
76 | 
77 | 
78 | .. include:: ../../ref.rst
79 | 
-------------------------------------------------------------------------------- /docs/computing/spark/index.rst: --------------------------------------------------------------------------------
1 | ================
2 | Spark serverless
3 | ================
4 | 
5 | Apache Spark is a unified analytics engine for large-scale data processing.
6 | Open data studio provides instant access to a Spark cluster from anywhere, without thinking about infrastructure and maintenance.
7 | 
8 | Key features
9 | 
10 | - Spark 3.x
11 | - Delta lake support
12 | - Fully managed
13 | - Spark UI access
14 | 
15 | |
16 | 
17 | |spark-logo|
18 | 
19 | Spark serverless cluster
20 | -----------------------------
21 | 
22 | .. toctree::
23 |    :maxdepth: 1
24 | 
25 |    Getting Started (Python environment) <from_python_environment>
26 |    Access Spark UI <spark_ui>
27 |    Managing cluster instances <instances>
28 | 
29 | |
30 | 
31 | =============================== ===================================================================
32 | Open data studio spark service  https://github.com/open-datastudio/spark-serverless
33 | Original repository             https://github.com/apache/spark
34 | =============================== ===================================================================
35 | 
36 | .. |spark-logo| image:: ../../_static/spark-logo.png
37 |    :width: 150px
38 |    :alt: Apache spark
39 | 
40 | .. include:: ../../ref.rst
-------------------------------------------------------------------------------- /docs/computing/spark/instances.rst: --------------------------------------------------------------------------------
1 | ===========================
2 | Managing cluster instances
3 | ===========================
4 | 
5 | Spark serverless doesn't really need complex management or maintenance of the Spark cluster.
6 | Upgrading, scaling out, optimization, and other complex tasks are handled automatically.
7 | Enjoy a **zero maintenance** serverless experience.
8 | 
9 | All you need are simple tasks, such as starting or stopping cluster instances when you need to.
10 | 
11 | Cluster instance management operations can be done
12 | either programmatically, using the Python client library, or with mouse clicks from the `Instance management menu `_.
13 | 
14 | Create a new Spark cluster instance
15 | -----------------------------------
16 | 
17 | You can create multiple Spark serverless cluster instances in
18 | one or more Kubernetes clusters (SKE). See the :ref:`Create Kubernetes cluster` section to create an SKE.
19 | 
20 | You can create a cluster instance by creating a spark session from your Python environment.
21 | 
22 | Create spark session with the default configuration
23 |    .. code-block:: python
24 | 
25 |       import ods
26 |       ods.init(ske="my-ske")
27 |       spark = ods.spark("my-cluster").session()
28 | 
29 | Create spark session with 3 initial worker nodes
30 |    .. code-block:: python
31 | 
32 |       import ods
33 |       ods.init(ske="my-ske")
34 |       spark = ods.spark("my-cluster", worker_num=3).session()
35 | 
36 | Create spark session with delta lake support
37 |    .. code-block:: python
38 | 
39 |       import ods
40 |       ods.init(ske="my-ske")
41 |       spark = ods.spark("my-cluster", delta=True).session()
42 | 
43 | 
44 | .. note::
45 | 
46 |    Run ``pip install ods`` to install the ods library.
47 |    Python versions 3.6, 3.7, and 3.8 are supported.
48 | 
49 | Done! You have a Spark session that is connected to executors running remotely on the cloud.
50 | No application packaging or job submission to the cluster is required.
51 | 
52 | Your Spark session is capable of interactive computing.
53 | That means you can use the Spark session in a Python REPL or in a notebook.
54 | 
55 | 
56 | .. note::
57 | 
58 |    It may take a few seconds to a few minutes for executors to be fully ready. See the next section to monitor the status of executors.
59 | 
60 | 
61 | Spark cluster instance management menu
62 | --------------------------------------
63 | 
64 | Open the `Instance management menu `_
65 | and you'll find the Spark cluster instance automatically created by the Spark session.
66 | You can also access the Spark UI from here (see :ref:`Access Spark UI`).
67 | 
68 | .. note::
69 | 
70 |    A log console and a shell terminal are provided for more advanced usage as well.
71 | 
72 | 
73 | Stop Spark cluster instance
74 | -----------------------------
75 | 
76 | In the `Instance management menu `_,
77 | you can find ``Stop`` (``Start``) and ``Terminate`` buttons.
78 | 
79 | Stop
80 |    Stops all executors. Can be (re)started later. Data stored in the persistent volume is not removed.
81 | 
82 |    The Python API equivalent is
83 | 
84 |    .. code-block:: python
85 | 
86 |       # 'spark' is the spark session created from 'spark = ods.spark("my-cluster").session()'
87 |       spark.stop()
88 | 
89 | Terminate
90 |    Stops all executors permanently. Cannot be restarted. Data stored in the persistent volume is also removed.
91 | 
92 |    The Python API equivalent is
93 | 
94 |    .. code-block:: python
95 | 
96 |       ods.spark("my-cluster").delete()
97 | 
-------------------------------------------------------------------------------- /docs/computing/spark/spark_ui.rst: --------------------------------------------------------------------------------
1 | ===============
2 | Access Spark UI
3 | ===============
4 | 
5 | Access Spark UI locally
6 | -----------------------
7 | 
8 | Since Spark-serverless keeps the Spark driver running in your python environment (client side),
9 | you can simply browse to ``localhost:4040`` (or subsequent port numbers) to access the Spark UI when you are working on your laptop.
10 | 
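If you're not sure which port the UI ended up on, you can also ask the session itself. A minimal sketch (assuming ``spark`` is a session created as shown in the previous sections):

.. code-block:: python

   # uiWebUrl is a standard PySpark SparkContext property that holds
   # the address of the driver's web UI (None if the UI is disabled).
   print(spark.sparkContext.uiWebUrl)

11 | 
12 | Access Spark UI remotely
13 | ------------------------
14 | 
15 | If you're using an environment where access to local ports is limited
16 | (for example, a notebook environment on the cloud, such as Google Colab), or you'd like to share
17 | your Spark UI with your team, you can find a Spark UI link when you open the detail view of your instance
18 | from the `Instance management menu `_.
19 | 
20 | ..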
image:: https://user-images.githubusercontent.com/1540981/100956146-af108400-34cc-11eb-9ee5-1e8dd9937694.png 21 | :width: 600 22 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = u'Open Data Studio' 23 | copyright = u'Open Data Studio Authors' 24 | author = u'Open Data Studio Authors' 25 | 26 | # The short X.Y version 27 | version = u'' 28 | # The full version, including alpha/beta/rc tags 29 | release = u'' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.autosectionlabel', 43 | 'aafigure.sphinxext' 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # The suffix(es) of source filenames. 50 | # You can specify multiple suffix as a list of string: 51 | # 52 | # source_suffix = ['.rst', '.md'] 53 | source_suffix = '.rst' 54 | 55 | # The master toctree document. 56 | master_doc = 'index' 57 | 58 | # The language for content autogenerated by Sphinx. Refer to documentation 59 | # for a list of supported languages. 60 | # 61 | # This is also used if you do content translation via gettext catalogs. 62 | # Usually you set "language" from the command line for these cases. 63 | language = None 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | # This pattern also affects html_static_path and html_extra_path. 68 | exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] 69 | 70 | # The name of the Pygments (syntax highlighting) style to use. 71 | pygments_style = None 72 | 73 | 74 | # -- Options for HTML output ------------------------------------------------- 75 | 76 | # The theme to use for HTML and HTML Help pages. See the documentation for 77 | # a list of builtin themes. 78 | # 79 | html_theme = 'sphinx_rtd_theme' 80 | 81 | # Theme options are theme-specific and customize the look and feel of a theme 82 | # further. For a list of options available for each theme, see the 83 | # documentation. 84 | # 85 | # html_theme_options = {} 86 | 87 | # Add any paths that contain custom static files (such as style sheets) here, 88 | # relative to this directory. 
They are copied after the builtin static files, 89 | # so a file named "default.css" will overwrite the builtin "default.css". 90 | html_static_path = ['_static'] 91 | 92 | # Custom sidebar templates, must be a dictionary that maps document names 93 | # to template names. 94 | # 95 | # The default sidebars (for documents that don't match any pattern) are 96 | # defined by theme itself. Builtin themes are using these templates by 97 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 98 | # 'searchbox.html']``. 99 | # 100 | # html_sidebars = {} 101 | html_logo = '_static/open-datastudio-logo.svg' 102 | 103 | # Enable link of 'View page source' 104 | #html_show_sourcelink = False 105 | # Add 'Edit on Github' link instead of 'View page source' 106 | # reference:https://docs.readthedocs.io/en/latest/vcs.html 107 | html_context = { 108 | # Enable the "Edit in GitHub link within the header of each page. 109 | 'display_github': True, 110 | # Set the following variables to generate the resulting github URL for each page. 111 | # Format Template: https://{{ github_host|default("github.com") }}/{{ github_user }} 112 | #/{{ github_repo }}/blob/{{ github_version }}{{ conf_py_path }}{{ pagename }}{{ suffix }} 113 | #https://github.com/runawayhorse001/SphinxGithub/blob/master/doc/index.rst 114 | 'github_user': 'open-datastudio', 115 | 'github_repo': 'datastudio', 116 | 'github_version': 'master/docs/', 117 | } 118 | 119 | # -- Options for HTMLHelp output --------------------------------------------- 120 | 121 | # Output file base name for HTML help builder. 122 | htmlhelp_basename = 'OpenDataStudioDoc' 123 | 124 | 125 | # -- Options for LaTeX output ------------------------------------------------ 126 | 127 | latex_elements = { 128 | # The paper size ('letterpaper' or 'a4paper'). 129 | # 130 | # 'papersize': 'letterpaper', 131 | 132 | # The font size ('10pt', '11pt' or '12pt'). 133 | # 134 | # 'pointsize': '10pt', 135 | 136 | # Additional stuff for the LaTeX preamble. 137 | # 138 | # 'preamble': '', 139 | 140 | # Latex figure (float) alignment 141 | # 142 | # 'figure_align': 'htbp', 143 | } 144 | 145 | # Grouping the document tree into LaTeX files. List of tuples 146 | # (source start file, target name, title, 147 | # author, documentclass [howto, manual, or own class]). 148 | latex_documents = [ 149 | (master_doc, 'OPENDATASTUDIO.tex', u'Open Data Studio Documentation', 150 | u'Open Data Studio', 'manual'), 151 | ] 152 | 153 | 154 | # -- Options for manual page output ------------------------------------------ 155 | 156 | # One entry per manual page. List of tuples 157 | # (source start file, name, description, authors, manual section). 158 | man_pages = [ 159 | (master_doc, 'open data studio', u'Open Data Studio Documentation', 160 | [author], 1) 161 | ] 162 | 163 | 164 | # -- Options for Texinfo output ---------------------------------------------- 165 | 166 | # Grouping the document tree into Texinfo files. List of tuples 167 | # (source start file, target name, title, author, 168 | # dir menu entry, description, category) 169 | texinfo_documents = [ 170 | (master_doc, 'Open Data Studio', u'Open Data Studio Documentation', 171 | author, 'Open Data Studio', 'Cloud data tools', 172 | 'Miscellaneous'), 173 | ] 174 | 175 | 176 | # -- Options for Epub output ------------------------------------------------- 177 | 178 | # Bibliographic Dublin Core info. 179 | epub_title = project 180 | 181 | # The unique identifier of the text. This can be a ISBN number 182 | # or the project homepage. 
183 | #
184 | # epub_identifier = ''
185 | 
186 | # A unique identification for the text.
187 | #
188 | # epub_uid = ''
189 | 
190 | # A list of files that should not be packed into the epub file.
191 | epub_exclude_files = ['search.html']
192 | 
-------------------------------------------------------------------------------- /docs/data-lake/delta.rst: --------------------------------------------------------------------------------
1 | ==============
2 | Delta Lake
3 | ==============
4 | 
5 | Delta Lake is an open-source storage layer that brings ACID
6 | transactions to Apache Spark™ and big data workloads.
7 | 
8 | Open data studio provides Delta Lake in the following Spark environments:
9 | 
10 | ================================================== ==========================================
11 | Service                                            Note
12 | ================================================== ==========================================
13 | :ref:`Apache Zeppelin`                             Through the ``%spark`` interpreter
14 | :ref:`Spark cluster from your python environment`  ``ods.spark("cluster-name", delta=True)``
15 | ================================================== ==========================================
16 | 
17 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/data-lake/hive-metastore.rst: --------------------------------------------------------------------------------
1 | ==============
2 | Hive metastore
3 | ==============
4 | 
5 | Apache Hive Metastore is the central repository for Apache Hive and Apache Spark metadata, and more.
6 | Open data studio makes it easy to deploy on the cloud.
7 | 
8 | Key features
9 | 
10 | - Based on Hive 3.1.2
11 | - Click to deploy. No setup required
12 | - Configured with a PostgreSQL DB
13 | - Connect from :ref:`Spark thriftserver` and :ref:`Apache Zeppelin`. No configuration required
14 | 
15 | =============================== ===================================================================
16 | Launch page                     https://staroid.com/g/open-datastudio/hive-metastore
17 | Open data studio repository     https://github.com/open-datastudio/hive-metastore
18 | Original repository             https://github.com/apache/hive
19 | Documentation                   http://hive.apache.org/
20 | =============================== ===================================================================
21 | 
22 | 
23 | Hive metastore Quickstart
24 | -------------------------
25 | 
26 | .. image:: https://staroid.com/api/run/button.svg
27 |    :target: https://staroid.com/g/open-datastudio/hive-metastore
28 | 
29 | 
30 | Get hive-metastore address
31 | ---------------------------------
32 | 
33 | To learn how to get the address of the hive metastore server after deployment,
34 | see the `README `_.
35 | 
36 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/data-lake/index.rst: --------------------------------------------------------------------------------
1 | ===============
2 | Data Lake
3 | ===============
4 | 
5 | A data lake is a centralized repository that allows you to store all your structured and unstructured data at any scale.
6 | 
7 | .. toctree::
8 |    :maxdepth: 1
9 | 
10 |    Delta lake <delta>
11 |    Hive Metastore <hive-metastore>
12 |    Spark Thriftserver <spark-thriftserver>
13 | 
14 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/data-lake/spark-thriftserver.rst: --------------------------------------------------------------------------------
1 | ==================
2 | Spark thriftserver
3 | ==================
4 | 
5 | Spark thrift server allows multiple remote clients to access Spark.
6 | It provides a generic JDBC endpoint that lets any client, including BI tools, connect and access the power of Spark.
7 | Open data studio makes it easy to deploy on the cloud.
8 | 
9 | Key features
10 | 
11 | - Allows JDBC/ODBC clients to execute SQL queries over the JDBC and ODBC protocols on Apache Spark
12 | - Spark 3.0
13 | - Spark cluster is automatically configured on Kubernetes
14 | - Connect to :ref:`Hive metastore`. No configuration required
15 | 
16 | =============================== ===================================================================
17 | Launch page                     https://staroid.com/g/open-datastudio/spark-thriftserver
18 | Open data studio repository     https://github.com/open-datastudio/spark-thriftserver
19 | Original repository             https://github.com/apache/spark
20 | Documentation                   https://spark.apache.org/docs/latest/sql-distributed-sql-engine.html
21 | =============================== ===================================================================
22 | 
23 | Spark thriftserver Quickstart
24 | ------------------------------
25 | 
26 | .. image:: https://staroid.com/api/run/button.svg
27 |    :target: https://staroid.com/g/open-datastudio/spark-thriftserver
28 | 
29 | 
30 | Get spark-thriftserver address
31 | -------------------------------------
32 | 
33 | A `spark-thriftserver-info `_ ConfigMap is created
34 | after deployment. The ConfigMap includes the spark-thriftserver JDBC URL to connect to.
35 | Other projects can import this ConfigMap. `Learn more `__.
36 | 
37 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/index.rst: --------------------------------------------------------------------------------
1 | |ods-logo| Open Data Studio
2 | ==================================
3 | 
4 | Open data studio is a fully managed computing service on the Staroid_ cloud,
5 | built with an open source development model.
6 | 
7 | That means you can enjoy all the benefits of software as a service
8 | without giving up the ability to understand the code, contribute, and improve it like any other open source software.
9 | 
10 | 
11 | |
12 | 
13 | 
14 | 
15 | Use cases
16 | ------------
17 | 
18 | |spark-usecase|
19 | 
20 | * From a Python shell/IDE/notebook on your laptop, interactively process massive data in your data lake with :ref:`Spark serverless`.
21 | * Connect your BI tools via JDBC using :ref:`Spark thriftserver`. An on-demand Spark cluster is automatically configured for you.
22 | * Visualize your data in an interactive notebook using :ref:`Apache Zeppelin`. An on-demand Spark cluster is automatically configured for you.
23 | 
24 | |
25 | 
26 | |ray-usecase|
27 | 
28 | * Use the ``ray up`` command to launch a fully managed :ref:`Ray cluster` on the cloud.
29 | * Deploy your model using Ray serve with an authenticated REST API endpoint.
30 | * Launch a GPU-accelerated :ref:`Jupyter` instance on the cloud.
31 | 
32 | 
33 | .. |spark-usecase| image:: ./_static/spark-usecase.png
34 |    :width: 650px
35 |    :alt: Spark use case
36 | 
37 | .. |ray-usecase| image:: ./_static/ray-usecase.png
38 |    :width: 500px
39 |    :alt: Ray use case
40 | 
41 | Technology
42 | ------------
43 | 
44 | Use all the latest machine learning technology in a single place.
45 | Open data studio continues to integrate the best technologies for machine learning.
46 | 
47 | |spark-logo| |ray-logo| |delta-logo| |cuda-logo| |jupyter-logo| |zeppelin-logo|
48 | 
49 | .. |spark-logo| image:: ./_static/spark-logo.png
50 |    :width: 80px
51 |    :alt: Apache spark
52 | 
53 | .. |ray-logo| image:: ./_static/ray-logo.png
54 |    :width: 100px
55 |    :alt: Ray
56 | 
57 | .. |delta-logo| image:: ./_static/delta-logo.png
58 |    :width: 70px
59 |    :alt: Delta lake
60 | 
61 | .. |cuda-logo| image:: ./_static/cuda-logo.png
62 |    :width: 70px
63 |    :alt: Nvidia CUDA
64 | 
65 | .. |jupyter-logo| image:: ./_static/jupyter-logo.png
66 |    :width: 60px
67 |    :alt: Jupyter notebook
68 | 
69 | .. |zeppelin-logo| image:: ./_static/zeppelin-logo.svg
70 |    :width: 80px
71 |    :alt: Zeppelin notebook
72 | 
73 | |
74 | 
75 | Ease of use
76 | -----------
77 | 
78 | Access to the latest machine learning technology shouldn't be more than a few clicks or a few lines of code away.
79 | 
80 | .. code-block:: python
81 |    :caption: Learn more about :ref:`Spark cluster from your python environment`
82 | 
83 |    # import the open data studio library
84 |    import ods
85 | 
86 |    # create a spark cluster on the cloud with 3 initial workers
87 |    spark = ods.spark("my-spark", worker_num=3).session()
88 | 
89 |    # run a spark task
90 |    df = spark.read.load("...")
91 | 
92 | 
93 | .. code-block:: bash
94 |    :caption: Learn more about :ref:`Ray cluster from Ray Cluster Launcher CLI`
95 | 
96 |    $ # install the ray and staroid packages
97 |    $ pip install ray staroid kubernetes
98 | 
99 |    $ # switch to the nightly build
100 |    $ ray install-nightly
101 | 
102 |    $ # get the autoscaler yaml files
103 |    $ git clone https://github.com/ray-project/ray.git
104 | 
105 |    $ # spin up a cluster on the cloud and attach
106 |    $ ray up ray/python/ray/autoscaler/staroid/example-full.yaml
107 |    $ ray attach ray/python/ray/autoscaler/staroid/example-full.yaml
108 | 
109 | 
110 | |
111 | 
112 | 
113 | Fully managed
114 | -------------
115 | 
116 | Save time and reduce risk.
117 | Open data studio is maintained by committers of the open source projects and industry experts,
118 | on top of Staroid_, a secure, reliable, and high-performance cloud platform.
119 | 
120 | |
121 | 
122 | Open source
123 | -----------
124 | 
125 | Open data studio is an open source project.
126 | You can easily see the source code, understand how it works, and get involved.
127 | When you need to, fork it and get your own version of the managed service!
128 | 
129 | Also, every time you launch projects, the developers of those projects get funded via StarRank_.
130 | 
131 | Community
132 | ---------
133 | 
134 | * Open data studio github - https://github.com/open-datastudio
135 | * Open data studio slack channel - `Join `_
136 | * Issue tracker - You can find an 'issue' menu on each project. If you're not sure, create an issue `here `_
137 | 
138 | 
139 | |
140 | 
141 | .. toctree::
142 |    :maxdepth: 2
143 | 
144 |    about/index
145 |    notebook/index
146 |    data-lake/index
147 |    computing/index
148 |    machine-learning/index
149 |    business-intelligence/index
150 | 
151 | .. include:: ./ref.rst
152 | 
153 | ..
|ods-logo| image:: ./_static/open-datastudio-logo.png 154 | :width: 60px 155 | :alt: Open Datastudio 156 | -------------------------------------------------------------------------------- /docs/machine-learning/index.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Machine Learning 3 | ================ 4 | 5 | Manage, serve your models. 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | MLflow server 11 | MLflow model serving 12 | 13 | .. include:: ../ref.rst 14 | -------------------------------------------------------------------------------- /docs/machine-learning/mlflow-model-serving.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | MLflow model serving 3 | ==================== 4 | 5 | Deploy models from :ref:`MLflow server`. 6 | 7 | Key features 8 | 9 | - Click to deploy. No setup required 10 | - Deploy models from :ref:`MLflow server` 11 | 12 | =============================== =================================================================== 13 | Launch page https://staroid.com/g/open-datastudio/mlflow-model-serving 14 | Open data studio repository https://github.com/open-datastudio/mlflow-model-serving 15 | Original repository https://github.com/mlflow/mlflow 16 | Documentation https://mlflow.org/docs/latest/index.html 17 | =============================== =================================================================== 18 | 19 | MLflow model serving Quickstart 20 | ------------------------------- 21 | 22 | .. image:: https://staroid.com/api/run/button.svg 23 | :target: https://staroid.com/g/open-datastudio/mlflow-model-serving 24 | 25 | **Screenshots** 26 | 27 | |mlflow-model-serving-screenshot| 28 | 29 | .. include:: ../ref.rst 30 | 31 | .. |mlflow-model-serving-screenshot| image:: https://user-images.githubusercontent.com/1540981/89857151-256d2d00-db50-11ea-9512-4e69e7f0cf89.png 32 | :width: 700px 33 | :alt: MLflow model serving 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/machine-learning/mlflow-server.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | MLflow server 3 | ============== 4 | 5 | `MLflow `_ is an open source platform for managing the end-to-end machine learning lifecycle. 6 | Open data studio makes it easy to deploy on the cloud. 7 | 8 | Key features 9 | 10 | - Click to deploy. No setup required 11 | - PostgreSQL backend store 12 | - Remote tracking server 13 | - Model registry 14 | - Tracking UI 15 | - Connect from :ref:`Jupyter`. No configuration required 16 | 17 | =============================== =================================================================== 18 | Launch page https://staroid.com/g/open-datastudio/mlflow-server 19 | Open data studio repository https://github.com/open-datastudio/mlflow-server 20 | Original repository https://github.com/mlflow/mlflow 21 | Documentation https://mlflow.org/docs/latest/index.html 22 | =============================== =================================================================== 23 | 24 | MLflow Server Quickstart 25 | ------------------------ 26 | 27 | .. image:: https://staroid.com/api/run/button.svg 28 | :target: https://staroid.com/g/open-datastudio/mlflow-server 29 | 30 | 31 | **Screenshots** 32 | 33 | |mlflow-screenshot| 34 | 35 | |mlflow-screenshot2| 36 | 37 | .. include:: ../ref.rst 38 | 39 | .. 
|mlflow-screenshot| image:: https://github.com/mlflow/mlflow/blob/f39a90d5fdbe588f5f4414d9d88af4f97b8f3de3/docs/source/_static/images/quickstart-ui-screenshot.png?raw=true
40 |    :width: 700px
41 |    :alt: MLflow
42 | 
43 | .. |mlflow-screenshot2| image:: https://github.com/mlflow/mlflow/blob/f39a90d5fdbe588f5f4414d9d88af4f97b8f3de3/docs/source/_static/images/tutorial-compare.png?raw=true
44 |    :width: 700px
45 |    :alt: MLflow
46 | 
-------------------------------------------------------------------------------- /docs/notebook/index.rst: --------------------------------------------------------------------------------
1 | ==============
2 | Notebook
3 | ==============
4 | 
5 | Notebooks are a form of interactive computing, in which users write and execute code, visualize the results, and share insights.
6 | 
7 | .. toctree::
8 |    :maxdepth: 1
9 | 
10 |    Jupyter <jupyter>
11 |    Zeppelin <zeppelin>
12 | 
13 | .. include:: ../ref.rst
-------------------------------------------------------------------------------- /docs/notebook/jupyter.rst: --------------------------------------------------------------------------------
1 | ==============
2 | Jupyter
3 | ==============
4 | 
5 | .. raw:: html
6 | 
7 |    (embedded HTML stripped during extraction)
8 | 
9 | |
10 | 
11 | The Jupyter Notebook is an open-source web application that allows you to create and share documents that contain live code, equations, visualizations, and narrative text.
12 | Open data studio makes it easy to deploy on the cloud.
13 | 
14 | Key features
15 | 
16 | - Click to deploy. No setup required
17 | - Jupyter Lab
18 | - NVIDIA GPU and CUDA support for machine learning
19 | - Works with the MLflow remote tracking server and artifact store out of the box
20 | - Persistent storage for ``~/work``
21 | - Connect to :ref:`MLflow server`. No configuration required
22 | 
23 | 
24 | Jupyter Quickstart
25 | -------------------
26 | 
27 | .. image:: https://staroid.com/api/run/button.svg
28 |    :target: https://staroid.com/g/open-datastudio/jupyter
29 | 
30 | **Screenshots**
31 | 
32 | |jupyter-screenshot|
33 | 
34 | =============================== ===================================================================
35 | Launch page                     https://staroid.com/g/open-datastudio/jupyter
36 | Open data studio repository     https://github.com/open-datastudio/jupyter
37 | Original repository             https://github.com/jupyter/jupyter
38 | Documentation                   https://jupyter.readthedocs.io/en/latest/
39 | =============================== ===================================================================
40 | 
41 | .. include:: ../ref.rst
42 | 
43 | .. |jupyter-screenshot| image:: https://jupyter.org/assets/jupyterpreview.png
44 |    :width: 700px
45 |    :alt: Jupyter
-------------------------------------------------------------------------------- /docs/notebook/zeppelin.rst: --------------------------------------------------------------------------------
1 | ===============
2 | Apache Zeppelin
3 | ===============
4 | 
5 | .. raw:: html
6 | 
7 |    (embedded HTML stripped during extraction)
8 | 
9 | |
10 | 
11 | Apache Zeppelin is a web-based notebook that enables data-driven, interactive data analytics and collaborative documents with SQL, Scala, and more.
12 | Open data studio makes it easy to deploy on the cloud.
13 | 
14 | Key features
15 | 
16 | - Click to deploy. No setup required
17 | - Spark on Kubernetes is pre-configured
18 | - Spark 3.x and spark-ui
19 | - Comes with Spark, Python, and JDBC interpreters
20 | - Connect to :ref:`Hive metastore`. No configuration required
21 | 
22 | |
23 | 
24 | |zeppelin-logo|
25 | 
26 | Zeppelin Quickstart
27 | -------------------
28 | 
29 | Visit https://staroid.com/g/open-datastudio/zeppelin and click the ``Launch`` button.
30 | 
31 | .. image:: https://staroid.com/api/run/button.svg
32 |    :target: https://staroid.com/g/open-datastudio/zeppelin
33 | 
34 | **Screenshots**
35 | 
36 | Data visualization
37 |    |zeppelin-screenshot|
38 | 
39 | Spark cluster is automatically configured
40 | 
41 | .. image:: https://user-images.githubusercontent.com/1540981/80290438-cf3bc180-86f9-11ea-8c1f-d2dedcd48a86.png
42 |    :width: 700
43 | 
44 | Spark UI access
45 | 
46 | .. image:: https://user-images.githubusercontent.com/1540981/80290443-d8c52980-86f9-11ea-999c-eeafab25cf38.png
47 |    :width: 700
48 | 
49 | File manager
50 | 
51 | .. image:: https://user-images.githubusercontent.com/1540981/82079532-d79f7080-9697-11ea-99c5-5787f070dce9.gif
52 |    :width: 700
53 | 
54 | =============================== ===================================================================
55 | Launch page                     https://staroid.com/g/open-datastudio/zeppelin
56 | Open data studio repository     https://github.com/open-datastudio/zeppelin
57 | Original repository             https://github.com/apache/zeppelin
58 | Documentation                   http://zeppelin.apache.org/docs/latest/
59 | =============================== ===================================================================
60 | 
61 | .. include:: ../ref.rst
62 | 
63 | .. |zeppelin-logo| image:: ../_static/zeppelin-logo.svg
64 |    :width: 150px
65 |    :alt: Apache zeppelin
66 | 
67 | .. |zeppelin-screenshot| image:: http://zeppelin.apache.org/docs/0.8.2/assets/themes/zeppelin/img/notebook.png
68 |    :width: 700px
69 |    :alt: Apache zeppelin
-------------------------------------------------------------------------------- /docs/ref.rst: --------------------------------------------------------------------------------
1 | .. _Staroid: https://staroid.com
2 | .. _staroid.com: https://staroid.com
3 | .. _StarRank: https://staroid.com/site/starrank
4 | .. _skaffold.yaml: https://skaffold.dev/docs/references/yaml/
5 | .. _Skaffold: https://skaffold.dev
6 | .. _Apache Superset: https://superset.apache.org
7 | .. _ods quick start: https://github.com/open-datastudio/ods#quick-start
-------------------------------------------------------------------------------- /docs/requirements.txt: --------------------------------------------------------------------------------
1 | sphinx_rtd_theme
2 | aafigure
3 | 
-------------------------------------------------------------------------------- /docs/support/index.rst: --------------------------------------------------------------------------------
1 | =========
2 | Support
3 | =========
4 | 
5 | Open data studio is commercially supported by Staroid_.
6 | 
7 | For enterprise support, please `contact us `_.
8 | 
9 | .. include:: ../ref.rst
--------------------------------------------------------------------------------