├── .gitignore ├── LLMs ├── QA_app │ ├── Readme.md │ ├── build-kb-chatapp-shakudo-buildOnShakudo.jpg │ ├── constants.py │ ├── pdf_qa.ipynb │ ├── pdf_qa.py │ ├── pipeline_qa.yaml │ ├── requirements.txt │ ├── run_qa.sh │ └── streamlit_app_blog.py ├── confluence_app │ ├── app.py │ ├── confluenceQA.ipynb │ ├── confluence_qa.py │ ├── constants.py │ ├── pipeline.yaml │ ├── readme.md │ ├── requirements.txt │ └── run.sh ├── milvus │ ├── milvus-demo.ipynb │ └── requirements.txt └── milvus_chatbot │ ├── milvus-chatbot-demo.ipynb │ ├── requirements.txt │ └── service │ ├── app.py │ ├── requirements.txt │ ├── run.sh │ └── svc.yaml ├── README.md └── example_notebooks ├── blogs └── blog_pandas_2.0.ipynb ├── data_prep ├── dask_group_sort.ipynb ├── dask_parallelize_a_loop.ipynb ├── pyspark3.2.0.ipynb ├── rapids_data_processing.ipynb ├── spark_on_ray_data_processing.ipynb └── vaex_groupby_merge.ipynb ├── frontend ├── sentence_classification │ ├── run.sh │ ├── slnlp.py │ └── streamlit.yml ├── streamlit_app_example │ └── image_recognition │ │ ├── README.MD │ │ ├── cat.png │ │ ├── example_frontend.png │ │ ├── run.sh │ │ ├── slexample.py │ │ └── streamlit_pipeline.yaml └── voila │ ├── multiple_notebooks │ ├── basics.ipynb │ ├── bqplot.ipynb │ ├── dashboard.ipynb │ ├── gridspecLayout.ipynb │ ├── interactive.ipynb │ ├── ipympl.ipynb │ ├── ipyvolume.ipynb │ ├── multiple_widgets.ipynb │ ├── pipeline.yaml │ ├── reveal.ipynb │ └── run.sh │ ├── pipeline.yaml │ ├── run.sh │ └── voila_demo.ipynb ├── inference └── batch_inference │ ├── batch_inference.ipynb │ ├── requirements.txt │ ├── scheduler_setup.py │ └── train.ipynb ├── pipelines ├── a_simple_pipeline │ ├── dask_group_sort.ipynb │ └── pipeline.yaml ├── advanced_dag_pipeline │ ├── para_tasks.yaml │ ├── para_test.yaml │ ├── para_test_close.yaml │ ├── seq_test.yaml │ ├── start_para_end_tasks.yaml │ ├── step1.ipynb │ ├── step2.ipynb │ ├── step3.ipynb │ ├── step4.ipynb │ ├── step5.ipynb │ └── step6.ipynb ├── crypto │ ├── pipeline.yaml │ └── watch_ob_multi.ipynb ├── pipeline_with_requirements │ ├── READ.md │ ├── pipeline.yaml │ ├── requirements.txt │ ├── step1.ipynb │ ├── step2.ipynb │ └── step3.ipynb ├── python_hello_world_pipeline │ ├── hello_world.py │ └── pipeline.yaml └── rapids_pipeline │ ├── pipeline.yaml │ └── rapids_data_processing.ipynb ├── serving ├── IoT-detection │ ├── __init__.py │ ├── inference.ipynb │ ├── inference_dask.ipynb │ ├── main.py │ ├── models │ │ ├── __version__ │ │ ├── learner.pkl │ │ ├── predictor.pkl │ │ └── utils │ │ │ ├── attr │ │ │ ├── CatBoost │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── ExtraTreesEntr │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── ExtraTreesGini │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── KNeighborsDist │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── KNeighborsUnif │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── LightGBM │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── LightGBMLarge │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── LightGBMXT │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── NeuralNetFastAI │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── NeuralNetTorch │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── RandomForestEntr │ │ │ │ └── y_pred_proba_val.pkl │ │ │ ├── RandomForestGini │ │ │ │ └── y_pred_proba_val.pkl │ │ │ └── XGBoost │ │ │ │ └── y_pred_proba_val.pkl │ │ │ └── data │ │ │ ├── X.pkl │ │ │ ├── X_val.pkl │ │ │ ├── y.pkl │ │ │ └── y_val.pkl │ ├── pipeline.yaml │ ├── pipeline_service.yaml │ ├── requirements.txt │ ├── run.sh │ ├── train.ipynb │ └── utils.py ├── mlflow_servers │ ├── pipeline.yaml │ └── run.sh └── triton │ ├── config_examples │ └── xgb_config.pbtxt │ ├── convert_keras.ipynb │ ├── convert_pytorch.ipynb │ ├── convert_xgb.ipynb │ ├── covert_tf_to_onnx.ipynb │ ├── hyperplane-triton-api │ ├── app.py │ ├── clients │ │ └── image_client.py │ ├── hyperplane_triton_api.yaml │ ├── requirements.txt │ ├── start_server.sh │ └── test_image.jpeg │ └── sentence_classification_app │ ├── app.py │ ├── clients │ ├── deepset.py │ └── sentence_classification.py │ ├── hyperplane_triton_api.yaml │ ├── requirements.txt │ └── start_server.sh ├── training ├── ray_lightgbm_250GB.ipynb ├── ray_mlflow.ipynb ├── ray_tf_nlp.ipynb ├── ray_training_torch.ipynb ├── ray_tune_demo.ipynb └── tf_mlflow.ipynb └── utils ├── dask_snowflake.ipynb ├── ray_starter.ipynb ├── s3plugin.ipynb └── submit_graphql_with_python.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | LLMs/confluence_app/=0.0.19 131 | LLMs/confluence_app/.chroma/* 132 | LLMs/confluence_app/db/* 133 | LLMs/confluence_app/chroma_db/* 134 | 135 | ## Venvs in the folder 136 | *.venv* 137 | -------------------------------------------------------------------------------- /LLMs/QA_app/Readme.md: -------------------------------------------------------------------------------- 1 | # Building a PDF Knowledge Bot with Open-source LLMs on Shakudo 2 | 3 | ## Solution Overview: 4 | For any textual knowledge base (in our case, PDFs), we first need to extract text snippets from the knowledge base and use an embedding model to create a vector store representing the semantic content of the snippets. When a question is asked, we estimate its embedding and find relevant snippets using an efficient similarity search from vector stores. After extracting the snippets, we engineer a prompt and generate an answer using the LLM generation model. The prompt can be tuned based on the specific LLM used. 5 | 6 | ![kb-chatapp](./build-kb-chatapp-shakudo-buildOnShakudo.jpg) 7 | Experimentation and development are crucial elements in the field of data science. Shakudo's session facilitates the selection of the appropriate computing resources and provides the flexibility to choose Jupyter Notebooks, VS Code Server (provided by the platform) or connecting via SSH to use a preferred local editor. 8 | 9 | 10 | * Jupyter notebook for tutorial : [pdf_qa](./pdf_qa.ipynb) 11 | * Associated blog tutorial: [Building a PDF Knowledge Bot With Open-Source LLMs- A Step-by-Step Guide 12 | ](https://www.shakudo.io/blog) 13 | 14 | 15 | ### Code References: 16 | * The code is adapted based on the work in [LLM-WikipediaQA](https://github.com/georgesung/LLM-WikipediaQA/tree/main), where the author compares FastChat-T5, Flan-T5 with ChatGPT running a Q&A on Wikipedia Articles 17 | 18 | -------------------------------------------------------------------------------- /LLMs/QA_app/build-kb-chatapp-shakudo-buildOnShakudo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/LLMs/QA_app/build-kb-chatapp-shakudo-buildOnShakudo.jpg -------------------------------------------------------------------------------- /LLMs/QA_app/constants.py: -------------------------------------------------------------------------------- 1 | # Constants 2 | EMB_OPENAI_ADA = "text-embedding-ada-002" 3 | EMB_INSTRUCTOR_XL = "hkunlp/instructor-xl" 4 | EMB_SBERT_MPNET_BASE = "sentence-transformers/all-mpnet-base-v2" # Chroma takes care if embeddings are None 5 | EMB_SBERT_MINILM = "sentence-transformers/all-MiniLM-L6-v2" # Chroma takes care if embeddings are None 6 | 7 | 8 | LLM_OPENAI_GPT35 = "gpt-3.5-turbo" 9 | LLM_FLAN_T5_XXL = "google/flan-t5-xxl" 10 | LLM_FLAN_T5_XL = "google/flan-t5-xl" 11 | LLM_FASTCHAT_T5_XL = "lmsys/fastchat-t5-3b-v1.0" 12 | LLM_FLAN_T5_SMALL = "google/flan-t5-small" 13 | LLM_FLAN_T5_BASE = "google/flan-t5-base" 14 | LLM_FLAN_T5_LARGE = "google/flan-t5-large" 15 | LLM_FALCON_SMALL = "tiiuae/falcon-7b-instruct" 16 | -------------------------------------------------------------------------------- /LLMs/QA_app/pdf_qa.py: -------------------------------------------------------------------------------- 1 | from langchain.document_loaders import PDFPlumberLoader 2 | from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter 3 | from transformers import pipeline 4 | from langchain.prompts import PromptTemplate 5 | from langchain.chat_models import ChatOpenAI 6 | from langchain.vectorstores import Chroma 7 | from langchain.chains import RetrievalQA 8 | from langchain import HuggingFacePipeline 9 | from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings 10 | from langchain.embeddings.openai import OpenAIEmbeddings 11 | from langchain.llms import OpenAI 12 | from constants import * 13 | from transformers import AutoTokenizer 14 | import torch 15 | import os 16 | import re 17 | 18 | class PdfQA: 19 | def __init__(self,config:dict = {}): 20 | self.config = config 21 | self.embedding = None 22 | self.vectordb = None 23 | self.llm = None 24 | self.qa = None 25 | self.retriever = None 26 | 27 | # The following class methods are useful to create global GPU model instances 28 | # This way we don't need to reload models in an interactive app, 29 | # and the same model instance can be used across multiple user sessions 30 | @classmethod 31 | def create_instructor_xl(cls): 32 | device = "cuda" if torch.cuda.is_available() else "cpu" 33 | return HuggingFaceInstructEmbeddings(model_name=EMB_INSTRUCTOR_XL, model_kwargs={"device": device}) 34 | 35 | @classmethod 36 | def create_sbert_mpnet(cls): 37 | device = "cuda" if torch.cuda.is_available() else "cpu" 38 | return HuggingFaceEmbeddings(model_name=EMB_SBERT_MPNET_BASE, model_kwargs={"device": device}) 39 | 40 | @classmethod 41 | def create_flan_t5_xxl(cls, load_in_8bit=False): 42 | # Local flan-t5-xxl with 8-bit quantization for inference 43 | # Wrap it in HF pipeline for use with LangChain 44 | return pipeline( 45 | task="text2text-generation", 46 | model="google/flan-t5-xxl", 47 | max_new_tokens=200, 48 | model_kwargs={"device_map": "auto", "load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.} 49 | ) 50 | @classmethod 51 | def create_flan_t5_xl(cls, load_in_8bit=False): 52 | return pipeline( 53 | task="text2text-generation", 54 | model="google/flan-t5-xl", 55 | max_new_tokens=200, 56 | model_kwargs={"device_map": "auto", "load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.} 57 | ) 58 | 59 | @classmethod 60 | def create_flan_t5_small(cls, load_in_8bit=False): 61 | # Local flan-t5-small for inference 62 | # Wrap it in HF pipeline for use with LangChain 63 | model="google/flan-t5-small" 64 | tokenizer = AutoTokenizer.from_pretrained(model) 65 | return pipeline( 66 | task="text2text-generation", 67 | model=model, 68 | tokenizer = tokenizer, 69 | max_new_tokens=100, 70 | model_kwargs={"device_map": "auto", "load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.} 71 | ) 72 | @classmethod 73 | def create_flan_t5_base(cls, load_in_8bit=False): 74 | # Wrap it in HF pipeline for use with LangChain 75 | model="google/flan-t5-base" 76 | tokenizer = AutoTokenizer.from_pretrained(model) 77 | return pipeline( 78 | task="text2text-generation", 79 | model=model, 80 | tokenizer = tokenizer, 81 | max_new_tokens=100, 82 | model_kwargs={"device_map": "auto", "load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.} 83 | ) 84 | @classmethod 85 | def create_flan_t5_large(cls, load_in_8bit=False): 86 | # Wrap it in HF pipeline for use with LangChain 87 | model="google/flan-t5-large" 88 | tokenizer = AutoTokenizer.from_pretrained(model) 89 | return pipeline( 90 | task="text2text-generation", 91 | model=model, 92 | tokenizer = tokenizer, 93 | max_new_tokens=100, 94 | model_kwargs={"device_map": "auto", "load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.} 95 | ) 96 | @classmethod 97 | def create_fastchat_t5_xl(cls, load_in_8bit=False): 98 | return pipeline( 99 | task="text2text-generation", 100 | model = "lmsys/fastchat-t5-3b-v1.0", 101 | max_new_tokens=100, 102 | model_kwargs={"device_map": "auto", "load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.} 103 | ) 104 | 105 | @classmethod 106 | def create_falcon_instruct_small(cls, load_in_8bit=False): 107 | model = "tiiuae/falcon-7b-instruct" 108 | 109 | tokenizer = AutoTokenizer.from_pretrained(model) 110 | hf_pipeline = pipeline( 111 | task="text-generation", 112 | model = model, 113 | tokenizer = tokenizer, 114 | trust_remote_code = True, 115 | max_new_tokens=100, 116 | model_kwargs={ 117 | "device_map": "auto", 118 | "load_in_8bit": load_in_8bit, 119 | "max_length": 512, 120 | "temperature": 0.01, 121 | "torch_dtype":torch.bfloat16, 122 | } 123 | ) 124 | return hf_pipeline 125 | 126 | def init_embeddings(self) -> None: 127 | # OpenAI ada embeddings API 128 | if self.config["embedding"] == EMB_OPENAI_ADA: 129 | self.embedding = OpenAIEmbeddings() 130 | elif self.config["embedding"] == EMB_INSTRUCTOR_XL: 131 | # Local INSTRUCTOR-XL embeddings 132 | if self.embedding is None: 133 | self.embedding = PdfQA.create_instructor_xl() 134 | elif self.config["embedding"] == EMB_SBERT_MPNET_BASE: 135 | ## this is for SBERT 136 | if self.embedding is None: 137 | self.embedding = PdfQA.create_sbert_mpnet() 138 | else: 139 | self.embedding = None ## DuckDb uses sbert embeddings 140 | # raise ValueError("Invalid config") 141 | 142 | def init_models(self) -> None: 143 | """ Initialize LLM models based on config """ 144 | load_in_8bit = self.config.get("load_in_8bit",False) 145 | # OpenAI GPT 3.5 API 146 | if self.config["llm"] == LLM_OPENAI_GPT35: 147 | # OpenAI GPT 3.5 API 148 | pass 149 | elif self.config["llm"] == LLM_FLAN_T5_SMALL: 150 | if self.llm is None: 151 | self.llm = PdfQA.create_flan_t5_small(load_in_8bit=load_in_8bit) 152 | elif self.config["llm"] == LLM_FLAN_T5_BASE: 153 | if self.llm is None: 154 | self.llm = PdfQA.create_flan_t5_base(load_in_8bit=load_in_8bit) 155 | elif self.config["llm"] == LLM_FLAN_T5_LARGE: 156 | if self.llm is None: 157 | self.llm = PdfQA.create_flan_t5_large(load_in_8bit=load_in_8bit) 158 | elif self.config["llm"] == LLM_FLAN_T5_XL: 159 | if self.llm is None: 160 | self.llm = PdfQA.create_flan_t5_xl(load_in_8bit=load_in_8bit) 161 | elif self.config["llm"] == LLM_FLAN_T5_XXL: 162 | if self.llm is None: 163 | self.llm = PdfQA.create_flan_t5_xxl(load_in_8bit=load_in_8bit) 164 | elif self.config["llm"] == LLM_FASTCHAT_T5_XL: 165 | if self.llm is None: 166 | self.llm = PdfQA.create_fastchat_t5_xl(load_in_8bit=load_in_8bit) 167 | elif self.config["llm"] == LLM_FALCON_SMALL: 168 | if self.llm is None: 169 | self.llm = PdfQA.create_falcon_instruct_small(load_in_8bit=load_in_8bit) 170 | 171 | else: 172 | raise ValueError("Invalid config") 173 | def vector_db_pdf(self) -> None: 174 | """ 175 | creates vector db for the embeddings and persists them or loads a vector db from the persist directory 176 | """ 177 | pdf_path = self.config.get("pdf_path",None) 178 | persist_directory = self.config.get("persist_directory",None) 179 | if persist_directory and os.path.exists(persist_directory): 180 | ## Load from the persist db 181 | self.vectordb = Chroma(persist_directory=persist_directory, embedding_function=self.embedding) 182 | elif pdf_path and os.path.exists(pdf_path): 183 | ## 1. Extract the documents 184 | loader = PDFPlumberLoader(pdf_path) 185 | documents = loader.load() 186 | ## 2. Split the texts 187 | text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0) 188 | texts = text_splitter.split_documents(documents) 189 | # text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=10, encoding_name="cl100k_base") # This the encoding for text-embedding-ada-002 190 | text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=10) # This the encoding for text-embedding-ada-002 191 | texts = text_splitter.split_documents(texts) 192 | 193 | ## 3. Create Embeddings and add to chroma store 194 | ##TODO: Validate if self.embedding is not None 195 | self.vectordb = Chroma.from_documents(documents=texts, embedding=self.embedding, persist_directory=persist_directory) 196 | else: 197 | raise ValueError("NO PDF found") 198 | 199 | def retreival_qa_chain(self): 200 | """ 201 | Creates retrieval qa chain using vectordb as retrivar and LLM to complete the prompt 202 | """ 203 | ##TODO: Use custom prompt 204 | self.retriever = self.vectordb.as_retriever(search_kwargs={"k":3}) 205 | 206 | if self.config["llm"] == LLM_OPENAI_GPT35: 207 | # Use ChatGPT API 208 | self.qa = RetrievalQA.from_chain_type(llm=OpenAI(model_name=LLM_OPENAI_GPT35, temperature=0.), chain_type="stuff",\ 209 | retriever=self.vectordb.as_retriever(search_kwargs={"k":3})) 210 | else: 211 | hf_llm = HuggingFacePipeline(pipeline=self.llm,model_id=self.config["llm"]) 212 | 213 | self.qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff",retriever=self.retriever) 214 | if self.config["llm"] == LLM_FLAN_T5_SMALL or self.config["llm"] == LLM_FLAN_T5_BASE or self.config["llm"] == LLM_FLAN_T5_LARGE: 215 | question_t5_template = """ 216 | context: {context} 217 | question: {question} 218 | answer: 219 | """ 220 | QUESTION_T5_PROMPT = PromptTemplate( 221 | template=question_t5_template, input_variables=["context", "question"] 222 | ) 223 | self.qa.combine_documents_chain.llm_chain.prompt = QUESTION_T5_PROMPT 224 | self.qa.combine_documents_chain.verbose = True 225 | self.qa.return_source_documents = True 226 | def answer_query(self,question:str) ->str: 227 | """ 228 | Answer the question 229 | """ 230 | 231 | answer_dict = self.qa({"query":question,}) 232 | print(answer_dict) 233 | answer = answer_dict["result"] 234 | if self.config["llm"] == LLM_FASTCHAT_T5_XL: 235 | answer = self._clean_fastchat_t5_output(answer) 236 | return answer 237 | def _clean_fastchat_t5_output(self, answer: str) -> str: 238 | # Remove tags, double spaces, trailing newline 239 | answer = re.sub(r"\s+", "", answer) 240 | answer = re.sub(r" ", " ", answer) 241 | answer = re.sub(r"\n$", "", answer) 242 | return answer 243 | -------------------------------------------------------------------------------- /LLMs/QA_app/pipeline_qa.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "QA demo" 3 | tasks: 4 | - name: "QA app" 5 | type: "bash script" 6 | port: 8787 7 | bash_script_path: "LLM/QA_app/run_qa.sh" -------------------------------------------------------------------------------- /LLMs/QA_app/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.189 2 | chromadb==0.3.25 3 | openai==0.27.7 4 | pdfplumber==0.9.0 5 | tiktoken==0.4.0 6 | torch==2.0.1 7 | transformers==4.29.2 8 | accelerate==0.19.0 9 | sentence-transformers==2.2.2 10 | einops==0.6.1 11 | xformers==0.0.20 12 | itsdangerous==2.1.2 13 | bitsandbytes==0.39.0 -------------------------------------------------------------------------------- /LLMs/QA_app/run_qa.sh: -------------------------------------------------------------------------------- 1 | PROJECT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 2 | 3 | cd "$PROJECT_DIR" 4 | 5 | # Set the name of the virtual environment 6 | VENV_NAME=.venv 7 | 8 | ## Check if the virtual environment exists 9 | if [ ! -d "$VENV_NAME" ]; then 10 | # Create the virtual environment 11 | python -m venv "$VENV_NAME" 12 | fi 13 | source "$VENV_NAME/bin/activate" 14 | # export PATH="/root/.local/bin:$PATH" 15 | pip freeze 16 | pip install -r requirements.txt --ignore-installed 17 | pip freeze 18 | export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python 19 | export STREAMLIT_RUNONSSAVE=True 20 | 21 | 22 | streamlit run streamlit_app_blog.py --server.port 8787 --browser.serverAddress localhost --server.fileWatcherType none 23 | 24 | deactivate -------------------------------------------------------------------------------- /LLMs/QA_app/streamlit_app_blog.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from pdf_qa import PdfQA 3 | from pathlib import Path 4 | from tempfile import NamedTemporaryFile 5 | import time 6 | import shutil 7 | from constants import * 8 | 9 | 10 | 11 | # Streamlit app code 12 | st.set_page_config( 13 | page_title='Q&A Bot for PDF', 14 | page_icon='🔖', 15 | layout='wide', 16 | initial_sidebar_state='auto', 17 | ) 18 | 19 | 20 | if "pdf_qa_model" not in st.session_state: 21 | st.session_state["pdf_qa_model"]:PdfQA = PdfQA() ## Intialisation 22 | 23 | ## To cache resource across multiple session 24 | @st.cache_resource 25 | def load_llm(llm,load_in_8bit): 26 | 27 | if llm == LLM_OPENAI_GPT35: 28 | pass 29 | elif llm == LLM_FLAN_T5_SMALL: 30 | return PdfQA.create_flan_t5_small(load_in_8bit) 31 | elif llm == LLM_FLAN_T5_BASE: 32 | return PdfQA.create_flan_t5_base(load_in_8bit) 33 | elif llm == LLM_FLAN_T5_LARGE: 34 | return PdfQA.create_flan_t5_large(load_in_8bit) 35 | elif llm == LLM_FASTCHAT_T5_XL: 36 | return PdfQA.create_fastchat_t5_xl(load_in_8bit) 37 | elif llm == LLM_FALCON_SMALL: 38 | return PdfQA.create_falcon_instruct_small(load_in_8bit) 39 | else: 40 | raise ValueError("Invalid LLM setting") 41 | 42 | ## To cache resource across multiple session 43 | @st.cache_resource 44 | def load_emb(emb): 45 | if emb == EMB_INSTRUCTOR_XL: 46 | return PdfQA.create_instructor_xl() 47 | elif emb == EMB_SBERT_MPNET_BASE: 48 | return PdfQA.create_sbert_mpnet() 49 | elif emb == EMB_SBERT_MINILM: 50 | pass ##ChromaDB takes care 51 | else: 52 | raise ValueError("Invalid embedding setting") 53 | 54 | 55 | 56 | st.title("PDF Q&A (Self hosted LLMs)") 57 | 58 | with st.sidebar: 59 | emb = st.radio("**Select Embedding Model**", [EMB_INSTRUCTOR_XL, EMB_SBERT_MPNET_BASE,EMB_SBERT_MINILM],index=1) 60 | llm = st.radio("**Select LLM Model**", [LLM_FASTCHAT_T5_XL, LLM_FLAN_T5_SMALL,LLM_FLAN_T5_BASE,LLM_FLAN_T5_LARGE,LLM_FLAN_T5_XL,LLM_FALCON_SMALL],index=2) 61 | load_in_8bit = st.radio("**Load 8 bit**", [True, False],index=1) 62 | pdf_file = st.file_uploader("**Upload PDF**", type="pdf") 63 | 64 | 65 | if st.button("Submit") and pdf_file is not None: 66 | with st.spinner(text="Uploading PDF and Generating Embeddings.."): 67 | with NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: 68 | shutil.copyfileobj(pdf_file, tmp) 69 | tmp_path = Path(tmp.name) 70 | st.session_state["pdf_qa_model"].config = { 71 | "pdf_path": str(tmp_path), 72 | "embedding": emb, 73 | "llm": llm, 74 | "load_in_8bit": load_in_8bit 75 | } 76 | st.session_state["pdf_qa_model"].embedding = load_emb(emb) 77 | st.session_state["pdf_qa_model"].llm = load_llm(llm,load_in_8bit) 78 | st.session_state["pdf_qa_model"].init_embeddings() 79 | st.session_state["pdf_qa_model"].init_models() 80 | st.session_state["pdf_qa_model"].vector_db_pdf() 81 | st.sidebar.success("PDF uploaded successfully") 82 | 83 | question = st.text_input('Ask a question', 'What is this document?') 84 | 85 | if st.button("Answer"): 86 | try: 87 | st.session_state["pdf_qa_model"].retreival_qa_chain() 88 | answer = st.session_state["pdf_qa_model"].answer_query(question) 89 | st.write(f"{answer}") 90 | except Exception as e: 91 | st.error(f"Error answering the question: {str(e)}") -------------------------------------------------------------------------------- /LLMs/confluence_app/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import os 3 | import json 4 | import time 5 | from dotenv import load_dotenv 6 | 7 | # Import the ConfluenceQA class 8 | from confluence_qa import ConfluenceQA 9 | 10 | try: 11 | from hyperplane.utils import is_jhub 12 | if is_jhub(): 13 | openaiKeyFile = '/root/.secret/openai_key.json' 14 | else: 15 | openaiKeyFile = '/etc/hyperplane/secrets/openai_key.json' 16 | with open(openaiKeyFile) as f: 17 | os.environ["OPENAI_API_KEY"] = json.load(f)['openai_key'] 18 | except Exception as e: 19 | print(e) 20 | load_dotenv() 21 | 22 | st.set_page_config( 23 | page_title='Q&A Bot for Confluence Page', 24 | page_icon='⚡', 25 | layout='wide', 26 | initial_sidebar_state='auto', 27 | ) 28 | if "config" not in st.session_state: 29 | st.session_state["config"] = {} 30 | if "confluence_qa" not in st.session_state: 31 | st.session_state["confluence_qa"] = None 32 | 33 | @st.cache_resource 34 | def load_confluence(config): 35 | # st.write("loading the confluence page") 36 | confluence_qa = ConfluenceQA(config=config) 37 | confluence_qa.init_embeddings() 38 | confluence_qa.init_models() 39 | confluence_qa.vector_db_confluence_docs() 40 | confluence_qa.retreival_qa_chain() 41 | return confluence_qa 42 | 43 | with st.sidebar.form(key ='Form1'): 44 | st.markdown('## Add your configs') 45 | confluence_url = st.text_input("paste the confluence URL", "https://templates.atlassian.net/wiki/") 46 | username = st.text_input(label="confluence username", 47 | help="leave blank if confluence page is public", 48 | type="password") 49 | space_key = st.text_input(label="confluence space", 50 | help="Space of Confluence", 51 | value="RD") 52 | api_key = st.text_input(label="confluence api key", 53 | help="leave blank if confluence page is public", 54 | type="password") 55 | submitted1 = st.form_submit_button(label='Submit') 56 | 57 | if submitted1 and confluence_url and space_key: 58 | st.session_state["config"] = { 59 | "persist_directory": None, 60 | "confluence_url": confluence_url, 61 | "username": username if username != "" else None, 62 | "api_key": api_key if api_key != "" else None, 63 | "space_key": space_key, 64 | } 65 | with st.spinner(text="Ingesting Confluence..."): 66 | ### Hardcoding for https://templates.atlassian.net/wiki/ and space RD to avoid multiple OpenAI calls. 67 | config = st.session_state["config"] 68 | if config["confluence_url"] == "https://templates.atlassian.net/wiki/" and config["space_key"] =="RD": 69 | config["persist_directory"] = "chroma_db" 70 | st.session_state["config"] = config 71 | 72 | st.session_state["confluence_qa"] = load_confluence(st.session_state["config"]) 73 | st.write("Confluence Space Ingested") 74 | 75 | 76 | st.title("Confluence Q&A Demo") 77 | 78 | question = st.text_input('Ask a question', "How do I make a space public?") 79 | 80 | if st.button('Get Answer', key='button2'): 81 | with st.spinner(text="Asking LLM..."): 82 | confluence_qa = st.session_state.get("confluence_qa") 83 | if confluence_qa is not None: 84 | result = confluence_qa.answer_confluence(question) 85 | st.write(result) 86 | else: 87 | st.write("Please load Confluence page first.") -------------------------------------------------------------------------------- /LLMs/confluence_app/confluence_qa.py: -------------------------------------------------------------------------------- 1 | import os 2 | from langchain.document_loaders import ConfluenceLoader 3 | from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter 4 | from langchain.embeddings.openai import OpenAIEmbeddings 5 | from langchain.prompts import PromptTemplate 6 | from langchain.chat_models import ChatOpenAI 7 | from constants import * 8 | 9 | from langchain.vectorstores import Chroma 10 | from langchain.chains import RetrievalQA 11 | 12 | class ConfluenceQA: 13 | def __init__(self,config:dict = {}): 14 | self.config = config 15 | self.embedding = None 16 | self.vectordb = None 17 | self.llm = None 18 | self.qa = None 19 | self.retriever = None 20 | def init_embeddings(self) -> None: 21 | # OpenAI ada embeddings API 22 | self.embedding = OpenAIEmbeddings() 23 | def init_models(self) -> None: 24 | # OpenAI GPT 3.5 API 25 | self.llm = ChatOpenAI(model_name=LLM_OPENAI_GPT35, temperature=0.) 26 | 27 | def vector_db_confluence_docs(self,force_reload:bool= False) -> None: 28 | """ 29 | creates vector db for the embeddings and persists them or loads a vector db from the persist directory 30 | """ 31 | persist_directory = self.config.get("persist_directory",None) 32 | confluence_url = self.config.get("confluence_url",None) 33 | username = self.config.get("username",None) 34 | api_key = self.config.get("api_key",None) 35 | space_key = self.config.get("space_key",None) 36 | if persist_directory and os.path.exists(persist_directory) and not force_reload: 37 | ## Load from the persist db 38 | self.vectordb = Chroma(persist_directory=persist_directory, embedding_function=self.embedding) 39 | else: 40 | ## 1. Extract the documents 41 | loader = ConfluenceLoader( 42 | url=confluence_url, 43 | username = username, 44 | api_key= api_key 45 | ) 46 | documents = loader.load( 47 | space_key=space_key, 48 | limit=100) 49 | ## 2. Split the texts 50 | text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0) 51 | texts = text_splitter.split_documents(documents) 52 | text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base") # This the encoding for text-embedding-ada-002 53 | texts = text_splitter.split_documents(texts) 54 | 55 | ## 3. Create Embeddings and add to chroma store 56 | ##TODO: Validate if self.embedding is not None 57 | self.vectordb = Chroma.from_documents(documents=texts, embedding=self.embedding, persist_directory=persist_directory) 58 | def retreival_qa_chain(self): 59 | """ 60 | Creates retrieval qa chain using vectordb as retrivar and LLM to complete the prompt 61 | """ 62 | ##TODO: Use custom prompt 63 | self.retriever = self.vectordb.as_retriever(search_kwargs={"k":4}) 64 | self.qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff",retriever=self.retriever) 65 | 66 | def answer_confluence(self,question:str) ->str: 67 | """ 68 | Answer the question 69 | """ 70 | answer = self.qa.run(question) 71 | return answer 72 | -------------------------------------------------------------------------------- /LLMs/confluence_app/constants.py: -------------------------------------------------------------------------------- 1 | # Constants 2 | EMB_OPENAI_ADA = "text-embedding-ada-002" 3 | EMB_SBERT = None # Chroma takes care 4 | 5 | LLM_OPENAI_GPT35 = "gpt-3.5-turbo" 6 | -------------------------------------------------------------------------------- /LLMs/confluence_app/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "QA demo" 3 | tasks: 4 | - name: "QA app" 5 | type: "bash script" 6 | port: 8787 7 | bash_script_path: "LLMs/confluence_app/run.sh" 8 | -------------------------------------------------------------------------------- /LLMs/confluence_app/readme.md: -------------------------------------------------------------------------------- 1 | # Building Confluence QA App on Shakudo 2 | With advancements in AI and NLP like OpenAI's GPT-4 and LangChain, we're revolutionizing chatbot interactions. In this post, we'll explore how to use Shakudo to simplify and enhance the process of building a Q&A app for Internal Knowledge base from conceptualization to deployment. 3 | 4 | 5 | ## References: 6 | 7 | * Code base reference [LLM-WikipediaQA](https://github.com/georgesung/LLM-WikipediaQA/tree/main): Compares Open source models like FastChat-T5, Flan-T5 with ChatGPT on Wikipedia QA 8 | 9 | -------------------------------------------------------------------------------- /LLMs/confluence_app/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.189 2 | chromadb==0.3.25 3 | openai==0.27.6 4 | pytesseract==0.3.10 5 | beautifulsoup4==4.12.2 6 | atlassian-python-api==3.38.0 7 | tiktoken==0.4.0 8 | streamlit==1.23.1 9 | lxml==4.9.2 -------------------------------------------------------------------------------- /LLMs/confluence_app/run.sh: -------------------------------------------------------------------------------- 1 | PROJECT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 2 | 3 | cd "$PROJECT_DIR" 4 | pip install -r requirements.txt 5 | 6 | export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python 7 | export STREAMLIT_RUNONSSAVE=True 8 | streamlit run app.py --server.port 8787 --browser.serverAddress localhost 9 | -------------------------------------------------------------------------------- /LLMs/milvus/requirements.txt: -------------------------------------------------------------------------------- 1 | pymilvus==2.3.0 2 | python-dotenv==1.0.0 3 | langchain==0.0.274 4 | -------------------------------------------------------------------------------- /LLMs/milvus_chatbot/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.274 2 | text-generation==0.6.0 3 | pymilvus==2.3.0 4 | python-dotenv==1.0.0 5 | grpcio==1.56.0 -------------------------------------------------------------------------------- /LLMs/milvus_chatbot/service/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | import json 4 | 5 | import numpy 6 | import copy 7 | 8 | from pymilvus import Collection, utility 9 | 10 | import os 11 | from dotenv import load_dotenv 12 | load_dotenv(override=True) 13 | 14 | import langchain 15 | from langchain.embeddings import HuggingFaceEmbeddings 16 | 17 | import text_generation 18 | from langchain import PromptTemplate 19 | 20 | from pymilvus import connections 21 | connection = connections.connect( 22 | alias="default", 23 | host="milvus.hyperplane-milvus.svc.cluster.local", 24 | port=19530, 25 | ) 26 | 27 | SVC_EP=os.environ['HYPERPLANE_JOB_PARAMETER_LLM_ENDPOINT'] 28 | client = text_generation.Client(SVC_EP) 29 | 30 | embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2') 31 | 32 | whcollection = Collection("WikiHow") 33 | whcollection.load() 34 | 35 | assistant_string = "ASSISTANT:\n" 36 | user_string = "USER:\n" 37 | document_string="DOCUMENT:\n" 38 | 39 | prompt_template = PromptTemplate.from_template( 40 | f"""\ 41 | {{turns}} 42 | {document_string}{{context}} 43 | {user_string}What does the document say about {{prompt}} 44 | Give me a summary. If the information is not there let me know. 45 | 46 | {assistant_string} 47 | """ 48 | ) 49 | 50 | def generate(what, turns, context, topic, whole_context): 51 | found = whcollection.search( 52 | [embeddings.embed_query(what)], 53 | anns_field="vector", 54 | param={'metric_type': 'L2', 55 | 'offset': 0, 56 | 'params': {'nprobe': 1} 57 | }, 58 | limit=1, 59 | output_fields=['text', 'title']) 60 | match_title = found[0][0].entity.get('title') 61 | match_text = found[0][0].entity.get('text') 62 | match_dist = found[0][0].distance 63 | 64 | retrieved = "" 65 | 66 | if match_title != topic and match_dist < 0.75: 67 | retrieved = match_text 68 | retrieved = retrieved[:1024] 69 | context = retrieved 70 | whole_context = match_text 71 | topic = match_title 72 | turntxt = "\n".join(turns)[-2048:] 73 | preface = ("No information available" if context is None else context) 74 | 75 | return { 76 | 'stream': client.generate_stream(prompt=prompt_template.format(turns=turntxt, prompt=what, context=preface), 77 | max_new_tokens=512, 78 | repetition_penalty=1.2, 79 | ), 80 | 81 | 'topic': topic, 82 | 'context': context, 83 | 'whole_context': whole_context 84 | } 85 | 86 | if "messages" not in st.session_state.keys(): 87 | st.session_state.messages = [] 88 | 89 | if "cache" not in st.session_state.keys(): 90 | st.session_state.cache = [[], "", None, ""] 91 | 92 | TURNS = 0 93 | CTX = 1 94 | TOPIC = 2 95 | WHOLE_CTX = 3 96 | 97 | turns = st.session_state.cache[TURNS] 98 | context = st.session_state.cache[CTX] 99 | topic = st.session_state.cache[TOPIC] 100 | whole_context = st.session_state.cache[WHOLE_CTX] 101 | 102 | for message in st.session_state.messages: 103 | with st.chat_message(message["role"]): 104 | st.write(message["content"]) 105 | 106 | ipt = st.chat_input(key="chat_query", placeholder="How can I run faster?") 107 | 108 | if ipt is not None: 109 | st.session_state.messages.append({'role': 'user', 'content': ipt}) 110 | with st.chat_message("user"): 111 | st.write(ipt) 112 | 113 | if len(st.session_state.messages) != 0 and st.session_state.messages[-1]["role"] != "response": 114 | with st.chat_message("response"): 115 | with st.spinner("Thinking..."): 116 | response = generate(ipt, turns, context, topic, whole_context) 117 | st.session_state.cache[CTX] = response['context'] 118 | st.session_state.cache[TOPIC] = response['topic'] 119 | st.session_state.cache[WHOLE_CTX] = response['whole_context'] 120 | with st.expander("Active Document"): 121 | st.write(response['whole_context']) 122 | 123 | turns.append(user_string + ipt) 124 | 125 | resp = "" 126 | placeholder = st.empty() 127 | for tok in response['stream']: 128 | if not tok.token.special: 129 | resp += tok.token.text 130 | placeholder.markdown(resp) 131 | placeholder.markdown(resp) 132 | 133 | st.session_state.cache[TURNS].append(assistant_string + resp) 134 | 135 | message = {"role": "response", "content": resp} 136 | st.session_state.messages.append(message) 137 | -------------------------------------------------------------------------------- /LLMs/milvus_chatbot/service/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.274 2 | text-generation==0.6.0 3 | pymilvus==2.3.0 4 | python-dotenv==1.0.0 5 | grpcio==1.56.0 6 | streamlit 7 | sentence_transformers -------------------------------------------------------------------------------- /LLMs/milvus_chatbot/service/run.sh: -------------------------------------------------------------------------------- 1 | streamlit run milvus-demo/chatbot/service/app.py --server.port 8787 --browser.serverAddress localhost -------------------------------------------------------------------------------- /LLMs/milvus_chatbot/service/svc.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "MultiturnRAG" 3 | requirements: "milvus-demo/chatbot/service/requirements.txt" 4 | tasks: 5 | - name: "Muliturn RAG demo" 6 | type: "bash script" 7 | port: 8787 8 | bash_script_path: "milvus-demo/chatbot/service/run.sh" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Example notebooks 2 | You can find here a list of the official notebooks and scrips provided by Hyperplane. 3 | 4 | Also, we would like to list here interesting content created by the community. If you wrote some notebook(s) leveraging Hyperplane and would like be listed here, please open a Pull Request so it can be included under the Community notebooks. 5 | 6 | | Notebook | Description | 7 | | :----------------| :--------------------------------------------------------------------------- | 8 | | [Data processing with DASK](https://github.com/devsentient/examples/blob/main/example_notebooks/data_prep/dask_group_sort.ipynb) | Speed up data preprocessing with distributed DASK cluster on Hyperplane | 9 | | [Data prepcessing with Ray](https://github.com/devsentient/examples/blob/main/example_notebooks/data_prep/ray_data_processing.ipynb) | speed up data preprocessing with distributed Ray cluster on Hyperplane | 10 | | [Data preprocessing with Spark](https://github.com/devsentient/examples/blob/main/example_notebooks/data_prep/spark_on_ray_data_processing.ipynb) | speed up data preprocessing with distributed spark on Ray clusters on Hyperplane | 11 | | [Question answering Tensorflow Training with Ray ](https://github.com/devsentient/examples/blob/main/example_notebooks/training/ray_tf_nlp.ipynb) | Tensorflow training with distributed hyperparameter tuning on Ray cluster | 12 | | [CIFAR Pytorch Training with Ray ](https://github.com/devsentient/examples/blob/main/example_notebooks/training/ray_training_torch.ipynb) | pytorch training with distributed hyperparameter tuning and MLFlow on Ray cluster | 13 | | [Ray Tune with MLFlow](https://github.com/devsentient/examples/blob/main/example_notebooks/training/ray_mlflow.ipynb) | Simple Ray Tune example wth MLFlow Tracking | 14 | | [Ray Tune Bayesian Optimization](https://github.com/devsentient/examples/blob/main/example_notebooks/training/ray_tune_demo.ipynb) | A collection of Ray Tune scheduler examples | 15 | | [Speed up inference on large data with DASK ](https://github.com/devsentient/examples/blob/main/example_notebooks/inference/batch_inference) | Advance example on speed inference with DASK by preload large model to DASK workers | 16 | | [Triton model prepration ](https://github.com/devsentient/examples/tree/main/example_notebooks/serving/triton) | Convert Pytorch Keras sklearn and xgboost model checkpoints for Triton serving | 17 | | [Hyperplane Triton service App ](https://github.com/devsentient/examples/tree/main/example_notebooks/serving/triton) | Triton client in a flask APP to be served as a Hyperplane service| 18 | | [Simple pipeline job ](https://github.com/devsentient/examples/tree/main/example_notebooks/pipelines/a_simple_pipelinen) | A basic pipeline to automate a jupyter notebook with parameterization | 19 | | [GraphQL within jupyter ](https://github.com/devsentient/examples/blob/main/example_notebooks/utils/submit_graphql_with_python.ipynb) | Submit pipeline jobs using graphql queries within jupyter a notebook 20 | -------------------------------------------------------------------------------- /example_notebooks/data_prep/dask_parallelize_a_loop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1510e2e2-8001-4ce9-b5d8-cc78a2b966fc", 6 | "metadata": {}, 7 | "source": [ 8 | "### example code for submitting a distributed dask job with a function and a list\n", 9 | "#### more info on dask distributed client https://distributed.dask.org/en/stable/client.html" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "f92fa29f-4ed4-4c9b-9f04-c3e3996456f4", 15 | "metadata": {}, 16 | "source": [ 17 | "## A normal looping of a list" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "id": "3b4e40f6-62bb-417e-ae32-1a2d0ef68abc", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import time\n", 28 | "from tqdm.notebook import tqdm" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "id": "c8785cd2-7519-4fbc-9130-330dcaab0070", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "def process_one_item(item):\n", 39 | " # result = some_magic(item)\n", 40 | " time.sleep(1)\n", 41 | " result = 'done'\n", 42 | " return result" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "id": "d2b1cb1d-130b-47c5-ad34-7143bcc1304a", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "list_of_things = list(range(30))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "id": "d50ec4cc-6c8b-4ae3-93c4-4f69b5128ba7", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "application/vnd.jupyter.widget-view+json": { 64 | "model_id": "a631d9619e3c4adf99dff0243d4b2d78", 65 | "version_major": 2, 66 | "version_minor": 0 67 | }, 68 | "text/plain": [ 69 | " 0%| | 0/30 [00:00\n", 149 | "\n", 150 | "\n", 151 | "

Client

\n", 152 | "\n", 156 | "\n", 157 | "\n", 158 | "

Cluster

\n", 159 | "\n", 164 | "\n", 165 | "\n", 166 | "" 167 | ], 168 | "text/plain": [ 169 | "" 170 | ] 171 | }, 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "## whenever you want to clean up the dask memory this is the magic line\n", 179 | "client.restart()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "id": "6e815a06-170d-4e1e-b93d-81bc5b62ed39", 185 | "metadata": {}, 186 | "source": [ 187 | "## use dask bags" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 6, 193 | "id": "2e13648c-252b-4eef-83eb-5f7dda567277", 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "30" 200 | ] 201 | }, 202 | "execution_count": 6, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "len(list_of_things)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 7, 214 | "id": "2dd8b3ab-1519-4691-bd36-8730b2641862", 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "30 {'done'}\n", 222 | "CPU times: user 24.4 ms, sys: 4.15 ms, total: 28.6 ms\n", 223 | "Wall time: 1.22 s\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "%%time \n", 229 | "from dask import bag as db\n", 230 | "bag_list = db.from_sequence(list_of_things, npartitions=len(client.nthreads()))\n", 231 | "results = db.map(process_one_item, bag_list).compute()\n", 232 | "print(len(results), set(results))" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "a1334a52-75cd-402b-ab02-67df40676479", 238 | "metadata": {}, 239 | "source": [ 240 | "## use barebone dask distributed" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 8, 246 | "id": "5336925c-c560-42e6-b8b2-6f3f0c67ad1a", 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "CPU times: user 3.1 ms, sys: 116 µs, total: 3.21 ms\n", 254 | "Wall time: 2.41 ms\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "%%time\n", 260 | "## run it on dask cluster for all urls\n", 261 | "L = client.map(process_one_item, list_of_things)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 12, 267 | "id": "6a3ac9f7-0a92-40f9-aa58-5523c663b56a", 268 | "metadata": { 269 | "tags": [] 270 | }, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "['done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done', 'done']\n", 277 | "CPU times: user 5.31 ms, sys: 1.04 ms, total: 6.35 ms\n", 278 | "Wall time: 15.7 ms\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "%%time\n", 284 | "results_distributed = client.gather(L)\n", 285 | "print(results_distributed) ## this will display a list of result, results being the return from process_one_url function" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 13, 291 | "id": "2fac2023-3167-4208-bc5d-69081f017d3a", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "client.close()\n", 296 | "cluster.close()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "id": "e1251987-2a07-48fb-ba89-1b6da85a9e9a", 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [] 306 | } 307 | ], 308 | "metadata": { 309 | "kernelspec": { 310 | "display_name": "Python 3 (ipykernel)", 311 | "language": "python", 312 | "name": "python3" 313 | }, 314 | "language_info": { 315 | "codemirror_mode": { 316 | "name": "ipython", 317 | "version": 3 318 | }, 319 | "file_extension": ".py", 320 | "mimetype": "text/x-python", 321 | "name": "python", 322 | "nbconvert_exporter": "python", 323 | "pygments_lexer": "ipython3", 324 | "version": "3.8.10" 325 | } 326 | }, 327 | "nbformat": 4, 328 | "nbformat_minor": 5 329 | } 330 | -------------------------------------------------------------------------------- /example_notebooks/data_prep/rapids_data_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e8facdaf", 6 | "metadata": {}, 7 | "source": [ 8 | "## An example of using RAPIDS to speed up pandas operations on Hyperplane\n", 9 | "- The task is to groupby and sorting about 3G of data on s3 bucket " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "id": "af2c97bf", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import warnings\n", 20 | "warnings.filterwarnings(\"ignore\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "8cb76ab5", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import os\n", 31 | "import dask\n", 32 | "from hyperplane import notebook_common as nc" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "id": "d7c4064b", 39 | "metadata": { 40 | "tags": [] 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "👉 Hyperplane: selecting worker node pool\n", 48 | "👉 Hyperplane: selecting scheduler node pool\n", 49 | "👉 Hyperplane: you can access your dask dashboard at https://jhub.ds.hyperplane.dev/hub/user-redirect/proxy/45601/status\n", 50 | "👉 Hyperplane: to get logs from all workers, do `cluster.get_logs()`\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "client, cluster = nc.initialize_cluster(\n", 56 | " nprocs=1,\n", 57 | " nthreads=8,\n", 58 | " ram_gb_per_proc=7,\n", 59 | " cores_per_worker=2,\n", 60 | " num_workers = 2,\n", 61 | " ngpus = 1,\n", 62 | " scheduler_deploy_mode=\"local\"\n", 63 | ")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "id": "1382c475", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "\n", 76 | "\n", 77 | "\n", 84 | "\n", 92 | "\n", 93 | "
\n", 78 | "

Client

\n", 79 | "\n", 83 | "
\n", 85 | "

Cluster

\n", 86 | "
    \n", 87 | "
  • Workers: 2
  • \n", 88 | "
  • Cores: 16
  • \n", 89 | "
  • Memory: 14.57 GiB
  • \n", 90 | "
\n", 91 | "
" 94 | ], 95 | "text/plain": [ 96 | "" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "client" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "id": "e309f622", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from dask.distributed import Client\n", 116 | "client = Client(cluster)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "id": "24425127", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "import dask_cudf" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "id": "3c138276", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "file_path = \"s3://dask-data/airline-data/*.csv\"" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 9, 142 | "id": "c221a12f", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "flight_df = dask_cudf.read_csv(file_path, assume_missing=True,\n", 147 | " usecols = [\"UniqueCarrier\",\"FlightNum\",\"Distance\"])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 10, 153 | "id": "1ee25618", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/html": [ 159 | "
\n", 160 | "\n", 173 | "\n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | "
UniqueCarrierFlightNumDistance
0PS1451.0447.0
1PS1451.0447.0
2PS1451.0447.0
3PS1451.0447.0
4PS1451.0447.0
\n", 215 | "
" 216 | ], 217 | "text/plain": [ 218 | " UniqueCarrier FlightNum Distance\n", 219 | "0 PS 1451.0 447.0\n", 220 | "1 PS 1451.0 447.0\n", 221 | "2 PS 1451.0 447.0\n", 222 | "3 PS 1451.0 447.0\n", 223 | "4 PS 1451.0 447.0" 224 | ] 225 | }, 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "flight_df.head()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 11, 238 | "id": "fac2898a", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "flight_df_opt = flight_df.groupby(by=[\"UniqueCarrier\",\"FlightNum\"]).Distance.mean()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 12, 248 | "id": "79d49a53", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "CPU times: user 2.08 s, sys: 223 ms, total: 2.3 s\n", 256 | "Wall time: 1min 21s\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "%%time\n", 262 | "flight_df_results = flight_df_opt.compute()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 13, 268 | "id": "e7a88d28", 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "UniqueCarrier FlightNum\n", 275 | "AS 994.0 586.166667\n", 276 | " 920.0 828.272727\n", 277 | "XE 4089.0 583.000000\n", 278 | "WN 3524.0 839.513333\n", 279 | "CO 874.0 588.002579\n", 280 | " ... \n", 281 | "PI 1809.0 195.542662\n", 282 | "NW 1912.0 354.528543\n", 283 | "MQ 3238.0 212.074221\n", 284 | "UA 2563.0 271.504673\n", 285 | "WN 25.0 298.849527\n", 286 | "Name: Distance, Length: 50003, dtype: float64" 287 | ] 288 | }, 289 | "execution_count": 13, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "flight_df_results" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "b3bcb2fa", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "cluster.close()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "id": "1ce927cb", 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [] 315 | } 316 | ], 317 | "metadata": { 318 | "kernelspec": { 319 | "display_name": "Python [conda env:root] *", 320 | "language": "python", 321 | "name": "conda-root-py" 322 | }, 323 | "language_info": { 324 | "codemirror_mode": { 325 | "name": "ipython", 326 | "version": 3 327 | }, 328 | "file_extension": ".py", 329 | "mimetype": "text/x-python", 330 | "name": "python", 331 | "nbconvert_exporter": "python", 332 | "pygments_lexer": "ipython3", 333 | "version": "3.8.10" 334 | } 335 | }, 336 | "nbformat": 4, 337 | "nbformat_minor": 5 338 | } 339 | -------------------------------------------------------------------------------- /example_notebooks/frontend/sentence_classification/run.sh: -------------------------------------------------------------------------------- 1 | PROJECT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 2 | cd "$PROJECT_DIR" 3 | pip install streamlit fastapi opencv-python pillow uvicorn 4 | pip install --force-reinstall --no-deps bokeh==2.4.1 5 | streamlit run slnlp.py --server.port 8787 --browser.serverAddress localhost -------------------------------------------------------------------------------- /example_notebooks/frontend/sentence_classification/slnlp.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import streamlit as st 3 | import pandas as pd 4 | import json 5 | from io import StringIO 6 | 7 | def run_sentiment_analysis(string): 8 | ## send request 9 | endpoint = "your_model_end_point:8787/" 10 | data = { 11 | "string": string 12 | } 13 | headers = { 14 | "Content-Type": "application/json", 15 | } 16 | 17 | try: 18 | result = requests.post(endpoint, data = json.dumps(data), headers=headers).json() 19 | # print('result obtained') 20 | # print(result) 21 | except requests.exceptions.HTTPError as err: 22 | raise SystemExit(err) 23 | 24 | return result 25 | 26 | 27 | 28 | # defines an h1 header 29 | st.title("Sentence analysis") 30 | 31 | st.subheader('Paste some text here get classified') 32 | 33 | ## type text in the box 34 | txt = st.text_area('Text to analyze', ''' 35 | ''') 36 | 37 | st.write("Sentence:", txt) 38 | 39 | txt_result = run_sentiment_analysis(txt) 40 | # print('txt_result', txt_result) 41 | 42 | # st.write('Topic:', txt_result) 43 | st.metric(label="Topic", value=txt_result) 44 | 45 | st.subheader('Upload a text file here to get classified') 46 | 47 | ## file uploader 48 | uploaded_file = st.file_uploader("Choose a file") 49 | if uploaded_file is not None: 50 | # To read file as bytes: 51 | bytes_data = uploaded_file.getvalue() 52 | # st.write(bytes_data) 53 | 54 | # # To convert to a string based IO: 55 | stringio = StringIO(uploaded_file.getvalue().decode("utf-8")) 56 | # st.write(stringio) 57 | 58 | # # To read file as string: 59 | string_data = stringio.read() 60 | # st.write(string_data) 61 | 62 | 63 | # Can be used wherever a "file-like" object is accepted: 64 | dataframe = pd.read_csv(uploaded_file) 65 | # print(dataframe) 66 | results = [] 67 | for sentence in dataframe.Sentence.tolist(): 68 | results.append(run_sentiment_analysis(sentence)) 69 | dataframe['Topic Prediction'] = results 70 | st.write(dataframe) -------------------------------------------------------------------------------- /example_notebooks/frontend/sentence_classification/streamlit.yml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Streamlit test" 3 | job_type: "basic" 4 | tasks: 5 | - name: "st text" 6 | type: "bash script" 7 | port: 8787 8 | bash_script_path: "frontend/sentence_classification/run.sh" -------------------------------------------------------------------------------- /example_notebooks/frontend/streamlit_app_example/image_recognition/README.MD: -------------------------------------------------------------------------------- 1 | This is a skeleton App for building a streamlit frontend for a Hyperplane served model. 2 | ## sample frontend 3 | 4 | 5 | 6 | ## How to use 7 | - Streamlit can be used when you need to quickly expose a frontend for your Hyperplane service 8 | - To get started, replace the dummy function in `sent_infer_request_in_cluster` in skexample.py with your real request function 9 | - Copy the image_recognition folder to your github repo (and change it to the name that makes sense to your App) 10 | - Go to the service tab on the Hyperplane dashboard and start a service with the `streamlit_pipelin.yaml' 11 | - Choose the service URL prefix for your Frontend App, this will be the URL of your frontend. Assume it's `my_awesome_frontend` 12 | - After you hit the create button of the service and the service is up, you can then go to `https://yourdomain.hyperplane.dev/my_awesome_frontend` to see the frontend 13 | - Streamlit has python APIs to build frontend components such as boxes, buttons, radio inputs and so on. [Read more about Streamlit](https://docs.streamlit.io/) 14 | - Full documentation for streamlit please see the steamlit Documentation page. 15 | -------------------------------------------------------------------------------- /example_notebooks/frontend/streamlit_app_example/image_recognition/cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/frontend/streamlit_app_example/image_recognition/cat.png -------------------------------------------------------------------------------- /example_notebooks/frontend/streamlit_app_example/image_recognition/example_frontend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/frontend/streamlit_app_example/image_recognition/example_frontend.png -------------------------------------------------------------------------------- /example_notebooks/frontend/streamlit_app_example/image_recognition/run.sh: -------------------------------------------------------------------------------- 1 | PROJECT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 2 | 3 | cd "$PROJECT_DIR" 4 | 5 | apt-get update 6 | apt-get -y install libgl1 7 | apt-get install -yq libgtk2.0-dev 8 | 9 | pip install streamlit fastapi opencv-python pillow uvicorn gcsfs 10 | pip install --force-reinstall --no-deps bokeh==2.4.1 11 | 12 | streamlit run slexample.py --server.port 8787 --browser.serverAddress localhost -------------------------------------------------------------------------------- /example_notebooks/frontend/streamlit_app_example/image_recognition/slexample.py: -------------------------------------------------------------------------------- 1 | # frontend/main.py 2 | 3 | import requests 4 | import streamlit as st 5 | from PIL import Image 6 | import json 7 | import numpy as np 8 | import pandas as pd 9 | import base64 10 | import argparse 11 | import time 12 | import random 13 | 14 | 15 | def sent_infer_request_in_cluster(img): 16 | ## send request to a model inference endpoint that's served on hyperplane 17 | ## As the services are all internally authenticated, no need to pass JWT tokens 18 | endpoint = "http://image_recognition-inference-endpoint:8787/infer" 19 | data = { 20 | "image": img 21 | } 22 | headers = { 23 | "Content-Type": "application/json", 24 | } 25 | 26 | # result = requests.post(endpoint, data = json.dumps(data), headers=headers).json() 27 | 28 | ## for demo purpose we are going to use a random result 29 | try: 30 | result = {"category": random.choice(["cat", "cat", "cat"])} 31 | print(result) 32 | except: 33 | result = {} 34 | 35 | return result 36 | 37 | 38 | ## 39 | ## Streamlit UI components 40 | ## 41 | 42 | 43 | st.set_option("deprecation.showfileUploaderEncoding", False) 44 | 45 | # defines an h1 header 46 | st.title("Image Recognition Cats Dogs Fruits") 47 | 48 | # displays a file uploader widget 49 | image_upload = st.file_uploader("Choose an image") 50 | 51 | # displays a file uploader widget 52 | image_type = st.selectbox("Choose an image type", ['fruit', 'animal', 'other']) 53 | 54 | # displays a button 55 | if st.button("Get annotation"): 56 | if image_upload is not None: 57 | # try: 58 | image_raw = Image.open(image_upload) 59 | image = np.array(image_raw)[:, :, ::-1] 60 | 61 | result = sent_infer_request_in_cluster(image) 62 | st.json(result) 63 | # except: 64 | # st.text("Could not process image") 65 | 66 | 67 | image = Image.open('cat.png') 68 | 69 | st.image(image, caption='Cat') 70 | 71 | -------------------------------------------------------------------------------- /example_notebooks/frontend/streamlit_app_example/image_recognition/streamlit_pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Streamlit test" 3 | job_type: "basic" 4 | tasks: 5 | - name: "streamlit image example" 6 | type: "bash script" 7 | port: 8787 8 | bash_script_path: "example_notebooks/frontend/streamlit_app_example/image_recognition/run.sh" -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# So easy, *voilà*!\n", 7 | "\n", 8 | "In this example notebook, we demonstrate how Voilà can render Jupyter notebooks with interactions requiring a roundtrip to the kernel." 9 | ], 10 | "metadata": {} 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "source": [ 15 | "## Jupyter Widgets" 16 | ], 17 | "metadata": {} 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "source": [ 23 | "import ipywidgets as widgets\n", 24 | "\n", 25 | "slider = widgets.FloatSlider(description='$x$')\n", 26 | "text = widgets.FloatText(disabled=True, description='$x^2$')\n", 27 | "\n", 28 | "def compute(*ignore):\n", 29 | " text.value = str(slider.value ** 2)\n", 30 | "\n", 31 | "slider.observe(compute, 'value')\n", 32 | "\n", 33 | "slider.value = 4\n", 34 | "\n", 35 | "widgets.VBox([slider, text])" 36 | ], 37 | "outputs": [], 38 | "metadata": {} 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "source": [ 43 | "## Basic outputs of code cells" 44 | ], 45 | "metadata": {} 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "source": [ 51 | "import pandas as pd\n", 52 | "\n", 53 | "iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')\n", 54 | "iris" 55 | ], 56 | "outputs": [], 57 | "metadata": {} 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "source": [], 63 | "outputs": [], 64 | "metadata": {} 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": "python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.8.5" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 4 88 | } -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/bqplot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# So easy, *voilà*!\n", 8 | "\n", 9 | "In this example notebook, we demonstrate how Voilà can render custom Jupyter widgets such as [bqplot](https://github.com/bloomberg/bqplot). " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import warnings\n", 19 | "warnings.filterwarnings('ignore')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np\n", 29 | "from bqplot import pyplot as plt\n", 30 | "\n", 31 | "plt.figure(1, title='Line Chart')\n", 32 | "np.random.seed(0)\n", 33 | "n = 200\n", 34 | "x = np.linspace(0.0, 10.0, n)\n", 35 | "y = np.cumsum(np.random.randn(n))\n", 36 | "plt.plot(x, y)\n", 37 | "plt.show()" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 3 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython3", 57 | "version": "3.7.3" 58 | } 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 2 62 | } 63 | -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/dashboard.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This demo uses Voilà to render a notebook to a custom HTML page using gridstack.js for the layout of each output. In the cell metadata you can change the default cell with and height (in grid units between 1 and 12) by specifying.\n", 8 | " * `grid_row`\n", 9 | " * `grid_columns`" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "n = 200\n", 20 | "\n", 21 | "x = np.linspace(0.0, 10.0, n)\n", 22 | "y = np.cumsum(np.random.randn(n)*10).astype(int)\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import ipywidgets as widgets" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "label_selected = widgets.Label(value=\"Selected: 0\")\n", 41 | "label_selected" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "grid_columns": 8, 49 | "grid_rows": 4 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "import numpy as np\n", 54 | "from bqplot import pyplot as plt\n", 55 | "import bqplot\n", 56 | "\n", 57 | "fig = plt.figure( title='Histogram')\n", 58 | "np.random.seed(0)\n", 59 | "hist = plt.hist(y, bins=25)\n", 60 | "hist.scales['sample'].min = float(y.min())\n", 61 | "hist.scales['sample'].max = float(y.max())\n", 62 | "display(fig)\n", 63 | "fig.layout.width = 'auto'\n", 64 | "fig.layout.height = 'auto'\n", 65 | "fig.layout.min_height = '300px' # so it shows nicely in the notebook\n", 66 | "fig.layout.flex = '1'" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "grid_columns": 12, 74 | "grid_rows": 6 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "import numpy as np\n", 79 | "from bqplot import pyplot as plt\n", 80 | "import bqplot\n", 81 | "\n", 82 | "fig = plt.figure( title='Line Chart')\n", 83 | "np.random.seed(0)\n", 84 | "n = 200\n", 85 | "p = plt.plot(x, y)\n", 86 | "fig" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "fig.layout.width = 'auto'\n", 96 | "fig.layout.height = 'auto'\n", 97 | "fig.layout.min_height = '300px' # so it shows nicely in the notebook\n", 98 | "fig.layout.flex = '1'" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "brushintsel = bqplot.interacts.BrushIntervalSelector(scale=p.scales['x'])" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def update_range(*args):\n", 117 | " label_selected.value = \"Selected range {}\".format(brushintsel.selected)\n", 118 | " mask = (x > brushintsel.selected[0]) & (x < brushintsel.selected[1])\n", 119 | " hist.sample = y[mask]\n", 120 | " \n", 121 | "brushintsel.observe(update_range, 'selected')\n", 122 | "fig.interaction = brushintsel" 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "celltoolbar": "Edit Metadata", 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.6.4" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 2 148 | } 149 | -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/gridspecLayout.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Which multiplication table do you want to learn?\n", 8 | "\n", 9 | "In this example notebook we demonstrate how Voilà can render different Jupyter widgets using [GridspecLayout](https://ipywidgets.readthedocs.io/en/latest/examples/Layout%20Templates.html#Grid-layout)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "scrolled": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from ipywidgets import GridspecLayout, Button, BoundedIntText, Valid, Layout, Dropdown\n", 21 | "\n", 22 | "def create_expanded_button(description, button_style):\n", 23 | " return Button(description=description, button_style=button_style, layout=Layout(height='auto', width='auto'))\n", 24 | " \n", 25 | "rows = 11\n", 26 | "columns = 6\n", 27 | "\n", 28 | "gs = GridspecLayout(rows, columns)\n", 29 | "\n", 30 | "def on_result_change(change):\n", 31 | " row = int(change[\"owner\"].layout.grid_row)\n", 32 | " gs[row, 5].value = gs[0, 0].value * row == change[\"new\"]\n", 33 | " \n", 34 | "def on_multipler_change(change):\n", 35 | " for i in range(1, rows):\n", 36 | " gs[i, 0].description = str(change[\"new\"])\n", 37 | " gs[i, 4].max = change[\"new\"] * 10\n", 38 | " gs[i, 4].value = 1\n", 39 | " gs[i, 4].step = change[\"new\"]\n", 40 | " gs[i, 5].value = False\n", 41 | "\n", 42 | "gs[0, 0] = Dropdown(\n", 43 | " options=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n", 44 | " value=2,\n", 45 | ")\n", 46 | "gs[0, 0].observe(on_multipler_change, names=\"value\")\n", 47 | "multiplier = gs[0, 0].value\n", 48 | "\n", 49 | "for i in range(1, rows):\n", 50 | " gs[i, 0] = create_expanded_button(str(multiplier), \"\")\n", 51 | " gs[i, 1] = create_expanded_button(\"*\", \"\")\n", 52 | " gs[i, 2] = create_expanded_button(str(i), \"info\")\n", 53 | " gs[i, 3] = create_expanded_button(\"=\", \"\")\n", 54 | "\n", 55 | " gs[i, 4] = BoundedIntText(\n", 56 | " min=0,\n", 57 | " max=multiplier * 10,\n", 58 | " layout=Layout(grid_row=str(i)),\n", 59 | " value=1,\n", 60 | " step=multiplier,\n", 61 | " disabled=False\n", 62 | " )\n", 63 | "\n", 64 | " gs[i, 5] = Valid(\n", 65 | " value=False,\n", 66 | " description='Valid!',\n", 67 | " )\n", 68 | "\n", 69 | " gs[i, 4].observe(on_result_change, names='value')\n", 70 | "\n", 71 | "gs" 72 | ] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.7.3" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/interactive.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# So easy, *voilà*!\n", 8 | "\n", 9 | "In this example notebook, we demonstrate how Voilà can render notebooks making use of ipywidget's `@interact`." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from ipywidgets import HBox, VBox, IntSlider, interactive_output\n", 19 | "from IPython.display import display\n", 20 | "\n", 21 | "a = IntSlider()\n", 22 | "b = IntSlider()\n", 23 | "\n", 24 | "def f(a, b):\n", 25 | " print(\"{} * {} = {}\".format(a, b, a * b))\n", 26 | "\n", 27 | "out = interactive_output(f, { \"a\": a, \"b\": b })\n", 28 | "\n", 29 | "display(HBox([VBox([a, b]), out]))" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.7.3" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 2 61 | } 62 | -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/ipympl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# So easy, *voilà*!\n", 8 | "\n", 9 | "In this example notebook, we demonstrate how Voilà can render custom interactive matplotlib figures using the [ipympl](https://github.com/matplotlib/ipympl) widget." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib widget\n", 19 | "import ipympl\n", 20 | "\n", 21 | "import numpy as np\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "\n", 24 | "x = np.linspace(0, 2 * np.pi, 500)\n", 25 | "y1 = np.sin(x)\n", 26 | "y2 = np.sin(3 * x)\n", 27 | "\n", 28 | "fig, ax = plt.subplots()\n", 29 | "ax.fill(x, y1, 'b', x, y2, 'r', alpha=0.3)\n", 30 | "plt.show()" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import numpy as np\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "\n", 42 | "plt.style.use('ggplot')\n", 43 | "\n", 44 | "fig, axes = plt.subplots(ncols=2, nrows=2)\n", 45 | "ax1, ax2, ax3, ax4 = axes.ravel()\n", 46 | "\n", 47 | "# scatter plot (Note: `plt.scatter` doesn't use default colors)\n", 48 | "x, y = np.random.normal(size=(2, 200))\n", 49 | "ax1.plot(x, y, 'o')\n", 50 | "\n", 51 | "# sinusoidal lines with colors from default color cycle\n", 52 | "L = 2 * np.pi\n", 53 | "x = np.linspace(0, L)\n", 54 | "ncolors = len(plt.rcParams['axes.prop_cycle'])\n", 55 | "shift = np.linspace(0, L, ncolors, endpoint=False)\n", 56 | "for s in shift:\n", 57 | " ax2.plot(x, np.sin(x + s), '-')\n", 58 | "ax2.margins(0)\n", 59 | "\n", 60 | "# bar graphs\n", 61 | "x = np.arange(5)\n", 62 | "y1, y2 = np.random.randint(1, 25, size=(2, 5))\n", 63 | "width = 0.25\n", 64 | "ax3.bar(x, y1, width)\n", 65 | "ax3.bar(x + width, y2, width,\n", 66 | " color=list(plt.rcParams['axes.prop_cycle'])[2]['color'])\n", 67 | "ax3.set_xticks(x + width)\n", 68 | "ax3.set_xticklabels(['a', 'b', 'c', 'd', 'e'])\n", 69 | "\n", 70 | "# circles with colors from default color cycle\n", 71 | "for i, color in enumerate(plt.rcParams['axes.prop_cycle']):\n", 72 | " xy = np.random.normal(size=2)\n", 73 | " ax4.add_patch(plt.Circle(xy, radius=0.3, color=color['color']))\n", 74 | "\n", 75 | "ax4.axis('equal')\n", 76 | "ax4.margins(0)\n", 77 | "\n", 78 | "plt.show()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "Python 3", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 | "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "3.7.3" 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 2 110 | } 111 | -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/ipyvolume.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# So easy, *voilà*!\n", 8 | "\n", 9 | "In this example notebook, we demonstrate how Voilà can render custom Jupyter widgets such as [ipyvolume](https://github.com/maartenbreddels/ipyvolume). " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import ipyvolume as ipv\n", 19 | "ipv.examples.example_ylm();" 20 | ] 21 | } 22 | ], 23 | "metadata": { 24 | "kernelspec": { 25 | "display_name": "Python 3", 26 | "language": "python", 27 | "name": "python3" 28 | }, 29 | "language_info": { 30 | "codemirror_mode": { 31 | "name": "ipython", 32 | "version": 3 33 | }, 34 | "file_extension": ".py", 35 | "mimetype": "text/x-python", 36 | "name": "python", 37 | "nbconvert_exporter": "python", 38 | "pygments_lexer": "ipython3", 39 | "version": "3.7.3" 40 | } 41 | }, 42 | "nbformat": 4, 43 | "nbformat_minor": 2 44 | } 45 | -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/multiple_widgets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "ebaa184b-765f-4261-9c5c-285bbde8fad5", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import ipywidgets as widgets\n", 11 | "def generate(n):\n", 12 | " widget = []\n", 13 | " for i in range(n):\n", 14 | " inner = []\n", 15 | " for j in range(n):\n", 16 | " inner.append(widgets.Button(description=f'{i*n+j+1}'))\n", 17 | " widget.append(widgets.HBox(inner))\n", 18 | "\n", 19 | " return widgets.VBox(widget)\n", 20 | "generate(20)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "8b59589a-7c71-4b1a-896a-b1139cbcf70f", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "Python 3 (ipykernel)", 35 | "language": "python", 36 | "name": "python3" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.9.6" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 5 53 | } 54 | -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "voila demo" 3 | job_type: "experimental" 4 | tasks: 5 | - name: "voila serve" 6 | type: "bash script" 7 | port: 8787 8 | bash_script_path: "example_notebooks/frontend/voila/multiple_notebooks/run.sh" -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/reveal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "slideshow": { 8 | "slide_type": "slide" 9 | } 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "print('hi')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import ipywidgets as widgets\n", 23 | "slider = widgets.FloatSlider(description='x')\n", 24 | "text = widgets.FloatText(disabled=True, description='$x^2$')\n", 25 | "text.disabled\n", 26 | "def compute(*ignore):\n", 27 | " text.value = str(slider.value**2)\n", 28 | "slider.observe(compute, 'value')\n", 29 | "slider.value = 14\n", 30 | "widgets.VBox([slider, text])" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "slideshow": { 38 | "slide_type": "slide" 39 | } 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "print('voila')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "slideshow": { 51 | "slide_type": "subslide" 52 | } 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "1+2" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [] 65 | } 66 | ], 67 | "metadata": { 68 | "celltoolbar": "Slideshow", 69 | "kernelspec": { 70 | "display_name": "Python 3 (ipykernel)", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.8.10" 85 | }, 86 | "voila": { 87 | "template": "reveal" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 4 92 | } 93 | -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/multiple_notebooks/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # set -euo pipefail 3 | 4 | PROJECT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 5 | cd "$PROJECT_DIR" 6 | voila --port 8787 --no-browser --Voila.ip 0.0.0.0 -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "voila demo" 3 | job_type: "experimental" 4 | tasks: 5 | - name: "voila serve" 6 | type: "bash script" 7 | port: 8787 8 | bash_script_path: "example_notebooks/frontend/voila/run.sh" -------------------------------------------------------------------------------- /example_notebooks/frontend/voila/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # set -euo pipefail 3 | 4 | PROJECT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 5 | cd "$PROJECT_DIR" 6 | voila --port 8787 --no-browser --Voila.ip 0.0.0.0 voila_demo.ipynb 7 | -------------------------------------------------------------------------------- /example_notebooks/inference/batch_inference/requirements.txt: -------------------------------------------------------------------------------- 1 | s3fs==2021.08.0 2 | pyarrow==5.0.0 3 | transformers==4.11.3 4 | tensorflow==2.6.0 -------------------------------------------------------------------------------- /example_notebooks/inference/batch_inference/scheduler_setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | startup script to be preloaded to the workers 3 | 4 | """ 5 | import pip._internal as pip 6 | def install(package): 7 | pip.main(['install', package]) 8 | 9 | try: 10 | import s3fs 11 | except ImportError: 12 | install('s3fs') 13 | import s3fs 14 | 15 | import gcsfs 16 | import os 17 | import pickle 18 | from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification 19 | 20 | 21 | 22 | ## start up script 23 | 24 | def download_from_cloud(local_file_name, remote_file_name): 25 | """ 26 | Download a file to gcp or s3. 27 | """ 28 | import os 29 | import s3fs 30 | import gcsfs 31 | cloud_name = remote_file_name.split('://')[0] 32 | if cloud_name =='gs': 33 | fs = gcsfs.GCSFileSystem(project=os.environ['GCP_PROJECT']) 34 | elif cloud_name =='s3': 35 | fs = s3fs.S3FileSystem() 36 | else: 37 | raise NameError(f'cloud name {cloud_name} unknown') 38 | try: 39 | print(f'download from {remote_file_name} to {local_file_name}') 40 | fs.get(remote_file_name, local_file_name, recursive=True) 41 | print("done downloading!") 42 | except Exception as exp: 43 | print(f"download failed: {exp}") 44 | 45 | return 46 | 47 | 48 | def load_models(model_path): 49 | """ 50 | Load the model from the unzipped local model path 51 | """ 52 | 53 | model = TFDistilBertForSequenceClassification.from_pretrained(f'{model_path}/clf') 54 | model_name, max_len = pickle.load(open(f'{model_path}/info.pkl', 'rb')) 55 | loaded_models['model'] = (model, model_name, max_len) 56 | 57 | return loaded_models 58 | 59 | 60 | def load_model_from_pretrained(model_name): 61 | """ 62 | Load the model from the unzipped local model path 63 | """ 64 | 65 | model = TFDistilBertForSequenceClassification.from_pretrained(model_name) 66 | max_len = 20 67 | loaded_models['model'] = (model, model_name, max_len) 68 | 69 | return loaded_models 70 | 71 | loaded_models = {} 72 | model_name = "distilbert-base-uncased" 73 | loaded_models = load_model_from_pretrained(model_name) 74 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/a_simple_pipeline/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "data prep pipeline" 3 | job_type: "basic" 4 | tasks: 5 | - name: "Dask groupby data" 6 | type: "jupyter notebook" 7 | notebook_path: "example_notebooks/pipelines/a_simple_pipeline/dask_group_sort.ipynb" 8 | notebook_output_path: "dask_group_sort_output.ipynb" -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/para_tasks.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Example pipeline that runs all tasks in parallel" 3 | tasks: 4 | - name: "step1" 5 | type: "jupyter notebook" 6 | notebook_path: "fake_pipeline/step1.ipynb" 7 | notebook_output_path: "step1_output.ipynb" 8 | depends_on: [] 9 | - name: "step2" 10 | type: "jupyter notebook" 11 | notebook_path: "fake_pipeline/step2.ipynb" 12 | notebook_output_path: "step2_output.ipynb" 13 | depends_on: [] 14 | - name: "step3" 15 | type: "jupyter notebook" 16 | notebook_path: "fake_pipeline/step3.ipynb" 17 | notebook_output_path: "step3_output.ipynb" 18 | depends_on: [] 19 | - name: "step4" 20 | type: "jupyter notebook" 21 | notebook_path: "fake_pipeline/step4.ipynb" 22 | notebook_output_path: "step4_output.ipynb" 23 | depends_on: [] 24 | - name: "step5" 25 | type: "jupyter notebook" 26 | notebook_path: "fake_pipeline/step5.ipynb" 27 | notebook_output_path: "step5_output.ipynb" 28 | depends_on: [] -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/para_test.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Example pipeline steps" 3 | tasks: 4 | - name: "step1" 5 | type: "jupyter notebook" 6 | notebook_path: "fake_pipeline/step1.ipynb" 7 | notebook_output_path: "step1_output.ipynb" 8 | - name: "step2" 9 | type: "jupyter notebook" 10 | notebook_path: "fake_pipeline/step2.ipynb" 11 | notebook_output_path: "step2_output.ipynb" 12 | depends_on: ["step1"] 13 | - name: "step3" 14 | type: "jupyter notebook" 15 | notebook_path: "fake_pipeline/step3.ipynb" 16 | notebook_output_path: "step3_output.ipynb" 17 | depends_on: ["step1"] 18 | - name: "step4" 19 | type: "jupyter notebook" 20 | notebook_path: "fake_pipeline/step4.ipynb" 21 | notebook_output_path: "step4_output.ipynb" 22 | depends_on: ["step1"] -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/para_test_close.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Example pipeline steps" 3 | tasks: 4 | - name: "step1" 5 | type: "jupyter notebook" 6 | notebook_path: "fake_pipeline/step1.ipynb" 7 | notebook_output_path: "step1_output.ipynb" 8 | - name: "step2" 9 | type: "jupyter notebook" 10 | notebook_path: "fake_pipeline/step2.ipynb" 11 | notebook_output_path: "step2_output.ipynb" 12 | depends_on: ["step1"] 13 | - name: "step3" 14 | type: "jupyter notebook" 15 | notebook_path: "fake_pipeline/step3.ipynb" 16 | notebook_output_path: "step3_output.ipynb" 17 | depends_on: ["step1"] 18 | - name: "step4" 19 | type: "jupyter notebook" 20 | notebook_path: "fake_pipeline/step4.ipynb" 21 | notebook_output_path: "step4_output.ipynb" 22 | depends_on: ["step1"] 23 | - name: "step5" 24 | type: "jupyter notebook" 25 | notebook_path: "fake_pipeline/step5.ipynb" 26 | notebook_output_path: "step5_output.ipynb" 27 | depends_on: ["step2", "step3", "step4"] -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/seq_test.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Example pipeline steps" 3 | tasks: 4 | - name: "step1" 5 | type: "jupyter notebook" 6 | notebook_path: "fake_pipeline/step1.ipynb" 7 | - name: "step3" 8 | type: "jupyter notebook" 9 | notebook_path: "fake_pipeline/step3.ipynb" 10 | - name: "step4" 11 | type: "jupyter notebook" 12 | notebook_path: "fake_pipeline/step6.ipynb" 13 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/start_para_end_tasks.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Example pipeline steps" 3 | tasks: 4 | - name: "step1" 5 | type: "jupyter notebook" 6 | notebook_path: "fake_pipeline/step1.ipynb" 7 | notebook_output_path: "step1_output.ipynb" 8 | - name: "step2" 9 | type: "jupyter notebook" 10 | notebook_path: "fake_pipeline/step2.ipynb" 11 | notebook_output_path: "step2_output.ipynb" 12 | depends_on: ["step1"] 13 | - name: "step3" 14 | type: "jupyter notebook" 15 | notebook_path: "fake_pipeline/step3.ipynb" 16 | notebook_output_path: "step3_output.ipynb" 17 | depends_on: ["step1"] 18 | - name: "step4" 19 | type: "jupyter notebook" 20 | notebook_path: "fake_pipeline/step4.ipynb" 21 | notebook_output_path: "step4_output.ipynb" 22 | depends_on: ["step1"] 23 | - name: "step5" 24 | type: "jupyter notebook" 25 | notebook_path: "fake_pipeline/step5.ipynb" 26 | notebook_output_path: "step5_output.ipynb" 27 | depends_on: ["step2", "step3", "step4"] -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/step1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d4f6780a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(10)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "cb56abb9", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/step2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "73b986b7", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(40)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "2838dc6a", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/step3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f2214387", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(30)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "27948a04", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/step4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "c0d5fe5a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(20)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "4ac782ae", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/step5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "a9827cd2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(5)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "669fcdc7", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/advanced_dag_pipeline/step6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d4f6780a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(10)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "cb56abb9", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/crypto/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "crypto trades monitoring App" 3 | tasks: 4 | - name: "watch_trade" 5 | type: "jupyter notebook" 6 | notebook_path: "example_notebooks/pipelines/crypto/watch_ob_multi.ipynb" 7 | notebook_output_path: "output.ipynb" 8 | depends_on: [] -------------------------------------------------------------------------------- /example_notebooks/pipelines/crypto/watch_ob_multi.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "deff5e3c-7789-4a7e-84ad-3e6da2279aaf", 6 | "metadata": {}, 7 | "source": [ 8 | "## Watch OrderBook of multiple ticker pair and multiple exchange asynchronous \n", 9 | "- this code runs on the `crypto` image \n", 10 | "- you can change the `slack_channel` to a your existing slack channel to get alert to the job\n", 11 | "- you can use the `pipeline.yaml` to spin up a `Hyperplane service` to watch the orderbook indefinitly " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "id": "26a17fa6-9e70-47c7-a9fc-71528cc7be54", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import warnings\n", 22 | "warnings.filterwarnings(\"ignore\")\n", 23 | "\n", 24 | "import json\n", 25 | "import os\n", 26 | "import ccxtpro\n", 27 | "import ccxt\n", 28 | "import asyncio\n", 29 | "import pandas as pd\n", 30 | "import numpy as np\n", 31 | "import time\n", 32 | "from datetime import datetime, timedelta\n", 33 | "import traceback\n", 34 | "\n", 35 | "import nest_asyncio\n", 36 | "nest_asyncio.apply()\n", 37 | "\n", 38 | "## initialize a slack notification for trades or other alerts\n", 39 | "from hyperplane import notebook_common as nc\n", 40 | "sh = nc.SlackHelper()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "id": "afab5fb3-1676-47d5-accd-be40017e3989", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "id": "1ef92959-ebd5-4c2d-bd1e-713c44e6d514", 55 | "metadata": { 56 | "tags": [ 57 | "parameters" 58 | ] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "## parameters can be injected when spijn up the jobs\n", 63 | "\n", 64 | "slack_channel = '#orderbook'\n", 65 | "\n", 66 | "exchanges = {\n", 67 | " 'kucoin': ['BTC/USDT', 'ETH/BTC', 'ETH/USDT'],\n", 68 | " 'binance': ['BTC/USDT', 'ETH/USDT','XRP/USDT']\n", 69 | " }" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "030c37f4-5c00-4bc8-84c1-9924e6de6c37", 75 | "metadata": {}, 76 | "source": [ 77 | "## Watch multiple orderbooks \n", 78 | "- available apis : watch_trades, watch_ticker, watch_orderbook, watch_market, watchOHLCV" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 22, 84 | "id": "28052eb3-96f5-4784-9ec7-4cc6d3ddce9c", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "orderbooks = {}\n", 89 | "\n", 90 | "def handle_all_orderbooks(orderbooks):\n", 91 | " \"\"\"your function to use the OrderBooks\"\"\"\n", 92 | "# print('We have the following orderbooks:')\n", 93 | " for exchange_id, orderbooks_by_symbol in orderbooks.items():\n", 94 | " for symbol in orderbooks_by_symbol.keys():\n", 95 | " orderbook = orderbooks_by_symbol[symbol]\n", 96 | " print(ccxtpro.Exchange.iso8601(orderbook['timestamp']), exchange_id, symbol, \n", 97 | " orderbook['asks'][0], orderbook['bids'][0])\n", 98 | " # print(orderbook)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 23, 104 | "id": "6a007133-1f83-465c-bb5b-a8131d946990", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "async def loop(asyncio_loop, exchange_id, symbol):\n", 109 | " exchange = getattr(ccxtpro, exchange_id)({\n", 110 | " 'enableRateLimit': True,\n", 111 | " 'asyncio_loop': asyncio_loop,\n", 112 | " })\n", 113 | " \n", 114 | " output_parent_path = f\"gs://{os.environ['HYPERPLANE_GCP_BUCKET']}/data/crypto_trading/{exchange_id}\"\n", 115 | " logs = []\n", 116 | " \n", 117 | " starttime = time.time()\n", 118 | " starttime_h = datetime.fromtimestamp(starttime).strftime(\"%Y-%m-%d %I:%M:%S\")\n", 119 | " sh.post_message(json.dumps({'starttime_h': \"test\"}), channel=slack_channel)\n", 120 | "\n", 121 | " while True:\n", 122 | " try:\n", 123 | " orderbook = await exchange.watch_order_book(symbol, limit = 10)\n", 124 | " orderbooks[exchange.id] = orderbooks.get(exchange.id, {})\n", 125 | " orderbooks[exchange.id][symbol] = orderbook\n", 126 | " print('===========================================================')\n", 127 | " # print(type(orderbooks), orderbooks.keys())\n", 128 | " ## code for profit calculation and order \n", 129 | "\n", 130 | " sh.post_message(json.dumps({'orderbooks': orderbooks}), channel=slack_channel)\n", 131 | " \n", 132 | " handle_all_orderbooks(orderbooks)\n", 133 | " \n", 134 | " except Exception as e:\n", 135 | " print(str(e))\n", 136 | " # raise e # uncomment to break all loops in case of an error in any one of them\n", 137 | " # break # uncomment to break just this one loop if it fails\n", 138 | " await exchange.close()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 24, 144 | "id": "f516d1da-163a-48e0-b6bc-83018d36b106", 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "symbols ['BTC/USDT', 'ETH/USDT']\n", 152 | "===========================================================\n", 153 | "None binance ETH/USDT [2587.04, 9.3855] [2587.03, 0.0386]\n", 154 | "===========================================================\n", 155 | "None binance ETH/USDT [2587.04, 9.3855] [2587.03, 0.0386]\n", 156 | "None binance BTC/USDT [37870.02, 1.86727] [37870.01, 1e-05]\n", 157 | "===========================================================\n", 158 | "2022-01-29T20:18:27.411Z binance ETH/USDT [2587.04, 9.3855] [2587.03, 0.0386]\n", 159 | "None binance BTC/USDT [37870.02, 1.86727] [37870.01, 1e-05]\n", 160 | "===========================================================\n", 161 | "2022-01-29T20:18:27.411Z binance ETH/USDT [2587.04, 9.3855] [2587.03, 0.0386]\n", 162 | "2022-01-29T20:18:27.411Z binance BTC/USDT [37870.02, 1.09138] [37870.01, 0.21132]\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "async def main(asyncio_loop):\n", 168 | " symbols = [f\"{t}/{base_ticker}\" for t in trade_ticker]\n", 169 | " print('symbols', symbols)\n", 170 | " exchanges = {\n", 171 | " exchange: symbols,\n", 172 | " }\n", 173 | " loops = [loop(asyncio_loop, exchange_id, symbol) for exchange_id, symbols in exchanges.items() for symbol in symbols]\n", 174 | " await asyncio.gather(*loops)\n", 175 | "\n", 176 | "\n", 177 | "if __name__ == '__main__':\n", 178 | " asyncio_loop = asyncio.get_event_loop()\n", 179 | " asyncio_loop.run_until_complete(main(asyncio_loop))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "id": "0816123c-8d23-4cbf-a55a-58d65d54ef9b", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.8.10" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 5 212 | } 213 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/pipeline_with_requirements/READ.md: -------------------------------------------------------------------------------- 1 | ## This example shows a pipeline with requirements.txt file 2 | Simply add the following line to the YAML file, it'll be installed before running the tasks. 3 | The path to the requirements file and all tasks files are with respect to the root of the repository. 4 | ```Python 5 | requirements: "pipelines/pipeline_with_requirements/requirements.txt" 6 | ``` 7 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/pipeline_with_requirements/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Example pipeline steps" 3 | requirements: "pipelines/pipeline_with_requirements/requirements.txt" 4 | tasks: 5 | - name: "step1" 6 | type: "jupyter notebook" 7 | notebook_path: "pipelines/pipeline_with_requirements/step1.ipynb" 8 | notebook_output_path: "step1_output.ipynb" 9 | - name: "step2" 10 | type: "jupyter notebook" 11 | notebook_path: "pipelines/pipeline_with_requirements/step2.ipynb" 12 | notebook_output_path: "step2_output.ipynb" 13 | - name: "step3" 14 | type: "jupyter notebook" 15 | notebook_path: "pipelines/pipeline_with_requirements/step3.ipynb" 16 | notebook_output_path: "step3_output.ipynb" 17 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/pipeline_with_requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | nc-time-axis==1.2.0 2 | geopandas==0.10.2 3 | rtree==0.9.7 4 | zarr==2.6.1 5 | xclim==0.22.0 6 | xarray-spatial==0.1.2 7 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/pipeline_with_requirements/step1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d4f6780a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(10)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "cb56abb9", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/pipeline_with_requirements/step2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "73b986b7", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(40)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "2838dc6a", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/pipeline_with_requirements/step3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f2214387", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import time\n", 11 | "time.sleep(30)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "27948a04", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "print('done')" 22 | ] 23 | } 24 | ], 25 | "metadata": { 26 | "kernelspec": { 27 | "display_name": "Python 3", 28 | "language": "python", 29 | "name": "python3" 30 | }, 31 | "language_info": { 32 | "codemirror_mode": { 33 | "name": "ipython", 34 | "version": 3 35 | }, 36 | "file_extension": ".py", 37 | "mimetype": "text/x-python", 38 | "name": "python", 39 | "nbconvert_exporter": "python", 40 | "pygments_lexer": "ipython3", 41 | "version": "3.8.10" 42 | } 43 | }, 44 | "nbformat": 4, 45 | "nbformat_minor": 5 46 | } 47 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/python_hello_world_pipeline/hello_world.py: -------------------------------------------------------------------------------- 1 | print('Hello, World!') 2 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/python_hello_world_pipeline/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Hello World with Python" 3 | tasks: 4 | - name: "Run Python Script" 5 | type: "vscode notebook" 6 | py_path: "example_notebooks/pipelines/python_hello_world_pipeline/hello_world.py" 7 | -------------------------------------------------------------------------------- /example_notebooks/pipelines/rapids_pipeline/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Getting Started Pipeline" 3 | job_type: "gpu" 4 | tasks: 5 | - name: "RAPIDs Notebook" 6 | type: "jupyter notebook" 7 | notebook_path: "example_notebooks/pipelines/rapids_pipeline/rapids_data_processing.ipynb" 8 | notebook_output_path: "rapids_data_processing_output.ipynb" -------------------------------------------------------------------------------- /example_notebooks/pipelines/rapids_pipeline/rapids_data_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e8facdaf", 6 | "metadata": {}, 7 | "source": [ 8 | "## An example of using RAPIDS to speed up pandas operations on Hyperplane\n", 9 | "- The task is to groupby and sorting about 3G of data on s3 bucket " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "id": "af2c97bf", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import warnings\n", 20 | "warnings.filterwarnings(\"ignore\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "8cb76ab5", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import os\n", 31 | "import dask\n", 32 | "from hyperplane import notebook_common as nc" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "id": "d7c4064b", 39 | "metadata": { 40 | "tags": [] 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "👉 Hyperplane: selecting worker node pool\n", 48 | "👉 Hyperplane: selecting scheduler node pool\n", 49 | "👉 Hyperplane: you can access your dask dashboard at https://jhub.ds.hyperplane.dev/hub/user-redirect/proxy/45601/status\n", 50 | "👉 Hyperplane: to get logs from all workers, do `cluster.get_logs()`\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "client, cluster = nc.initialize_cluster(\n", 56 | " nprocs=1,\n", 57 | " nthreads=8,\n", 58 | " ram_gb_per_proc=7,\n", 59 | " cores_per_worker=2,\n", 60 | " num_workers = 2,\n", 61 | " ngpus = 1,\n", 62 | " scheduler_deploy_mode=\"local\"\n", 63 | ")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "id": "1382c475", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "\n", 76 | "\n", 77 | "\n", 84 | "\n", 92 | "\n", 93 | "
\n", 78 | "

Client

\n", 79 | "\n", 83 | "
\n", 85 | "

Cluster

\n", 86 | "
    \n", 87 | "
  • Workers: 2
  • \n", 88 | "
  • Cores: 16
  • \n", 89 | "
  • Memory: 14.57 GiB
  • \n", 90 | "
\n", 91 | "
" 94 | ], 95 | "text/plain": [ 96 | "" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "client" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "id": "e309f622", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from dask.distributed import Client\n", 116 | "client = Client(cluster)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "id": "24425127", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "import dask_cudf" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "id": "3c138276", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "file_path = \"s3://dask-data/airline-data/*.csv\"" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 9, 142 | "id": "c221a12f", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "flight_df = dask_cudf.read_csv(file_path, assume_missing=True,\n", 147 | " usecols = [\"UniqueCarrier\",\"FlightNum\",\"Distance\"])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 10, 153 | "id": "1ee25618", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/html": [ 159 | "
\n", 160 | "\n", 173 | "\n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | "
UniqueCarrierFlightNumDistance
0PS1451.0447.0
1PS1451.0447.0
2PS1451.0447.0
3PS1451.0447.0
4PS1451.0447.0
\n", 215 | "
" 216 | ], 217 | "text/plain": [ 218 | " UniqueCarrier FlightNum Distance\n", 219 | "0 PS 1451.0 447.0\n", 220 | "1 PS 1451.0 447.0\n", 221 | "2 PS 1451.0 447.0\n", 222 | "3 PS 1451.0 447.0\n", 223 | "4 PS 1451.0 447.0" 224 | ] 225 | }, 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "flight_df.head()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 11, 238 | "id": "fac2898a", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "flight_df_opt = flight_df.groupby(by=[\"UniqueCarrier\",\"FlightNum\"]).Distance.mean()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 12, 248 | "id": "79d49a53", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "CPU times: user 2.08 s, sys: 223 ms, total: 2.3 s\n", 256 | "Wall time: 1min 21s\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "%%time\n", 262 | "flight_df_results = flight_df_opt.compute()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 13, 268 | "id": "e7a88d28", 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "UniqueCarrier FlightNum\n", 275 | "AS 994.0 586.166667\n", 276 | " 920.0 828.272727\n", 277 | "XE 4089.0 583.000000\n", 278 | "WN 3524.0 839.513333\n", 279 | "CO 874.0 588.002579\n", 280 | " ... \n", 281 | "PI 1809.0 195.542662\n", 282 | "NW 1912.0 354.528543\n", 283 | "MQ 3238.0 212.074221\n", 284 | "UA 2563.0 271.504673\n", 285 | "WN 25.0 298.849527\n", 286 | "Name: Distance, Length: 50003, dtype: float64" 287 | ] 288 | }, 289 | "execution_count": 13, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "flight_df_results" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "b3bcb2fa", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "cluster.close()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "id": "1ce927cb", 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [] 315 | } 316 | ], 317 | "metadata": { 318 | "kernelspec": { 319 | "display_name": "Python [conda env:root] *", 320 | "language": "python", 321 | "name": "conda-root-py" 322 | }, 323 | "language_info": { 324 | "codemirror_mode": { 325 | "name": "ipython", 326 | "version": 3 327 | }, 328 | "file_extension": ".py", 329 | "mimetype": "text/x-python", 330 | "name": "python", 331 | "nbconvert_exporter": "python", 332 | "pygments_lexer": "ipython3", 333 | "version": "3.8.10" 334 | } 335 | }, 336 | "nbformat": 4, 337 | "nbformat_minor": 5 338 | } 339 | -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/__init__.py -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### simple FastAPI App for inference with the trained AutoML model 3 | """ 4 | 5 | from fastapi import FastAPI, Request 6 | from fastapi.encoders import jsonable_encoder 7 | from pydantic import BaseModel 8 | import time 9 | import pandas as pd 10 | import numpy as np 11 | import json 12 | import io 13 | import os 14 | import sys 15 | from autogluon.tabular import TabularDataset, TabularPredictor 16 | from utils import upload_to_cloud, download_from_cloud 17 | 18 | 19 | def download_model(): 20 | try: 21 | local_file_path = 'models' 22 | remote_file_path =f"gs://shakdemo-hyperplane/data/environmental-sensor-data/{local_file_path}" 23 | download_from_cloud(local_file_path, remote_file_path) 24 | print('downloaded model successfully') 25 | except Exception as e: 26 | print(f'download failed with {e}') 27 | return 28 | 29 | 30 | def run_inference(data:dict) ->pd.DataFrame: 31 | df = pd.DataFrame.from_dict(data, orient = 'index').T 32 | data = TabularDataset(df) 33 | predictor = TabularPredictor.load('models') 34 | y_pred = predictor.predict(df) 35 | return y_pred 36 | 37 | 38 | global model 39 | model = download_model() 40 | 41 | class MyRequest(BaseModel): 42 | data: dict 43 | 44 | app = FastAPI() 45 | 46 | 47 | @app.get("/health-ready") 48 | def health_check(): 49 | return jsonable_encoder({"message": "Ready"}) 50 | 51 | @app.get("/") 52 | def root(): 53 | return {"hello":"world"} 54 | 55 | @app.post("/infer") 56 | async def infer(req: Request): 57 | if req.headers['Content-Type'] == 'application/json': 58 | i = MyRequest(** await req.json()) 59 | elif req.headers['Content-Type'] == 'multipart/form-data': 60 | i = MyRequest(** await req.form()) 61 | elif req.headers['Content-Type'] == 'application/x-www-form-urlencoded': 62 | i = MyRequest(** await req.form()) 63 | r = json.loads(i.json()) 64 | return { 65 | 'prediction': str(run_inference(data = r['data']).values[0]) 66 | } 67 | 68 | 69 | if __name__ == '__main__': 70 | jsondata = {} 71 | data = { 72 | 'ts': 1594512094.3859746, 73 | 'co': 0.0049559386483912, 74 | 'humidity': 51.0, 75 | 'light': 0.0, 76 | 'lpg': 0.0076508222705571, 77 | 'smoke': 0.0204112701224129, 78 | 'temp': 22.7 79 | } 80 | jsondata['data'] = data 81 | jsondata = json.dumps(jsondata) 82 | print(jsondata) 83 | starttime = time.time() 84 | r = json.loads(jsondata) 85 | output = run_inference(r['data']) 86 | print(output) 87 | print(f'used {time.time() - starttime} seconds') 88 | -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/__version__: -------------------------------------------------------------------------------- 1 | 0.5.2 -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/learner.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/learner.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/predictor.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/predictor.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/CatBoost/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/CatBoost/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/ExtraTreesEntr/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/ExtraTreesEntr/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/ExtraTreesGini/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/ExtraTreesGini/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/KNeighborsDist/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/KNeighborsDist/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/KNeighborsUnif/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/KNeighborsUnif/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/LightGBM/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/LightGBM/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/LightGBMLarge/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/LightGBMLarge/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/LightGBMXT/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/LightGBMXT/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/NeuralNetFastAI/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/NeuralNetFastAI/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/NeuralNetTorch/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/NeuralNetTorch/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/RandomForestEntr/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/RandomForestEntr/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/RandomForestGini/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/RandomForestGini/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/attr/XGBoost/y_pred_proba_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/attr/XGBoost/y_pred_proba_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/data/X.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/data/X.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/data/X_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/data/X_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/data/y.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/data/y.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/models/utils/data/y_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/IoT-detection/models/utils/data/y_val.pkl -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "IoT_pipeline" 3 | requirements: "IoT-detection/requirements.txt" 4 | tasks: 5 | - name: "inference" 6 | type: "jupyter notebook" 7 | notebook_path: "IoT-detection/inference.ipynb" 8 | -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/pipeline_service.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "run inference app" 3 | tasks: 4 | - name: "start App for IoT event prediction" 5 | type: "bash script" 6 | port: 8787 7 | bash_script_path: "IoT-detection/run.sh" -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/requirements.txt: -------------------------------------------------------------------------------- 1 | seaborn 2 | mxnet==1.8.0 3 | autogluon 4 | imblearn 5 | fastapi==0.70.0 6 | uvicorn==0.15.0 7 | boto3==1.17.106 -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | PROJECT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 3 | cd "$PROJECT_DIR" 4 | 5 | apt-get update 6 | pip install -r requirements.txt 7 | python -m uvicorn main:app --host 0.0.0.0 --port 8787 8 | -------------------------------------------------------------------------------- /example_notebooks/serving/IoT-detection/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | helper function 3 | 4 | """ 5 | def download_from_cloud(local_file_name, remote_file_name): 6 | """ 7 | Upload a file to gcs bucket or s3 bucket. 8 | """ 9 | import os 10 | 11 | cloud_name = remote_file_name.split('://')[0] 12 | if cloud_name =='gs': 13 | import gcsfs 14 | fs = gcsfs.GCSFileSystem(project=os.environ['GCP_PROJECT']) 15 | elif cloud_name =='s3': 16 | import s3fs 17 | fs = s3fs.S3FileSystem() 18 | else: 19 | raise NameError(f'cloud name {cloud_name} unknown') 20 | try: 21 | print(f'Downloading from {remote_file_name} to {local_file_name}') 22 | fs.get(remote_file_name, local_file_name) 23 | print("done downloading!") 24 | except Exception as exp: 25 | print(f"Download failed: {exp}") 26 | return 27 | 28 | def upload_to_cloud(local_file_name, remote_file_name): 29 | """ 30 | Upload a file to gcs bucket or s3 bucket. 31 | """ 32 | import os 33 | cloud_name = remote_file_name.split('://')[0] 34 | if cloud_name =='gs': 35 | import gcsfs 36 | fs = gcsfs.GCSFileSystem(project=os.environ['GCP_PROJECT']) 37 | elif cloud_name =='s3': 38 | import s3fs 39 | fs = s3fs.S3FileSystem() 40 | else: 41 | raise NameError(f'cloud name {cloud_name} unknown') 42 | try: 43 | print(f'Uploading from {local_file_name} to {remote_file_name}') 44 | fs.put(local_file_name, remote_file_name) 45 | print("done uploading!") 46 | except Exception as exp: 47 | print(f"Uploading failed: {exp}") 48 | 49 | return fs -------------------------------------------------------------------------------- /example_notebooks/serving/mlflow_servers/pipeline.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Prop up mlflow service" 3 | job_type: "basic" 4 | tasks: 5 | - name: "mlflow" 6 | type: "bash script" 7 | port: 8787 8 | bash_script_path: "example_notebooks/serving/mlflow_servers/run.sh" 9 | -------------------------------------------------------------------------------- /example_notebooks/serving/mlflow_servers/run.sh: -------------------------------------------------------------------------------- 1 | v=$DATABASE_URL_NO_PARAMS 2 | v2=${v::-7} 3 | 4 | 5 | if [$CLOUD_PROVIDER == "AWS"] 6 | then 7 | export MLFLOW_BUCKET=s3://$HYPERPLANE_AWS_BUCKET/user-mlflow 8 | fi 9 | 10 | if [$CLOUD_PROVIDER == "GCP"] 11 | then 12 | export MLFLOW_BUCKET=gs://$HYPERPLANE_GCP_BUCKET/user-mlflow 13 | fi 14 | 15 | mlflow server --backend-store-uri $v2/$HYPERPLANE_JOB_PARAMETER_USERDIR --default-artifact-root $MLFLOW_BUCKET/$HYPERPLANE_JOB_PARAMETER_USERDIR --host 0.0.0.0 --port 8787 -------------------------------------------------------------------------------- /example_notebooks/serving/triton/config_examples/xgb_config.pbtxt: -------------------------------------------------------------------------------- 1 | name: "fil" 2 | backend: "fil" 3 | max_batch_size: 8192 4 | input [ 5 | { 6 | name: "input__0" 7 | data_type: TYPE_FP32 8 | dims: [ 117 ] 9 | } 10 | ] 11 | output [ 12 | { 13 | name: "output__0" 14 | data_type: TYPE_FP32 15 | dims: [ 2 ] 16 | } 17 | ] 18 | instance_group [{ kind: KIND_CPU }] 19 | parameters [ 20 | { 21 | key: "model_type" 22 | value: { string_value: "xgboost" } 23 | }, 24 | { 25 | key: "predict_proba" 26 | value: { string_value: "true" } 27 | }, 28 | { 29 | key: "output_class" 30 | value: { string_value: "true" } 31 | }, 32 | { 33 | key: "threshold" 34 | value: { string_value: "0.5" } 35 | }, 36 | { 37 | key: "algo" 38 | value: { string_value: "ALGO_AUTO" } 39 | }, 40 | { 41 | key: "storage_type" 42 | value: { string_value: "AUTO" } 43 | }, 44 | { 45 | key: "blocks_per_sm" 46 | value: { string_value: "0" } 47 | }, 48 | { 49 | key: "threads_per_tree" 50 | value: { string_value: "1" } 51 | } 52 | ] 53 | 54 | dynamic_batching { 55 | max_queue_delay_microseconds: 100 56 | } -------------------------------------------------------------------------------- /example_notebooks/serving/triton/convert_pytorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3ef528ba", 6 | "metadata": {}, 7 | "source": [ 8 | "## How to trace/convert Transformer model into Triton acceptable models?\n", 9 | "- source https://github.com/sachinsharma9780/AI-Enterprise-Workshop-Building-ML-Pipelines/blob/main/" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "3d57e014", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "!pip3 install torch torchvision torchaudio --quiet" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "c24f512d", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import torch\n", 30 | "from torch.nn import functional as F" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "id": "e28cbf33", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stderr", 41 | "output_type": "stream", 42 | "text": [ 43 | "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n", 44 | "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 45 | "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 46 | "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n", 47 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification\n", 53 | "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", 54 | "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')\n", 55 | "\n", 56 | "sentence = \"Hello, my dog is cute!\"\n", 57 | "labels = \"happy\"\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "24ec7710", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "input_ids = tokenizer.encode(sentence, hypothesis, return_tensors = 'pt', max_length= 256, truncation = True, \n", 68 | " padding = \"max_length\")\n", 69 | "\n", 70 | "mask = input_ids !=1\n", 71 | "mask = mask.long()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "43d84a99", 77 | "metadata": {}, 78 | "source": [ 79 | "## Tracing PyTorch Model with torchscript\n", 80 | "- this is to create serializable and optimizable models, in equivalent to convert a tensorflow model with TensorRT" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 23, 86 | "id": "b7ec709f", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "class PyTorch_to_TorchScript(torch.nn.Module):\n", 91 | " def __init__(self):\n", 92 | " super(PyTorch_to_TorchScript, self).__init__()\n", 93 | " self.model = AutoModelForSequenceClassification.from_pretrained('joeddav/xlm-roberta-large-xnli')\n", 94 | " def forward(self, data, attention_mask=None):\n", 95 | " return self.model(data, attention_mask)[0]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 24, 101 | "id": "bd39c0aa", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# after trace it will save the model in cwd\n", 106 | "pt_model = PyTorch_to_TorchScript().eval()\n", 107 | "\n", 108 | "traced_script_module = torch.jit.trace(pt_model, (input_ids, attention_mask), strict=False)\n", 109 | "traced_script_module.save(\"./model.pt\")" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "id": "4fddef14", 115 | "metadata": {}, 116 | "source": [ 117 | "## Save the converted model to the Triton model repository folder" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 26, 123 | "id": "4610ee46", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "'/root/model_repository/sentence_bert/1/model.pt'" 130 | ] 131 | }, 132 | "execution_count": 26, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "import shutil\n", 139 | "import os\n", 140 | "folder_name = 'sentence_bert'\n", 141 | "# os.mkdir(f'/root/model_repository/{folder_name}')\n", 142 | "# os.mkdir(f'/root/model_repository/{folder_name}/1')\n", 143 | "shutil.copy('model.pt', f'/root/model_repository/{folder_name}/1')" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "6abe8baf", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python [conda env:root] *", 158 | "language": "python", 159 | "name": "conda-root-py" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.8.10" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 5 176 | } 177 | -------------------------------------------------------------------------------- /example_notebooks/serving/triton/convert_xgb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4735f476", 6 | "metadata": {}, 7 | "source": [ 8 | "## Create sklearn and xgboost model" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 2, 14 | "id": "da936a98", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import matplotlib.pyplot as plt\n", 19 | "import pandas as pd\n", 20 | "from sklearn.metrics import classification_report\n", 21 | "from sklearn.preprocessing import OrdinalEncoder\n", 22 | "from xgboost import XGBClassifier\n", 23 | "\n", 24 | "from sklearn import svm\n", 25 | "import warnings\n", 26 | "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", 27 | "from sklearn.model_selection import train_test_split\n", 28 | "import numpy as np\n", 29 | "from sklearn.model_selection import GridSearchCV\n", 30 | "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n", 31 | "warnings.filterwarnings('ignore')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "id": "78a91659", 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/html": [ 43 | "
\n", 44 | "\n", 57 | "\n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | "
classcap-shapecap-surfacecap-colorbruisesodorgill-attachmentgill-spacinggill-sizegill-color...stalk-surface-below-ringstalk-color-above-ringstalk-color-below-ringveil-typeveil-colorring-numberring-typespore-print-colorpopulationhabitat
0pxsntpfcnk...swwpwopksu
1exsytafcbk...swwpwopnng
\n", 135 | "

2 rows × 23 columns

\n", 136 | "
" 137 | ], 138 | "text/plain": [ 139 | " class cap-shape cap-surface cap-color bruises odor gill-attachment \\\n", 140 | "0 p x s n t p f \n", 141 | "1 e x s y t a f \n", 142 | "\n", 143 | " gill-spacing gill-size gill-color ... stalk-surface-below-ring \\\n", 144 | "0 c n k ... s \n", 145 | "1 c b k ... s \n", 146 | "\n", 147 | " stalk-color-above-ring stalk-color-below-ring veil-type veil-color \\\n", 148 | "0 w w p w \n", 149 | "1 w w p w \n", 150 | "\n", 151 | " ring-number ring-type spore-print-color population habitat \n", 152 | "0 o p k s u \n", 153 | "1 o p n n g \n", 154 | "\n", 155 | "[2 rows x 23 columns]" 156 | ] 157 | }, 158 | "execution_count": 3, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "data = pd.read_csv(\"mushrooms.csv\")\n", 165 | "data.head(2)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 6, 171 | "id": "8cd8ec79", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "class object\n", 178 | "cap-shape object\n", 179 | "cap-surface object\n", 180 | "cap-color object\n", 181 | "bruises object\n", 182 | "odor object\n", 183 | "gill-attachment object\n", 184 | "gill-spacing object\n", 185 | "gill-size object\n", 186 | "gill-color object\n", 187 | "stalk-shape object\n", 188 | "stalk-root object\n", 189 | "stalk-surface-above-ring object\n", 190 | "stalk-surface-below-ring object\n", 191 | "stalk-color-above-ring object\n", 192 | "stalk-color-below-ring object\n", 193 | "veil-type object\n", 194 | "veil-color object\n", 195 | "ring-number object\n", 196 | "ring-type object\n", 197 | "spore-print-color object\n", 198 | "population object\n", 199 | "habitat object\n", 200 | "dtype: object" 201 | ] 202 | }, 203 | "execution_count": 6, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "data.dtypes" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 7, 215 | "id": "ca633a98", 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "labels = ['Edible', 'Poisonous']\n", 220 | "values = [data.describe()['class']['freq'], data.describe()['class']['count']-data.describe()['class']['freq']]\n", 221 | "colors = ['green', 'red']" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 15, 227 | "id": "f686aeed", 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "((6499, 117), (6499, 2), (1625, 117), (1625, 2))" 234 | ] 235 | }, 236 | "execution_count": 15, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "# Encoding the string-type data points\n", 243 | "\n", 244 | "data2 = pd.get_dummies(data)\n", 245 | "\n", 246 | "y = data2[['class_e', 'class_p']] # The label for the machine learning models\n", 247 | "X = data2.drop(['class_e', 'class_p'], axis=1) #Features\n", 248 | "\n", 249 | "X = X.astype('float32') # for triton support\n", 250 | "trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=13)\n", 251 | "trainX.shape, trainY.shape, testX.shape, testY.shape" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 29, 257 | "id": "0f3a4acb", 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | "Train Adjusted R2: % 100.0\n", 265 | "Test Adjusted R2: % 100.0\n", 266 | "OOB Score: % 100.0\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "rf = RandomForestClassifier()\n", 272 | "rf.fit(trainX, trainY['class_e'])\n", 273 | "\n", 274 | "train_scoreRF = rf.score(trainX, trainY['class_e'])\n", 275 | "# oob_score = rf.oob_score_\n", 276 | "# Adjusted_R2_trainRF = 1 - (1 - rf.score(trainX, trainY)) * (len(trainY) - 1) / (len(trainY) - trainX.shape[1] - 1)\n", 277 | "# Adjusted_R2_testRF = 1 - (1 - rf.score(testX, testY)) * (len(testY) - 1) / (len(testY) - testX.shape[1] - 1)\n", 278 | "\n", 279 | "print('Train Adjusted R2: %', Adjusted_R2_trainRF * 100)\n", 280 | "print('Test Adjusted R2: %', Adjusted_R2_testRF * 100)\n", 281 | "print('OOB Score: %', oob_score * 100)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 30, 287 | "id": "928802a3", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "import pickle\n", 292 | "with open('rf.pkl', 'wb') as model_file:\n", 293 | " pickle.dump(rf, model_file)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 17, 299 | "id": "359f8f05", 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "----------------------------------------------------\n", 307 | "Accuracy of XGBClassifier: % 100.0\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "model = XGBClassifier(learning_rate=0.005, max_depth=10, n_estimators=30,\n", 313 | " colsample_bytree=0.3, min_child_weight=0.5, reg_alpha=0.3,\n", 314 | " )\n", 315 | "model.fit(trainX, trainY['class_e'])\n", 316 | "\n", 317 | "predictions_XGBC = model.predict(testX)\n", 318 | "acc_XGBC = accuracy_score(predictions_XGBC, testY['class_e'])\n", 319 | "print('----------------------------------------------------')\n", 320 | "print('Accuracy of XGBClassifier: %', 100 * acc_XGBC)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 21, 326 | "id": "2815885f", 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "model.save_model('xgboost.model')" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "347b5223", 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [] 340 | } 341 | ], 342 | "metadata": { 343 | "kernelspec": { 344 | "display_name": "Python [conda env:root] *", 345 | "language": "python", 346 | "name": "conda-root-py" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 3 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython3", 358 | "version": "3.8.10" 359 | } 360 | }, 361 | "nbformat": 4, 362 | "nbformat_minor": 5 363 | } 364 | -------------------------------------------------------------------------------- /example_notebooks/serving/triton/hyperplane-triton-api/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from flask import request 4 | from flask import jsonify 5 | from datetime import datetime 6 | 7 | from geventhttpclient import client 8 | from clients.image_client import run 9 | 10 | from flask import Flask, Response, render_template 11 | 12 | application = Flask(__name__) 13 | 14 | def get_boolean(val, default): 15 | try: 16 | if(str(val).lower() == "false"): 17 | val = False 18 | elif(str(val).lower() == "true"): 19 | val = True 20 | except Exception as e: 21 | print(e) 22 | return default 23 | 24 | def get_number(val, default): 25 | try: 26 | n = int(val) 27 | return n 28 | except Exception as e: 29 | return default 30 | 31 | @application.route("/health-ready", methods=['GET']) 32 | def health_check(): 33 | return jsonify({"message": "Ready"}) 34 | 35 | @application.route("/", methods=['POST'], strict_slashes=False) 36 | def image_client(): 37 | try: 38 | data = request.get_json() 39 | if(data != None): 40 | image = data.get("image", "") 41 | model_name = data.get("model_name", "") 42 | url = data.get("url", "hyperplane-triton.default:8000") 43 | elif(request.form != None): 44 | image = request.form.to_dict()["image"] 45 | model_name = request.form.to_dict()["model_name"] 46 | url = request.form.to_dict()["url"] 47 | else: 48 | data = request.get_json(force=True) 49 | image = data.get("image", "") 50 | model_name = data.get("model_name", "") 51 | url = data.get("url", "hyperplane-triton.default:8000") 52 | 53 | client_run = run( 54 | image = image, 55 | model_name = model_name, 56 | verbose = False, 57 | async_set = False, 58 | streaming = False, 59 | model_version = "", 60 | batch_size= 1, 61 | classes = 3, 62 | scaling = "INCEPTION", 63 | url = url, 64 | protocol = "http" 65 | ) 66 | response = application.response_class( 67 | response=client_run, 68 | status=200, 69 | mimetype='application/json' 70 | ) 71 | return response 72 | except Exception as e: 73 | print(e) 74 | response = json.dumps({ 75 | "hasError" : True, 76 | "error": "There was an error processing your request. Please check your inputs", 77 | "errorMessage": str(e) 78 | }) 79 | return jsonify(response) 80 | 81 | 82 | if __name__ == "__main__": 83 | application.run(debug=True) 84 | -------------------------------------------------------------------------------- /example_notebooks/serving/triton/hyperplane-triton-api/hyperplane_triton_api.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Hyperplane Triton API" 3 | job_type: "basic" 4 | tasks: 5 | - name: "hyperplane triton flask api" 6 | type: "bash script" 7 | port: 8787 8 | bash_script_path: "hyperplane-triton-api/start_server.sh" -------------------------------------------------------------------------------- /example_notebooks/serving/triton/hyperplane-triton-api/requirements.txt: -------------------------------------------------------------------------------- 1 | attrdict==2.0.1 2 | certifi==2021.5.30 3 | click==8.0.1 4 | Flask==2.0.1 5 | gevent==21.1.2 6 | geventhttpclient==1.4.4 7 | greenlet==1.1.0 8 | grpcio==1.39.0 9 | itsdangerous==2.0.1 10 | Jinja2==3.0.1 11 | MarkupSafe==2.0.1 12 | numpy==1.21.1 13 | nvidia-pyindex==1.0.9 14 | Pillow==8.3.1 15 | protobuf==3.17.3 16 | python-rapidjson==1.4 17 | six==1.16.0 18 | tritonclient==2.12.0 19 | Werkzeug==2.0.1 20 | zope.event==4.5.0 21 | zope.interface==5.4.0 22 | -------------------------------------------------------------------------------- /example_notebooks/serving/triton/hyperplane-triton-api/start_server.sh: -------------------------------------------------------------------------------- 1 | pip install -r hyperplane-triton-api/requirements.txt 2 | export FLASK_APP='hyperplane-triton-api/app.py' 3 | pip install flask 4 | python -m flask run -p 8787 --host=0.0.0.0 -------------------------------------------------------------------------------- /example_notebooks/serving/triton/hyperplane-triton-api/test_image.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devsentient/examples/3f6fa29bb900c6572a463f2139bf0cb9440c2ff7/example_notebooks/serving/triton/hyperplane-triton-api/test_image.jpeg -------------------------------------------------------------------------------- /example_notebooks/serving/triton/sentence_classification_app/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from flask import request 4 | from flask import jsonify 5 | from datetime import datetime 6 | 7 | from geventhttpclient import client 8 | from clients.deepset import run_inference 9 | 10 | from flask import Flask, Response, render_template 11 | 12 | application = Flask(__name__) 13 | 14 | 15 | @application.route("/health-ready", methods=['GET']) 16 | def health_check(): 17 | return jsonify({"message": "Ready"}) 18 | 19 | @application.route("/", methods=['POST'], strict_slashes=False) 20 | def client(): 21 | try: 22 | data = request.get_json() 23 | url = data.get("url", "hyperplane-triton.default:8000") 24 | print('line 24 data', data) 25 | results = run_inference(sentence=data["string"], url=url) 26 | print('line 26', results) 27 | return json.dumps(results) 28 | except Exception as e: 29 | print(e) 30 | response = json.dumps({ 31 | "hasError" : True, 32 | "error": "There was an error processing your request. Please check your inputs", 33 | "errorMessage": str(e) 34 | }) 35 | return jsonify(response) 36 | 37 | if __name__ == "__main__": 38 | application.run(debug=True) 39 | -------------------------------------------------------------------------------- /example_notebooks/serving/triton/sentence_classification_app/clients/deepset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from functools import partial 4 | import os 5 | import tritonclient.grpc as tritongrpcclient 6 | import tritonclient.grpc.model_config_pb2 as mc 7 | import tritonclient.http as tritonhttpclient 8 | from tritonclient.utils import triton_to_np_dtype 9 | from tritonclient.utils import InferenceServerException 10 | import torch 11 | from transformers import AutoTokenizer 12 | from torch.nn import functional as F 13 | import json 14 | 15 | tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert') 16 | VERBOSE = False 17 | # sentence1 = 'Who are you voting for 2021?' 18 | # sentence2 = 'Jupiter’s Biggest Moons Started as Tiny Grains of Hail' 19 | # sentence3 = 'Hi Matt, your payment is one week past due. Please use the link below to make your payment' 20 | labels = ['business', 'space and science', 'politics'] 21 | input_name = ['input__0', 'input__1'] 22 | output_name = 'output__0' 23 | 24 | 25 | def run_inference(sentence, model_name='deepset', url='hyperplane-triton.default:8000', model_version='1'): 26 | triton_client = tritonhttpclient.InferenceServerClient( 27 | url=url, verbose=VERBOSE) 28 | model_metadata = triton_client.get_model_metadata( 29 | model_name=model_name, model_version=model_version) 30 | model_config = triton_client.get_model_config( 31 | model_name=model_name, model_version=model_version) 32 | # I have restricted the input sequence length to 256 33 | inputs = tokenizer.batch_encode_plus([sentence] + labels, 34 | return_tensors='pt', max_length=256, 35 | truncation=True, padding='max_length') 36 | 37 | input_ids = inputs['input_ids'] 38 | input_ids = np.array(input_ids, dtype=np.int32) 39 | mask = inputs['attention_mask'] 40 | mask = np.array(mask, dtype=np.int32) 41 | mask = mask.reshape(4, 256) 42 | input_ids = input_ids.reshape(4, 256) 43 | input0 = tritonhttpclient.InferInput(input_name[0], (4, 256), 'INT32') 44 | input0.set_data_from_numpy(input_ids, binary_data=False) 45 | input1 = tritonhttpclient.InferInput(input_name[1], (4, 256), 'INT32') 46 | input1.set_data_from_numpy(mask, binary_data=False) 47 | output = tritonhttpclient.InferRequestedOutput(output_name, binary_data=False) 48 | response = triton_client.infer(model_name, model_version=model_version, inputs=[input0, input1], outputs=[output]) 49 | embeddings = response.as_numpy('output__0') 50 | embeddings = torch.from_numpy(embeddings) 51 | sentence_rep = embeddings[:1].mean(dim=1) 52 | label_reps = embeddings[1:].mean(dim=1) 53 | similarities = F.cosine_similarity(sentence_rep, label_reps) 54 | closest = similarities.argsort(descending=True) 55 | # results = [] 56 | # for ind in closest: 57 | # results.append(f'label: {labels[ind]} \t similarity: {similarities[ind]}') 58 | # return json.dumps(results) 59 | return labels[closest[0]] 60 | 61 | if __name__ == '__main__': 62 | import sys 63 | sent = sys.argv[1] 64 | print(run_inference(sys.argv[1])) 65 | 66 | 67 | # def run(string="", url="hyperplane-triton.default:8000"): 68 | # results = run_inference(sentence=string, url=url) 69 | # return results -------------------------------------------------------------------------------- /example_notebooks/serving/triton/sentence_classification_app/clients/sentence_classification.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | from functools import partial 5 | import os 6 | import tritongrpcclient 7 | import tritongrpcclient.model_config_pb2 as mc 8 | import tritonhttpclient 9 | from tritonclientutils import triton_to_np_dtype 10 | from tritonclientutils import InferenceServerException 11 | from transformers import XLMRobertaTokenizer 12 | from scipy.special import softmax 13 | R_tokenizer = XLMRobertaTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli') 14 | VERBOSE = False 15 | # hypothesis for topic classification 16 | topic = 'This text is about space & cosmos' 17 | input_name = ['input__0', 'input__1'] 18 | output_name = 'output__0' 19 | def run_inference(premise, model_name='deepset', url='hyperplane-triton.default:8000', model_version='1'): 20 | triton_client = tritonhttpclient.InferenceServerClient( 21 | url=url, verbose=VERBOSE) 22 | model_metadata = triton_client.get_model_metadata( 23 | model_name=model_name, model_version=model_version) 24 | model_config = triton_client.get_model_config( 25 | model_name=model_name, model_version=model_version) 26 | # I have restricted the input sequence length to 256 27 | input_ids = R_tokenizer.encode(premise, topic, max_length=256, truncation=True, padding='max_length') 28 | input_ids = np.array(input_ids, dtype=np.int32) 29 | mask = input_ids != 1 30 | mask = np.array(mask, dtype=np.int32) 31 | 32 | mask = mask.reshape(1, 256) 33 | input_ids = input_ids.reshape(1, 256) 34 | input0 = tritonhttpclient.InferInput(input_name[0], (1, 256), 'INT32') 35 | input0.set_data_from_numpy(input_ids, binary_data=False) 36 | input1 = tritonhttpclient.InferInput(input_name[1], (1, 256), 'INT32') 37 | input1.set_data_from_numpy(mask, binary_data=False) 38 | output = tritonhttpclient.InferRequestedOutput(output_name, binary_data=False) 39 | response = triton_client.infer(model_name, model_version=model_version, inputs=[input0, input1], outputs=[output]) 40 | logits = response.as_numpy('output__0') 41 | logits = np.asarray(logits, dtype=np.float32) 42 | # we throw away "neutral" (dim 1) and take the probability of 43 | # "entailment" (2) as the probability of the label being true 44 | entail_contradiction_logits = logits[:,[0,2]] 45 | probs = softmax(entail_contradiction_logits) 46 | true_prob = probs[:,1].item() * 100 47 | print(f'Probability that the label is true: {true_prob:0.2f}%') 48 | # topic classification premises 49 | if __name__ == '__main__': 50 | run_inference('Jupiter’s Biggest Moons Started as Tiny Grains of Hail') 51 | 52 | -------------------------------------------------------------------------------- /example_notebooks/serving/triton/sentence_classification_app/hyperplane_triton_api.yaml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | name: "Hyperplane sentence classificaton API" 3 | tasks: 4 | - name: "start Flask api" 5 | type: "bash script" 6 | port: 8787 7 | bash_script_path: "triton_endpoints/sentence_classification_app/start_server.sh" -------------------------------------------------------------------------------- /example_notebooks/serving/triton/sentence_classification_app/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.68.1 2 | Flask==2.0.1 3 | geventhttpclient==1.4.4 4 | gevent==21.1.2 5 | transformers==4.9.2 6 | tritonclient==2.12.0 7 | torch==1.9.0 8 | -------------------------------------------------------------------------------- /example_notebooks/serving/triton/sentence_classification_app/start_server.sh: -------------------------------------------------------------------------------- 1 | PROJECT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 2 | cd "$PROJECT_DIR" 3 | pip install -r requirements.txt 4 | export FLASK_APP='app.py' 5 | python -m flask run -p 8787 --host=0.0.0.0 -------------------------------------------------------------------------------- /example_notebooks/training/ray_mlflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bb0bea8b", 6 | "metadata": {}, 7 | "source": [ 8 | "## Use Ray Tune and MLFlow on Hyperplane " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "6b29316b", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# !pip install tensorboardX --quiet\n", 19 | "# !pip install kubernetes==18.20 --quiet" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "b624a2c5", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "2021-12-08 05:29:55.971324: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n" 33 | ] 34 | }, 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "ray version 1.8.0\n", 40 | "tf version 2.4.1\n", 41 | "torch version 1.7.1+cpu\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import ray\n", 47 | "import tensorflow as tf\n", 48 | "import torch \n", 49 | "print(f'ray version {ray.__version__}')\n", 50 | "print(f'tf version {tf.__version__}')\n", 51 | "print(f'torch version {torch.__version__}')\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "2aa2d67b", 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "👉 Hyperplane: selecting worker node pool\n", 65 | "best pool spec {'pool_env_var': 'DASK_POOL_16_16', 'allocatable_cores': 15.0, 'allocatable_ram': 12.0}\n" 66 | ] 67 | }, 68 | { 69 | "name": "stderr", 70 | "output_type": "stream", 71 | "text": [ 72 | "2021-12-08 05:30:05,885\tWARNING services.py:1748 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 67108864 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=8.39gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM.\n" 73 | ] 74 | }, 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Waiting for worker ray-worker-d41fba09-79e7-47c2-8da7-0a75d0eab126...\n", 80 | "Waiting for worker ray-worker-24b53498-d460-42f6-a1a7-ade2d34dc1e9...\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "from hyperplane.ray_common import initialize_ray_cluster, stop_ray_cluster, find_ray_workers\n", 86 | "num_workers = 2\n", 87 | "cpu_core_per_worker = 7\n", 88 | "ram_gb_per_worker = 6 #110 GB allocatible for 16_128 nodes, 12 for 16_16 nodes, 27 for 32_32 nodes\n", 89 | "ray_cluster = initialize_ray_cluster(num_workers, cpu_core_per_worker, ram_gb_per_worker)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "d7ab1c6e", 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stderr", 100 | "output_type": "stream", 101 | "text": [ 102 | "\u001b[2m\u001b[33m(raylet, ip=10.1.155.4)\u001b[0m [2021-12-08 05:30:09,168 E 17 17] agent_manager.cc:134: Not all required Ray dependencies for the runtime_env feature were found. To install the required dependencies, please run `pip install 'ray[default]'`.\n", 103 | "\u001b[2m\u001b[33m(raylet, ip=10.1.155.4)\u001b[0m [2021-12-08 05:30:09,168 E 17 17] worker_pool.cc:566: [Eagerly] Couldn't create a runtime environment for job 01000000.\n" 104 | ] 105 | }, 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "mlflow version 1.17.0\n" 111 | ] 112 | }, 113 | { 114 | "name": "stderr", 115 | "output_type": "stream", 116 | "text": [ 117 | "\u001b[2m\u001b[33m(raylet, ip=10.1.156.4)\u001b[0m [2021-12-08 05:30:09,377 E 16 16] agent_manager.cc:134: Not all required Ray dependencies for the runtime_env feature were found. To install the required dependencies, please run `pip install 'ray[default]'`.\n", 118 | "\u001b[2m\u001b[33m(raylet, ip=10.1.156.4)\u001b[0m [2021-12-08 05:30:09,377 E 16 16] worker_pool.cc:566: [Eagerly] Couldn't create a runtime environment for job 01000000.\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "import os\n", 124 | "import tempfile\n", 125 | "import time\n", 126 | "\n", 127 | "import mlflow\n", 128 | "\n", 129 | "from ray import tune\n", 130 | "from ray.tune.integration.mlflow import MLflowLoggerCallback, mlflow_mixin\n", 131 | "print('mlflow version', mlflow.__version__)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "f681277e", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "def evaluation_fn(step, width, height):\n", 142 | " return (0.1 + width * step / 100)**(-1) + height * 0.1\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "6d8bd767", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "def easy_objective(config):\n", 153 | " # Hyperparameters\n", 154 | " width, height = config[\"width\"], config[\"height\"]\n", 155 | "\n", 156 | " for step in range(config.get(\"steps\", 100)):\n", 157 | " # Iterative training function - can be any arbitrary training procedure\n", 158 | " intermediate_score = evaluation_fn(step, width, height)\n", 159 | " # Feed the score back to Tune.\n", 160 | " tune.report(iterations=step, mean_loss=intermediate_score)\n", 161 | " time.sleep(0.1)\n" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "f66b2d6a", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "def tune_function(mlflow_tracking_uri, finish_fast=False):\n", 172 | " tune.run(\n", 173 | " easy_objective,\n", 174 | " name=\"mlflow\",\n", 175 | " num_samples=5,\n", 176 | " callbacks=[\n", 177 | " MLflowLoggerCallback(\n", 178 | " tracking_uri=mlflow_tracking_uri,\n", 179 | " experiment_name=\"mixin_example\",\n", 180 | " save_artifact=True)\n", 181 | " ],\n", 182 | " config={\n", 183 | " \"width\": tune.randint(10, 100),\n", 184 | " \"height\": tune.randint(0, 100),\n", 185 | " \"steps\": 5 if finish_fast else 100,\n", 186 | " })" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "b2d1a387", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "@mlflow_mixin\n", 197 | "def decorated_easy_objective(config):\n", 198 | " # Hyperparameters\n", 199 | " width, height = config[\"width\"], config[\"height\"]\n", 200 | "\n", 201 | " for step in range(config.get(\"steps\", 100)):\n", 202 | " # Iterative training function - can be any arbitrary training procedure\n", 203 | " intermediate_score = evaluation_fn(step, width, height)\n", 204 | " # Log the metrics to mlflow\n", 205 | " mlflow.log_metrics(dict(mean_loss=intermediate_score), step=step)\n", 206 | " # Feed the score back to Tune.\n", 207 | " tune.report(iterations=step, mean_loss=intermediate_score)\n", 208 | " time.sleep(0.1)\n" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "936e7e30", 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "def tune_decorated(mlflow_tracking_uri, finish_fast=False):\n", 219 | " # Set the experiment, or create a new one if does not exist yet.\n", 220 | " mlflow.set_tracking_uri(mlflow_tracking_uri)\n", 221 | " mlflow.set_experiment(experiment_name=\"mixin_example\")\n", 222 | " tune.run(\n", 223 | " decorated_easy_objective,\n", 224 | " name=\"mlflow\",\n", 225 | " verbose = 1, \n", 226 | " num_samples=5,\n", 227 | " config={\n", 228 | " \"width\": tune.randint(10, 100),\n", 229 | " \"height\": tune.randint(0, 100),\n", 230 | " \"steps\": 5 if finish_fast else 100,\n", 231 | " \"mlflow\": {\n", 232 | " \"experiment_name\": \"mixin_example\",\n", 233 | " \"tracking_uri\": mlflow.get_tracking_uri()\n", 234 | " }\n", 235 | " })\n" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "49479f22", 241 | "metadata": {}, 242 | "source": [ 243 | "## setup MLFlow tracking URI" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "1d0910af", 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "postgresql://postgres:postgres@postgresql.postgres-m288j5y2\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "import os\n", 262 | "mlflow.set_tracking_uri(os.environ.get('DATABASE_URL_NO_PARAMS')[:-12]) ## this one \n", 263 | "tracking_uri = mlflow.get_tracking_uri()\n", 264 | "print(tracking_uri)\n", 265 | "\n", 266 | "experiment_name = 'pbt_babi_memnn'\n" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "id": "49dcba6e", 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "data": { 277 | "text/html": [ 278 | "== Status ==
Current time: 2021-12-08 05:36:35 (running for 00:00:18.28)
Memory usage on this node: 2.4/31.4 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/37 CPUs, 0/0 GPUs, 0.0/32.0 GiB heap, 0.0/14.8 GiB objects
Result logdir: /root/ray_results/mlflow
Number of trials: 5/5 (5 TERMINATED)

" 279 | ], 280 | "text/plain": [ 281 | "" 282 | ] 283 | }, 284 | "metadata": {}, 285 | "output_type": "display_data" 286 | }, 287 | { 288 | "name": "stderr", 289 | "output_type": "stream", 290 | "text": [ 291 | "2021-12-08 05:36:35,856\tINFO tune.py:630 -- Total run time: 18.43 seconds (18.26 seconds for the tuning loop).\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "tune_decorated(tracking_uri)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "id": "a1457fd5", 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "Deleting ray-worker-d41fba09-79e7-47c2-8da7-0a75d0eab126\n", 310 | "Deleting ray-worker-24b53498-d460-42f6-a1a7-ade2d34dc1e9\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "stop_ray_cluster(ray_cluster)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "59f23b80-f277-4c60-a2bc-443a38611e5f", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": "Python 3 (ipykernel)", 330 | "language": "python", 331 | "name": "python3" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 3 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython3", 343 | "version": "3.8.10" 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 5 348 | } 349 | -------------------------------------------------------------------------------- /example_notebooks/training/tf_mlflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0da9e94a-ec02-4fbc-9b00-82f461009494", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import warnings\n", 11 | "warnings.filterwarnings('ignore')\n", 12 | "\n", 13 | "import os\n", 14 | "os.environ['TF_CPP_MIN_LOG_LEVEL'] = \"2\"\n", 15 | "import warnings\n", 16 | "import sys\n", 17 | "import datetime\n", 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "from typing import Dict, Any, Tuple\n", 22 | "import mlflow\n", 23 | "from zipfile import ZipFile\n", 24 | "import tensorflow as tf\n", 25 | "from tensorflow import keras\n", 26 | "from tensorflow.keras import layers\n", 27 | "from pathlib import Path\n", 28 | "tf.get_logger().setLevel(\"WARNING\")\n", 29 | "tf.autograph.set_verbosity(2)\n", 30 | "\n", 31 | "import mlflow.keras\n", 32 | "mlflow.tensorflow.autolog()\n", 33 | "\n", 34 | "from hyperplane import notebook_common as nc" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "id": "e7d57232-41ea-467a-aa9d-a9176e5c89c6", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "postgresql://postgres:postgres@postgresql.postgres-m288j5y2\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "import os\n", 53 | "mlflow.set_tracking_uri(os.environ.get('HYPERPLANE_MLFLOW_URL')) ## this one \n", 54 | "tracking_uri = mlflow.get_tracking_uri()\n", 55 | "print(tracking_uri)\n", 56 | "experiment_name = 'recommender'\n", 57 | "\n", 58 | "mlflow.set_experiment(experiment_name=experiment_name)\n", 59 | "\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "df909d77-9af8-4677-9895-09122c82fa55", 65 | "metadata": {}, 66 | "source": [ 67 | "## read data " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "id": "31c82277-f692-4b05-834a-62d9c86658d7", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "## read movielens data\n", 78 | "movielens_data_file_url = (\n", 79 | " \"http://files.grouplens.org/datasets/movielens/ml-latest-small.zip\"\n", 80 | ")\n", 81 | "movielens_zipped_file = keras.utils.get_file(\n", 82 | " \"ml-latest-small.zip\", movielens_data_file_url, extract=False\n", 83 | ")\n", 84 | "keras_datasets_path = Path(movielens_zipped_file).parents[0]\n", 85 | "movielens_dir = keras_datasets_path / \"ml-latest-small\"" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "id": "567b3a04-00f5-4d49-965e-ad75cfa2e96f", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "(100836, 4)\n" 99 | ] 100 | }, 101 | { 102 | "data": { 103 | "text/html": [ 104 | "
\n", 105 | "\n", 118 | "\n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | "
userIdmovieIdratingtimestamp
0114.0964982703
1134.0964981247
\n", 145 | "
" 146 | ], 147 | "text/plain": [ 148 | " userId movieId rating timestamp\n", 149 | "0 1 1 4.0 964982703\n", 150 | "1 1 3 4.0 964981247" 151 | ] 152 | }, 153 | "execution_count": 4, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "# Only extract the data the first time the script is run.\n", 160 | "if not movielens_dir.exists():\n", 161 | " with ZipFile(movielens_zipped_file, \"r\") as zip:\n", 162 | " # Extract files\n", 163 | " print(\"Extracting all the files now...\")\n", 164 | " zip.extractall(path=keras_datasets_path)\n", 165 | " print(\"Done!\")\n", 166 | " \n", 167 | "ratings_file = movielens_dir / \"ratings.csv\"\n", 168 | "df = pd.read_csv(ratings_file)\n", 169 | "print(df.shape)\n", 170 | "df.head(2)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 5, 176 | "id": "6d362291-11eb-407b-b85b-8b8904032e87", 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "Number of users: 610, Number of Movies: 9724, Min rating: 0.5, Max rating: 5.0\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "#First, need to perform some preprocessing to encode users and movies as integer indices.\n", 189 | "\n", 190 | "user_ids = df[\"userId\"].unique().tolist()\n", 191 | "user2user_encoded = {x: i for i, x in enumerate(user_ids)}\n", 192 | "userencoded2user = {i: x for i, x in enumerate(user_ids)}\n", 193 | "movie_ids = df[\"movieId\"].unique().tolist()\n", 194 | "movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}\n", 195 | "movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}\n", 196 | "df[\"user\"] = df[\"userId\"].map(user2user_encoded)\n", 197 | "df[\"movie\"] = df[\"movieId\"].map(movie2movie_encoded)\n", 198 | "\n", 199 | "num_users = len(user2user_encoded)\n", 200 | "num_movies = len(movie_encoded2movie)\n", 201 | "df[\"rating\"] = df[\"rating\"].values.astype(np.float32)\n", 202 | "# min and max ratings will be used to normalize the ratings later\n", 203 | "min_rating = min(df[\"rating\"])\n", 204 | "max_rating = max(df[\"rating\"])\n", 205 | "\n", 206 | "print(\n", 207 | " \"Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}\".format(\n", 208 | " num_users, num_movies, min_rating, max_rating\n", 209 | " )\n", 210 | ")" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 6, 216 | "id": "947947ad-413a-4159-8e64-eda4d1e98fe3", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# split into train test\n", 221 | "df = df.sample(frac=1, random_state=42)\n", 222 | "x = df[[\"user\", \"movie\"]].values\n", 223 | "# Normalize the targets between 0 and 1. Makes it easy to train.\n", 224 | "y = df[\"rating\"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values\n", 225 | "# Assuming training on 90% of the data and validating on 10%.\n", 226 | "train_indices = int(0.9 * df.shape[0])\n", 227 | "x_train, x_val, y_train, y_val = (\n", 228 | " x[:train_indices],\n", 229 | " x[train_indices:],\n", 230 | " y[:train_indices],\n", 231 | " y[train_indices:],\n", 232 | ")" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "b012f095-4cdd-4060-9d1d-0955c70bd781", 238 | "metadata": {}, 239 | "source": [ 240 | "## model" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 7, 246 | "id": "44d9e9ae-9165-431a-96ca-90cfad7e19d5", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "class RecommenderNet(keras.Model):\n", 251 | " def __init__(self, num_users, num_movies, embedding_size, **kwargs):\n", 252 | " super(RecommenderNet, self).__init__(**kwargs)\n", 253 | " self.num_users = num_users\n", 254 | " self.num_movies = num_movies\n", 255 | " self.embedding_size = embedding_size\n", 256 | " self.user_embedding = layers.Embedding(\n", 257 | " num_users,\n", 258 | " embedding_size,\n", 259 | " embeddings_initializer=\"he_normal\",\n", 260 | " embeddings_regularizer=keras.regularizers.l2(1e-6),\n", 261 | " )\n", 262 | " self.user_bias = layers.Embedding(num_users, 1)\n", 263 | " self.movie_embedding = layers.Embedding(\n", 264 | " num_movies,\n", 265 | " embedding_size,\n", 266 | " embeddings_initializer=\"he_normal\",\n", 267 | " embeddings_regularizer=keras.regularizers.l2(1e-6),\n", 268 | " )\n", 269 | " self.movie_bias = layers.Embedding(num_movies, 1)\n", 270 | "\n", 271 | " def call(self, inputs):\n", 272 | " user_vector = self.user_embedding(inputs[:, 0])\n", 273 | " user_bias = self.user_bias(inputs[:, 0])\n", 274 | " movie_vector = self.movie_embedding(inputs[:, 1])\n", 275 | " movie_bias = self.movie_bias(inputs[:, 1])\n", 276 | " dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)\n", 277 | " # Add all the components (including bias)\n", 278 | " x = dot_user_movie + user_bias + movie_bias\n", 279 | " # The sigmoid activation forces the rating to between 0 and 1\n", 280 | " return tf.nn.sigmoid(x)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 8, 286 | "id": "d9927d3a-b59b-44ef-946f-59febec4924f", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "def train_model(lr, batch_size):\n", 291 | " embedding_size = 100\n", 292 | " callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)\n", 293 | " model = RecommenderNet(num_users, num_movies, embedding_size)\n", 294 | " model.compile(\n", 295 | " loss=tf.keras.losses.BinaryCrossentropy(), \n", 296 | " optimizer=keras.optimizers.Adam(learning_rate = lr),\n", 297 | "# metrics = [\"accuracy\"]\n", 298 | " )\n", 299 | " history = model.fit(\n", 300 | " x=x_train,\n", 301 | " y=y_train,\n", 302 | " batch_size=batch_size,\n", 303 | " epochs=1,\n", 304 | " callbacks = [callback],\n", 305 | " verbose=1,\n", 306 | " validation_data=(x_val, y_val),\n", 307 | " )" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 9, 313 | "id": "226238b0-09e2-4b31-af19-99b4b71d5bf7", 314 | "metadata": { 315 | "tags": [ 316 | "parameters" 317 | ] 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "## parameter cell that can be replaced with injected parameters\n", 322 | "batch_size = 32\n", 323 | "lr = 0.001" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 10, 329 | "id": "81b18d39-788f-4c32-89c6-eabb20a34253", 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "name": "stderr", 334 | "output_type": "stream", 335 | "text": [ 336 | "2022/01/31 16:59:51 WARNING mlflow.utils.autologging_utils: Encountered unexpected error during autologging: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.\n" 337 | ] 338 | }, 339 | { 340 | "name": "stdout", 341 | "output_type": "stream", 342 | "text": [ 343 | "2836/2836 [==============================] - 13s 4ms/step - loss: 0.6520 - val_loss: 0.6209\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "# iterate experiment for hyperparameter search\n", 349 | "mlflow.start_run()\n", 350 | "train_model(lr, batch_size)\n", 351 | "mlflow.end_run() " 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "id": "3e444754-62e7-43c7-b7dc-ba43cc84cc0a", 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [] 361 | } 362 | ], 363 | "metadata": { 364 | "kernelspec": { 365 | "display_name": "Python 3 (ipykernel)", 366 | "language": "python", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "codemirror_mode": { 371 | "name": "ipython", 372 | "version": 3 373 | }, 374 | "file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.8.10" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 5 384 | } 385 | -------------------------------------------------------------------------------- /example_notebooks/utils/dask_snowflake.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "248d39d3", 6 | "metadata": {}, 7 | "source": [ 8 | "## A boilerplate code for using Dask to read from Snowflake " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "0657a24f", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import dask.dataframe as dd\n", 19 | "df = dd.read_sql_table(\n", 20 | " 'accounts', \n", 21 | " 'snowflake://user:pass@...warehouse=...role=...', \n", 22 | " npartitions=10, \n", 23 | " index_col='id'\n", 24 | ")" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "063437a0-0743-424c-867f-9f9fed346e3a", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "## Please checkout the API documentation https://yourdomain.hyperplane.dev/hyperplane_docs/api.html for more details\n", 35 | "\n", 36 | "from hyperplane import notebook_common as nc\n", 37 | "num_workers = 2 # number of nodes to spin up\n", 38 | "\n", 39 | "## node specific parameters\n", 40 | "total_memory = 110 #110 GB allocatible for 16_128 nodes, 12G for 16_16 nodes, 27G for 32_32\n", 41 | "cors_per_worker = 15 # 15 cores for 16_128 nodes and 16_16 nodes, 28 cores for 32_32 nodes\n", 42 | "nprocs = 15\n", 43 | "ram_gb_per_proc = total_memory/nprocs\n", 44 | "nthreads = int(cors_per_worker/nprocs)\n", 45 | "\n", 46 | "print(f'initializing with {num_workers} num_workers, {nprocs} nprocs each proc has {ram_gb_per_proc} GB')\n", 47 | "client, cluster = nc.initialize_cluster(\n", 48 | " num_workers = num_workers,\n", 49 | " nprocs = nprocs,\n", 50 | " nthreads = nthreads,\n", 51 | " ram_gb_per_proc = ram_gb_per_proc,\n", 52 | " cores_per_worker = cors_per_worker\n", 53 | " )" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "490df05b", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import snowflake.connector\n", 64 | "from dask.dataframe import from_delayed\n", 65 | "from dask.distributed import delayed\n", 66 | "\n", 67 | "@delayed\n", 68 | "def load(connection_info, query, start, end):\n", 69 | " conn = snowflake.connector.connect(**connection_info)\n", 70 | " cur = conn.cursor()\n", 71 | " cur.execute(query, start, end)\n", 72 | " return cur.fetch_pandas_all()\n", 73 | "ddf = from_delayed(*[load(connection_info, query, st, ed) for st, ed in partitions])\n", 74 | "ddf.persist()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "07b1394c-0157-4148-a978-a1a0a64a1047", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "client.close()\n", 85 | "cluster.close()" 86 | ] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "Python 3 (ipykernel)", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 | "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "3.8.10" 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 5 110 | } 111 | -------------------------------------------------------------------------------- /example_notebooks/utils/ray_starter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "31f5f060-610d-44d1-ac83-938c82870904", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from hyperplane.ray_common import initialize_ray_cluster, stop_ray_cluster, find_ray_workers\n", 11 | "num_workers = 2\n", 12 | "cpu_core_per_worker = 15\n", 13 | "ram_gb_per_worker = 12 #110 GB allocatible for 16_128 nodes, 12 for 16_16 nodes, 27 for 32_32 nodes\n", 14 | "ray_cluster = initialize_ray_cluster(num_workers, cpu_core_per_worker, ram_gb_per_worker)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "id": "42fc1e86-ee3e-4b18-8b3a-2a7a73b25b61", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import ray\n", 25 | "# Create a Dataset of Python objects.\n", 26 | "ds = ray.data.range(10000)\n", 27 | "ds" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "a8561398-4bc6-43c5-aab5-516f156c8e02", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import pyarrow\n", 38 | "pyarrow.__version__" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "724f59e5-a70a-4ce2-8eef-e598b106af75", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "ds.take(5)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "5386dfd4-7073-4165-8a5d-da346705a23a", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "ds.count()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "8710a815-db9e-4bc6-a477-a5cc21ef1dbd", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# Create a Dataset of Arrow records.\n", 69 | "ds = ray.data.from_items([{\"col1\": i, \"col2\": str(i)} for i in range(10000)])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "db5cc7ea-12d3-4ca8-a597-47058695e71c", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "ds.show(5)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "6b013285-e164-493e-80e0-32f7885e4cf6", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "ds.schema()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "335814b0-e935-4f75-8a9b-a49cf3b19996", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "stop_ray_cluster(ray_cluster)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "91312729-f717-4655-8e8e-1607564b03ac", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [] 109 | } 110 | ], 111 | "metadata": { 112 | "kernelspec": { 113 | "display_name": "Python [conda env:root] *", 114 | "language": "python", 115 | "name": "conda-root-py" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.8.10" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 5 132 | } 133 | -------------------------------------------------------------------------------- /example_notebooks/utils/s3plugin.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "6e82ddaa", 6 | "metadata": {}, 7 | "source": [ 8 | "## S3 plugin to download data to dask workers" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "4258997f", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install boto3 --quiet\n", 19 | "!pip install s3urls --quiet" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "fb4535f4", 25 | "metadata": {}, 26 | "source": [ 27 | "### customized dask worker plugin for files and gz files" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "id": "a5878f47", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from distributed.diagnostics.plugin import WorkerPlugin\n", 38 | "class S3DownloadPlugin(WorkerPlugin):\n", 39 | " def __init__(self, s3Url, filename):\n", 40 | " self.s3Url = s3Url\n", 41 | " self.filename = filename\n", 42 | " \n", 43 | " def setup(self, worker):\n", 44 | " self.worker = worker\n", 45 | " import boto3\n", 46 | " import click\n", 47 | " from s3urls import parse_url\n", 48 | " import tarfile\n", 49 | "\n", 50 | " import logging, traceback, sys, os \n", 51 | " logger = logging.getLogger(\"embeddings microservice\")\n", 52 | " logger.debug(\"downloading file...\")\n", 53 | " vocab_parsed_url = parse_url(self.s3Url)\n", 54 | " s3 = boto3.client('s3')\n", 55 | " s3.download_file(vocab_parsed_url['bucket'], vocab_parsed_url['key'], self.filename)\n", 56 | " logger.debug(\"done downloading....\")\n", 57 | " logger.debug(\"extracting....\")\n", 58 | " if self.filename.endswith(\"tar.gz\") or self.filename.endswith(\"tgz\"):\n", 59 | " tar = tarfile.open(self.filename, \"r:gz\")\n", 60 | " tar.extractall(path='/tmp/')\n", 61 | " tar.close()\n", 62 | " os.remove(self.filename)\n", 63 | " logger.debug(\"done extracting....\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "id": "43fd1edb", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "s3://d2v-tmp/demo/data/aclImdb_v1.tar.gz\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "## this is the file you want to download to dask workers, here assumes it's sitting in a s3 bucket \n", 82 | "tgz_file_global = \"aclImdb_v1.tar.gz\"\n", 83 | "s3Url=f\"s3://d2v-tmp/demo/data/{tgz_file_global}\"\n", 84 | "filename=tgz_file_global\n", 85 | "print(s3Url)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "55bb1331", 91 | "metadata": {}, 92 | "source": [ 93 | "## Download data to local " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 9, 99 | "id": "ec46cc7d", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "import boto3\n", 104 | "import click\n", 105 | "from s3urls import parse_url\n", 106 | "import tarfile\n", 107 | "\n", 108 | "import logging, traceback, sys, os \n", 109 | "vocab_parsed_url = parse_url(s3Url)\n", 110 | "s3 = boto3.client('s3')\n", 111 | "s3.download_file(vocab_parsed_url['bucket'], vocab_parsed_url['key'], filename)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "fee1a5e7", 117 | "metadata": {}, 118 | "source": [ 119 | "## Upload data to dask wokers" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "id": "3b275fdc", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "👉 Hyperplane: selecting worker node pool\n", 133 | "👉 Hyperplane: selecting scheduler node pool\n", 134 | "Creating scheduler pod on cluster. This may take some time.\n", 135 | "👉 Hyperplane: spinning up a dask cluster with a scheduler as a standalone container.\n", 136 | "👉 Hyperplane: In a few minutes you'll be able to access the dashboard at https://ds.hyperplane.dev/dask-cluster-45548bff-fd74-4136-8efa-8fde70d27961/status\n", 137 | "👉 Hyperplane: to get logs from all workers, do `cluster.get_logs()`\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "from hyperplane import notebook_common as nc\n", 143 | "client, cluster = nc.initialize_cluster(num_workers=1)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "id": "85731ca9", 149 | "metadata": {}, 150 | "source": [ 151 | "## Upload file to remote workers and extract if it's a compressed file\n", 152 | "- first install necessary packages on the workers \n", 153 | "- wait for 30 seconds for all workers to finish installation before moving to the next step" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "df1fa98c", 160 | "metadata": { 161 | "tags": [] 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "from dask.distributed import PipInstall\n", 166 | "import time\n", 167 | "plugin = PipInstall(packages=[\"boto3\", \"s3urls\", \"botocore\", \"click\"], pip_options=[\"--upgrade\"])\n", 168 | "client.register_worker_plugin(plugin)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "740742b1", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "client.register_worker_plugin(S3DownloadPlugin(s3Url=s3Url, filename=tgz_file_global))" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 17, 184 | "id": "5616d2a8", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "cluster.close()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "76d6cfae", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python [conda env:root] *", 203 | "language": "python", 204 | "name": "conda-root-py" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.8.10" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 5 221 | } 222 | -------------------------------------------------------------------------------- /example_notebooks/utils/submit_graphql_with_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "66400b02", 6 | "metadata": {}, 7 | "source": [ 8 | "\n", 9 | "### Example of submit GraphQL query with the `graphql_operation` function" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 7, 15 | "id": "7a025c40", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from hyperplane.notebook_common import graphql_operation\n", 20 | "import numpy as np" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "237e53e3", 26 | "metadata": {}, 27 | "source": [ 28 | "## submit one job" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 36, 34 | "id": "ba283457", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "gql_query = \"\"\"\n", 39 | " mutation submitJob {\n", 40 | " createPipelineJob(\n", 41 | " data: {\n", 42 | " jobType: \"basic\"\n", 43 | " timeout: 1800\n", 44 | " activeTimeout: 1800\n", 45 | " maxRetries: 1\n", 46 | " pipelineYamlPath: \"crypto-bot/balance_pipeline/get_balances.yaml\"\n", 47 | " }\n", 48 | " ) {\n", 49 | " id\n", 50 | " status\n", 51 | " }\n", 52 | " }\n", 53 | "\n", 54 | "\"\"\" " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 37, 60 | "id": "19d0a352", 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "[{'createPipelineJob': {'id': 'cdd1e16c-9e42-4d2d-add7-12d974cf0d08', 'status': 'pending'}}]\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "### the graphql_operation funcition is a async function so we add await. \n", 73 | "data = await graphql_operation(gql_query)\n", 74 | "print(data)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "1b5098b2", 80 | "metadata": {}, 81 | "source": [ 82 | "## submit with parameters" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 32, 88 | "id": "76063976", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/plain": [ 94 | "[{'createPipelineJob': {'id': 'bf63b1c4-1a92-4ce3-bbac-e2e590241c3e',\n", 95 | " 'status': 'pending'}}]" 96 | ] 97 | }, 98 | "execution_count": 32, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "timeout = 180\n", 105 | "max_retries = 1\n", 106 | "param = \"param1\"\n", 107 | "\n", 108 | "gql_query = \"\"\"\n", 109 | " mutation submitModel { \n", 110 | " createPipelineJob (data: {\n", 111 | " jobType: \"basic\",\n", 112 | " timeout: %d,\n", 113 | " maxRetries: %d,\n", 114 | " pipelineYamlPath: \"example.yaml\",\n", 115 | " parameters: {\n", 116 | " create: [\n", 117 | " {key: \"param1\", value: \"%s\"},\n", 118 | " ]\n", 119 | " }\n", 120 | " }) { \n", 121 | " id \n", 122 | " status\n", 123 | "\n", 124 | " } \n", 125 | " }\n", 126 | " \"\"\" % (timeout, max_retries, param)\n", 127 | "\n", 128 | "data = await graphql_operation(gql_query)\n", 129 | "data" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "00097b87", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python [conda env:root] *", 144 | "language": "python", 145 | "name": "conda-root-py" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 3 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython3", 157 | "version": "3.8.10" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 5 162 | } 163 | --------------------------------------------------------------------------------