├── .gitignore
├── LICENSE
├── README.md
├── ai
│   ├── autogen
│   │   ├── README.md
│   │   ├── autogen-code-execution.ipynb
│   │   ├── autogen-tools.ipynb
│   │   ├── autogent-tutorial.ipynb
│   │   ├── docker-example.ipynb
│   │   └── requirements.txt
│   ├── langchain
│   │   └── langchain-rag-basics
│   │       ├── basics.ipynb
│   │       ├── data
│   │       │   ├── chroma
│   │       │   │   ├── 39b238f5-b82f-42ff-a683-cc8d5aea4747
│   │       │   │   │   ├── data_level0.bin
│   │       │   │   │   ├── header.bin
│   │       │   │   │   ├── length.bin
│   │       │   │   │   └── link_lists.bin
│   │       │   │   └── chroma.sqlite3
│   │       │   ├── getting-real
│   │       │   │   ├── getting-real-01-introduction.pdf
│   │       │   │   ├── getting-real-02-starting-line.pdf
│   │       │   │   ├── getting-real-03-stay-lean.pdf
│   │       │   │   ├── getting-real-04-priorities.pdf
│   │       │   │   ├── getting-real-05-feature-selection.pdf
│   │       │   │   ├── getting-real-06-process.pdf
│   │       │   │   ├── getting-real-07-organization.pdf
│   │       │   │   ├── getting-real-08-staffing.pdf
│   │       │   │   ├── getting-real-09-interface-design.pdf
│   │       │   │   ├── getting-real-10-code.pdf
│   │       │   │   ├── getting-real-11-words.pdf
│   │       │   │   ├── getting-real-12-pricing-signup.pdf
│   │       │   │   ├── getting-real-13-promotion.pdf
│   │       │   │   ├── getting-real-14-support.pdf
│   │       │   │   ├── getting-real-15-post-launch.pdf
│   │       │   │   └── getting-real-full.pdf
│   │       │   └── nba-rules-2023.pdf
│   │       ├── qa.ipynb
│   │       ├── rag-retrieval.ipynb
│   │       └── requirements.txt
│   └── litellm
│       ├── README.md
│       ├── deepseek_example.py
│       └── requirements.txt
├── ds
│   ├── ab-testing
│   │   ├── README.md
│   │   ├── Walkthrough.ipynb
│   │   ├── ab-testing-math.ipynb
│   │   ├── requirements.txt
│   │   └── utils
│   │       ├── data.py
│   │       ├── plot.py
│   │       └── stats.py
│   ├── airflow
│   │   ├── README.md
│   │   ├── requirements.txt
│   │   └── simple_bash_dag.py
│   ├── aws-pyspark
│   │   ├── README.md
│   │   └── emr_bootstrap.sh
│   ├── cohort-analysis
│   │   ├── README.md
│   │   ├── cohort-analysis.ipynb
│   │   └── requirements.txt
│   ├── dask
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── dask-array.ipynb
│   │   ├── dask-big-dataset.ipynb
│   │   ├── dask-intro.ipynb
│   │   ├── dask-taxi.ipynb
│   │   ├── dask-worker-space
│   │   │   ├── global.lock
│   │   │   ├── purge.lock
│   │   │   ├── worker-3y9yh5wc.dirlock
│   │   │   ├── worker-5u5lbrxx.dirlock
│   │   │   ├── worker-82zb8rgu.dirlock
│   │   │   ├── worker-9wl7s6m3.dirlock
│   │   │   ├── worker-_n7kuuyd.dirlock
│   │   │   ├── worker-bbjm31ih.dirlock
│   │   │   ├── worker-fwxxmool.dirlock
│   │   │   ├── worker-l28a891y.dirlock
│   │   │   ├── worker-l8y7v2oj.dirlock
│   │   │   ├── worker-lckuq0ub.dirlock
│   │   │   ├── worker-ofkwc26n.dirlock
│   │   │   └── worker-wuu54xyo.dirlock
│   │   ├── mydask.png
│   │   └── requirements.txt
│   ├── data-driven-growth
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── know-your-metrics.ipynb
│   │   ├── requirements.txt
│   │   └── utils
│   │       ├── data.py
│   │       ├── plot.py
│   │       └── stats.py
│   ├── diff-in-diff
│   │   ├── Panel101.dta
│   │   ├── README.md
│   │   ├── did-min-wage.ipynb
│   │   ├── did-panel101.ipynb
│   │   ├── mini-wage.csv
│   │   ├── mini-wage.dat
│   │   ├── panel101.csv
│   │   └── requirements.txt
│   ├── dvc
│   │   ├── .gitignore
│   │   └── README.md
│   ├── hypo-testing
│   │   ├── README.md
│   │   ├── blood-pressure.csv
│   │   ├── chi-test.csv
│   │   ├── crop-yield.csv
│   │   ├── hypo-testing.ipynb
│   │   ├── plant-growth.csv
│   │   └── requirements.txt
│   ├── inside-airbnb
│   │   ├── .idea
│   │   │   ├── inside-airbnb.iml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   └── workspace.xml
│   │   ├── README.md
│   │   ├── add-columns.py
│   │   ├── data
│   │   │   ├── nyc-listings.csv
│   │   │   └── nyc-listings_new.csv
│   │   ├── get-one-photo.py
│   │   ├── get-photos.py
│   │   └── requirements.txt
│   ├── matplotlib
│   │   ├── README.md
│   │   ├── grouped-bar-plot-with-precentage-change-matplotlib.ipynb
│   │   └── requirements.txt
│   ├── multi-armed-bandit
│   │   ├── README.md
│   │   ├── mab.ipynb
│   │   └── requirements.txt
│   ├── pymongo
│   │   ├── README.md
│   │   ├── pymongo.ipynb
│   │   └── requirements.txt
│   ├── seaborn
│   │   ├── README.md
│   │   ├── pokemon.csv
│   │   ├── requirements.txt
│   │   └── seaborn_basics.ipynb
│   ├── spark-basics
│   │   ├── datacamp-notes.md
│   │   └── datacamp-spark.ipynb
│   ├── statsmodels-tutorial
│   │   ├── README.md
│   │   ├── lr-python.ipynb
│   │   ├── requirements.txt
│   │   ├── statsmodels.ipynb
│   │   └── statsmodels_getstarted.ipynb
│   ├── streamlit
│   │   ├── README.md
│   │   ├── airbnb.py
│   │   ├── listings.csv
│   │   └── requirements.txt
│   ├── superset
│   │   └── README.md
│   ├── time-series-additive-model
│   │   ├── README.md
│   │   ├── additive_models.ipynb
│   │   ├── data
│   │   │   ├── Workbook1.xlsx
│   │   │   ├── gm_sales.csv
│   │   │   ├── gm_sales.xlsx
│   │   │   ├── recessions.csv
│   │   │   ├── recessions.xlsx
│   │   │   └── tesla_search_terms.csv
│   │   └── requirements.txt
│   └── time-series-basics
│       ├── README.md
│       ├── data
│       │   └── opsd_germany_daily.csv
│       ├── requirements.txt
│       └── time_series_basics.ipynb
├── ml
│   ├── attention
│   │   ├── README.md
│   │   ├── attention_explained.ipynb
│   │   └── requirements.txt
│   ├── autogluon
│   │   ├── README.md
│   │   ├── agModels-predictClass
│   │   │   ├── learner.pkl
│   │   │   ├── models
│   │   │   │   ├── CatBoost
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── ExtraTreesEntr
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── ExtraTreesGini
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── KNeighborsDist
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── KNeighborsUnif
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── LightGBM
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── LightGBMLarge
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── LightGBMXT
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── NeuralNetFastAI
│   │   │   │   │   ├── model-internals.pkl
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── RandomForestEntr
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── RandomForestGini
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── WeightedEnsemble_L2
│   │   │   │   │   ├── model.pkl
│   │   │   │   │   └── utils
│   │   │   │   │       ├── model_template.pkl
│   │   │   │   │       └── oof.pkl
│   │   │   │   ├── XGBoost
│   │   │   │   │   └── model.pkl
│   │   │   │   └── trainer.pkl
│   │   │   ├── predictor.pkl
│   │   │   └── utils
│   │   │       └── data
│   │   │           ├── X.pkl
│   │   │           ├── X_val.pkl
│   │   │           ├── y.pkl
│   │   │           └── y_val.pkl
│   │   ├── autogluon.ipynb
│   │   ├── housing-prediction.ipynb
│   │   ├── input
│   │   │   ├── anscombe.csv
│   │   │   └── housing.csv
│   │   └── requirements.txt
│   ├── clearml-server
│   │   └── README.md
│   ├── clearml
│   │   ├── README.md
│   │   ├── matplotlib
│   │   │   ├── Allegro_Trains_matplotlib_example.ipynb
│   │   │   ├── matplotlib_example.py
│   │   │   ├── mlp_grouped_errorbar.py
│   │   │   └── requirements.txt
│   │   ├── pytorch
│   │   │   ├── manual_model_upload.py
│   │   │   ├── notebooks
│   │   │   │   ├── audio
│   │   │   │   │   ├── README.md
│   │   │   │   │   ├── audio_classifier_UrbanSound8K.ipynb
│   │   │   │   │   └── audio_preprocessing_example.ipynb
│   │   │   │   ├── image
│   │   │   │   │   ├── hyperparameter_search.ipynb
│   │   │   │   │   └── image_classification_CIFAR10.ipynb
│   │   │   │   ├── table
│   │   │   │   │   ├── download_and_preprocessing.ipynb
│   │   │   │   │   ├── download_and_split.ipynb
│   │   │   │   │   ├── pick_best_model.ipynb
│   │   │   │   │   ├── preprocessing_and_encoding.ipynb
│   │   │   │   │   ├── tabular_ml_pipeline.ipynb
│   │   │   │   │   └── train_tabular_predictor.ipynb
│   │   │   │   └── text
│   │   │   │       └── text_classification_AG_NEWS.ipynb
│   │   │   ├── pytorch_distributed_example.py
│   │   │   ├── pytorch_matplotlib.py
│   │   │   ├── pytorch_mnist.py
│   │   │   ├── pytorch_tensorboard.py
│   │   │   ├── pytorch_tensorboardx.py
│   │   │   ├── requirements.txt
│   │   │   └── tensorboard_toy_pytorch.py
│   │   ├── requirements.txt
│   │   ├── scikit-learn
│   │   │   ├── model-harry.pkl
│   │   │   ├── model.pkl
│   │   │   ├── requirements.txt
│   │   │   ├── sklearn_joblib_example.py
│   │   │   └── sklearn_matplotlib_example.py
│   │   ├── tensorflow
│   │   │   ├── legacy
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── tensorboard_pr_curve.py
│   │   │   │   ├── tensorboard_toy.py
│   │   │   │   ├── tensorflow_eager.py
│   │   │   │   └── tensorflow_mnist_with_summaries.py
│   │   │   ├── manual_model_upload.py
│   │   │   ├── requirements.txt
│   │   │   ├── tensorboard_pr_curve.py
│   │   │   ├── tensorboard_toy.py
│   │   │   └── tensorflow_mnist.py
│   │   ├── wandb
│   │   │   ├── latest-run
│   │   │   ├── pytorch_mnist_clearml.py
│   │   │   ├── pytorch_mnist_wandb.py
│   │   │   └── requirements.txt
│   │   └── xgboost
│   │       ├── requirements.txt
│   │       └── xgboost_sample.py
│   ├── clip-image-classification
│   │   └── clip-img-cls.ipynb
│   ├── document-clustering
│   │   ├── README.md
│   │   ├── data
│   │   │   ├── genres_list.txt
│   │   │   ├── synopses_list_imdb.txt
│   │   │   ├── synopses_list_wiki.txt
│   │   │   └── title_list.txt
│   │   ├── doc_clustering.ipynb
│   │   └── requirements.txt
│   ├── feature-importance
│   │   ├── README.md
│   │   ├── breast-cancer.csv
│   │   ├── feature-importance.ipynb
│   │   ├── feature-selection.ipynb
│   │   └── requirements.txt
│   ├── few-shot-learning
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── datasets
│   │   │   ├── mini_imagenet
│   │   │   │   └── dataloader_mini_imagenet.py
│   │   │   └── omniglot
│   │   │       └── dataloader_omniglot.py
│   │   ├── loader_omniglot.py
│   │   ├── mini_imagenet
│   │   │   ├── mini_proto_model.py
│   │   │   ├── mini_proto_test.py
│   │   │   ├── mini_proto_train.py
│   │   │   └── mini_protoloader.py
│   │   ├── model_omniglot.py
│   │   ├── notebooks
│   │   │   └── dataloader_notebook
│   │   │       ├── Omniglot.ipynb
│   │   │       ├── dataloader.ipynb
│   │   │       ├── images_background_small2.zip
│   │   │       └── loss_test.ipynb
│   │   ├── requirements.txt
│   │   ├── test_omniglot.py
│   │   ├── train_omniglot.py
│   │   └── util
│   │       ├── loss.py
│   │       └── tensor_op.py
│   ├── fine-tune-pegasus
│   │   ├── README.md
│   │   ├── pegasus_finetuning_xsum.ipynb
│   │   └── requirements.txt
│   ├── graph
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── data
│   │   │   ├── fb-pages-food.edges
│   │   │   ├── fb-pages-food.nodes
│   │   │   └── shakespeare.txt
│   │   ├── deepwalk.ipynb
│   │   ├── fb-page-link-prediction.ipynb
│   │   ├── metadata.tsv
│   │   ├── requirements.txt
│   │   ├── vectors.tsv
│   │   └── word2vec.ipynb
│   ├── greedy-layer-wise-pretraning
│   │   ├── README.md
│   │   ├── layer-wise-pretrain.ipynb
│   │   └── requirements.txt
│   ├── house-price-prediction
│   │   ├── README.md
│   │   ├── house_price_prediction.ipynb
│   │   ├── input
│   │   │   ├── anscombe.csv
│   │   │   └── housing.csv
│   │   └── requirements.txt
│   ├── imbalanced-multi-classification
│   │   ├── README.md
│   │   ├── glass.csv
│   │   ├── imbalanced-classification.ipynb
│   │   └── requirements.txt
│   ├── openml-csv-arff
│   │   ├── README.md
│   │   └── news-aggregator.ipynb
│   ├── process-mining
│   │   ├── README.md
│   │   ├── log-eda.ipynb
│   │   ├── log2csv.ipynb
│   │   ├── pm-log.csv
│   │   ├── requirements.txt
│   │   ├── sample.csv
│   │   └── sample.txt
│   ├── tf-serving
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── client.py
│   │   ├── client_curl.sh
│   │   ├── data
│   │   │   └── ImageNetLabels.txt
│   │   ├── images
│   │   │   ├── animal.jpg
│   │   │   ├── clear.jpg
│   │   │   └── ponds.png
│   │   ├── make_servables.py
│   │   └── requirements.txt
│   ├── tfidf-bm25
│   │   ├── README.md
│   │   ├── data
│   │   │   ├── cranfield_docs.json
│   │   │   ├── cranfield_queries.json
│   │   │   └── cranfield_relevance.json
│   │   ├── requirements.txt
│   │   └── tfidf-bm25.ipynb
│   ├── topic-modeling
│   │   ├── .gitignore
│   │   ├── LDA_news_headlines.ipynb
│   │   ├── README.md
│   │   ├── abcnews-small.csv
│   │   ├── lda_from_scratch.ipynb
│   │   └── requirements.txt
│   └── tweet-sentiment-analysis
│       ├── README.md
│       ├── requirements.txt
│       ├── trump-tweets.csv
│       ├── tutorial.md
│       └── tweet_sentiment_analysis.ipynb
└── other
    ├── chinese-to-pinyin
    │   ├── .gitignore
    │   ├── README.md
    │   ├── ch-to-pinyin.py
    │   ├── data
    │   │   ├── .DS_Store11
    │   │   ├── .DS_Store111
    │   │   ├── .DS_Store111111
    │   │   ├── .DS_Store1111111
    │   │   └── 白
    │   │       └── 0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg
    │   └── uni2pinyin
    ├── color-palette
    │   ├── color-palette.ipynb
    │   ├── test-palette.png
    │   └── test.png
    ├── csv-to-bert-text
    │   ├── README.md
    │   ├── csv-to-txt.ipynb
    │   ├── csv-to-txt.py
    │   ├── data
    │   │   ├── neg
    │   │   │   ├── 43.txt
    │   │   │   └── 44.txt
    │   │   ├── neu
    │   │   │   └── 29.txt
    │   │   └── pos
    │   │       ├── 1.txt
    │   │       ├── 2.txt
    │   │       ├── 20.txt
    │   │       ├── 3.txt
    │   │       ├── 4.txt
    │   │       ├── 40.txt
    │   │       ├── 41.txt
    │   │       ├── 42.txt
    │   │       ├── 46.txt
    │   │       ├── 47.txt
    │   │       ├── 5.txt
    │   │       ├── 50.txt
    │   │       └── 8.txt
    │   ├── requirements.txt
    │   └── sample.csv
    ├── list-like-to-list
    │   ├── .idea
    │   │   ├── misc.xml
    │   │   ├── modules.xml
    │   │   ├── movie-genre.iml
    │   │   └── workspace.xml
    │   ├── README.md
    │   ├── input.csv
    │   ├── movie.py
    │   ├── movie_genre.csv
    │   └── requirements.txt
    ├── list-of-dicts-to-columns
    │   ├── README.md
    │   ├── example.csv
    │   ├── list-to-columns.ipynb
    │   └── requirements.txt
    └── screenshot-gif-generation
        ├── .gitignore
        ├── README.md
        ├── gif-generation.ipynb
        ├── git-gen.py
        ├── requirements.txt
        └── screenshots
            └── screenshot-folder.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.DS_Store

*.npy
*.pkl

# vscode
.vscode

tmp/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Harry Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This repo contains a set of AI and Data Science tutorials in Python curated and revised by me. I modified most of the tutorials to add more instructions and to make sure they work well in configured virtual environments. Many thanks to the tutorial authors and other contributors. See the README in each tutorial folder for details.

I organize the tutorials into four folders:

- `ai` for AI tutorials
- `ds` for Data Science tutorials
- `ml` for machine learning/deep learning tutorials
- `other` for code on things like data processing, one-off tricks, etc.

## Setup

Each tutorial may have different version requirements for certain packages, so each tutorial uses a separate virtual environment.

For some tutorials, you may need to set API keys. You need to add a `.env` file and include the API keys as follows (see my blog post on [Manage Environment Variables in Python Projects](https://harrywang.me/env)):

```
OPENAI_API_KEY=sk-proj-xxxx
LANGCHAIN_API_KEY=ls__69650xxxx
REPLICATE_API_TOKEN=r8_W0V3rJxxx
```

To run each tutorial, you need to do the following at the root of this project - I use the `ml/document-clustering` tutorial as an example:

```
cd ml/document-clustering
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```

Then, you can use VSCode `code .` to open the notebooks.
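
The tutorials read these keys with `python-dotenv` rather than hardcoding them - a minimal sketch of the pattern used throughout the notebooks and scripts in this repo:

```
import os
from dotenv import load_dotenv

load_dotenv()  # load key=value pairs from .env into the process environment
openai_api_key = os.environ.get("OPENAI_API_KEY")
```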

--------------------------------------------------------------------------------
/ai/autogen/README.md:
--------------------------------------------------------------------------------
Start autogenstudio:

```
autogenstudio ui
```
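
If the default port is already taken, recent versions of the AutoGen Studio CLI accept a `--port` flag (check `autogenstudio ui --help` to confirm for your version):

```
autogenstudio ui --port 8081
```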

--------------------------------------------------------------------------------
/ai/autogen/docker-example.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[31m\n",
      ">>>>>>>> EXECUTING CODE BLOCK (inferred language is shell)...\u001b[0m\n",
      "exitcode: 0 (execution succeeded)\n",
      "Code output: \n",
      "\u001b[31m\n",
      ">>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...\u001b[0m\n",
      "exitcode: 0 (execution succeeded)\n",
      "Code output: Line plot saved to line.png\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from autogen import ConversableAgent\n",
    "from autogen.coding import DockerCommandLineCodeExecutor\n",
    "\n",
    "load_dotenv()  # take environment variables from .env.\n",
    "\n",
    "llm_config = {\"config_list\": [{\n",
    "    \"model\": \"gpt-4-turbo\",\n",
    "    \"cache\": None,\n",
    "    \"temperature\": 0.9,\n",
    "    \"api_key\": os.environ.get(\"OPENAI_API_KEY\")}]}\n",
    "\n",
    "\n",
    "# Create a temporary directory to store the code files.\n",
    "temp_dir = './tmp'\n",
    "\n",
    "docker_container_name = 'autogen'\n",
    "\n",
    "docker_executor = DockerCommandLineCodeExecutor(\n",
    "    image=\"python:3.12-slim\",  # Execute code using the given docker image name.\n",
    "    container_name=docker_container_name,  # Name of the Docker container.\n",
    "    timeout=180,  # Timeout for each code execution in seconds - 3 minutes\n",
    "    work_dir=temp_dir,  # Use the temporary directory to store the code files.\n",
    ")\n",
    "\n",
    "# Create an agent with code executor configuration that uses docker.\n",
    "code_executor_agent_using_docker = ConversableAgent(\n",
    "    \"code_executor_agent_docker\",\n",
    "    llm_config=False,  # Turn off LLM for this agent.\n",
    "    code_execution_config={\"executor\": docker_executor},  # Use the docker command line code executor.\n",
    "    human_input_mode=\"NEVER\",  # Change to ALWAYS to take human input for this agent for safety.\n",
    ")\n",
    "\n",
    "message_with_code_block = \"\"\"This is a message with code block.\n",
    "The code block is below:\n",
    "```shell\n",
    "pip install matplotlib numpy\n",
    "```\n",
    "This is the end of the message.\n",
    "\"\"\"\n",
    "\n",
    "reply = code_executor_agent_using_docker.generate_reply(messages=[{\"role\": \"user\", \"content\": message_with_code_block}])\n",
    "print(reply)\n",
    "\n",
    "\n",
    "message_with_code_block = \"\"\"This is a message with code block.\n",
    "The code block is below:\n",
    "```python\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "x = range(100)\n",
    "y = np.random.randint(0, 100, 100)\n",
    "plt.plot(x, y)\n",
    "plt.savefig('line.png')\n",
    "print('Line plot saved to line.png')\n",
    "```\n",
    "This is the end of the message.\n",
    "\"\"\"\n",
    "\n",
    "reply = code_executor_agent_using_docker.generate_reply(messages=[{\"role\": \"user\", \"content\": message_with_code_block}])\n",
    "print(reply)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

--------------------------------------------------------------------------------
/ai/autogen/requirements.txt:
--------------------------------------------------------------------------------
pyautogen
python-dotenv
matplotlib
numpy
yfinance
autogenstudio

--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/requirements.txt:
--------------------------------------------------------------------------------
langchain
openai
python-dotenv
pypdf
yt_dlp
pydub
bs4
tiktoken
langchain_openai
chromadb
PyPDF2
lark
scikit-learn
panel
docarray

--------------------------------------------------------------------------------
/ai/litellm/README.md:
--------------------------------------------------------------------------------
## Setup

1. Create and activate a virtual environment:
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
```

2. Install the required packages:
```bash
pip install -r requirements.txt
```

3. Create a `.env` file with your API key:
```bash
echo 'API_KEY="sk-Vi-wwJMyM8vJX"' > .env
```

4. Run the Python script:
```bash
python deepseek_example.py
```
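
The script talks to a LiteLLM proxy, so the same call works for any model the proxy exposes. A minimal sanity check, reusing the model name, proxy URL, and `API_KEY` variable that `deepseek_example.py` hardcodes:

```python
import os
from dotenv import load_dotenv
from litellm import completion

load_dotenv()
response = completion(
    model="litellm_proxy/ollama/deepseek-r1:7b",
    messages=[{"role": "user", "content": "Say hello."}],
    api_key=os.environ["API_KEY"],
    api_base="https://litellmud.takin.ai",
)
print(response.choices[0].message.content)
```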

--------------------------------------------------------------------------------
/ai/litellm/deepseek_example.py:
--------------------------------------------------------------------------------
import os
import sys
from dotenv import load_dotenv
from litellm import completion

# Load environment variables from .env file
load_dotenv()

# Configure the API endpoint and key
api_key = os.environ.get("API_KEY")
api_base = "https://litellmud.takin.ai"  # Hardcoded API base URL
model = "litellm_proxy/ollama/deepseek-r1:7b"  # Hardcoded model name

# Verify API key is set
if not api_key:
    print("Error: API_KEY must be set in .env file")
    sys.exit(1)


def generate_response(prompt, stream=False):
    try:
        # Set up the request parameters
        params = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "api_key": api_key,
            "api_base": api_base,
            "stream": stream,
            # Optional parameters
            "temperature": 0.6,
            "max_tokens": 500,
        }

        # Send the request
        if stream:
            # Return the streaming response generator
            return completion(**params)
        else:
            # Get the complete response
            response = completion(**params)
            return response.choices[0].message.content

    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {str(e)}"


def stream_response(prompt):
    """
    Stream a response from the model and print it to the console.

    Args:
        prompt (str): The user prompt to send to the model
    """
    try:
        print(f"\nPrompt: {prompt}")
        print("\nResponse: ", end="")

        # Get the streaming response
        response_stream = generate_response(prompt, stream=True)

        # Process and print each chunk
        for chunk in response_stream:
            content = chunk.choices[0].delta.content
            if content:
                sys.stdout.write(content)
                sys.stdout.flush()

        print("\n")
    except Exception as e:
        print(f"\nError streaming response: {e}")


def main():
    prompt1 = "write a short poem about artificial intelligence"
    print(f"Prompt: {prompt1}")
    response1 = generate_response(prompt1)
    print(f"Response: {response1}")


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/ai/litellm/requirements.txt:
--------------------------------------------------------------------------------
litellm>=1.62.1
python-dotenv>=1.0.0

--------------------------------------------------------------------------------
/ds/ab-testing/README.md:
--------------------------------------------------------------------------------
## About

This is my revised code of the tutorial at https://towardsdatascience.com/the-math-behind-a-b-testing-with-example-code-part-1-of-2-7be752e1d06f.

## Setup

Do the following at the root of this project:

```
cd ds/ab-testing
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
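
After setup, you can sanity-check the helper modules in `utils/` from a Python session started in this folder - a minimal sketch using the signatures defined in `utils/data.py` and `utils/stats.py`:

```
from utils.data import generate_data
from utils.stats import min_sample_size

# minimum sample size per group to detect a 2-point lift over a 10% baseline
print(min_sample_size(bcr=0.10, mde=0.02))

# simulate fake CTR data and compare conversion rates by group
df = generate_data(N_A=1000, N_B=1000, p_A=0.10, p_B=0.12)
print(df.groupby('group')['converted'].mean())
```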

--------------------------------------------------------------------------------
/ds/ab-testing/requirements.txt:
--------------------------------------------------------------------------------
jupyter
matplotlib==3.1.2
pandas
scipy

--------------------------------------------------------------------------------
/ds/ab-testing/utils/data.py:
--------------------------------------------------------------------------------
import scipy.stats as scs
import pandas as pd
# import numpy as np


def generate_data(N_A, N_B, p_A, p_B, days=None, control_label='A',
                  test_label='B'):
    """Returns a pandas dataframe with fake CTR data

    Example:

    Parameters:
        N_A (int): sample size for control group
        N_B (int): sample size for test group
            Note: final sample size may not match N_A provided because the
            group at each row is chosen at random (50/50).
        p_A (float): conversion rate; conversion rate of control group
        p_B (float): conversion rate; conversion rate of test group
        days (int): optional; if provided, a column for 'ts' will be included
            to divide the data in chunks of time
            Note: overflow data will be included in an extra day
        control_label (str)
        test_label (str)

    Returns:
        df (df)
    """

    # initiate empty container
    data = []

    # total amount of rows in the data
    N = N_A + N_B

    # distribute events based on proportion of group size
    group_bern = scs.bernoulli(N_A / (N_A + N_B))

    # initiate bernoulli distributions from which to randomly sample
    A_bern = scs.bernoulli(p_A)
    B_bern = scs.bernoulli(p_B)

    for idx in range(N):
        # initiate empty row
        row = {}
        # for 'ts' column
        if days is not None:
            if type(days) == int:
                row['ts'] = idx // (N // days)
            else:
                raise ValueError("Provide an integer for the days parameter.")
        # assign group based on 50/50 probability
        row['group'] = group_bern.rvs()

        if row['group'] == 0:
            # assign conversion based on provided parameters
            row['converted'] = A_bern.rvs()
        else:
            row['converted'] = B_bern.rvs()
        # collect row into data container
        data.append(row)

    # convert data into pandas dataframe
    df = pd.DataFrame(data)

    # transform group labels of 0s and 1s to user-defined group labels
    df['group'] = df['group'].apply(
        lambda x: control_label if x == 0 else test_label)

    return df
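
# Usage sketch: build a small fake CTR dataset when the module is run directly
# (e.g. `python utils/data.py` from the ds/ab-testing folder).
if __name__ == '__main__':
    df = generate_data(N_A=500, N_B=500, p_A=0.10, p_B=0.12, days=7)
    print(df.head())
    print(df.groupby('group')['converted'].mean())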

--------------------------------------------------------------------------------
/ds/ab-testing/utils/stats.py:
--------------------------------------------------------------------------------
import numpy as np
import scipy.stats as scs


def pooled_prob(N_A, N_B, X_A, X_B):
    """Returns pooled probability for two samples"""
    return (X_A + X_B) / (N_A + N_B)


def pooled_SE(N_A, N_B, X_A, X_B):
    """Returns the pooled standard error for two samples"""
    p_hat = pooled_prob(N_A, N_B, X_A, X_B)
    SE = np.sqrt(p_hat * (1 - p_hat) * (1 / N_A + 1 / N_B))
    return SE


def confidence_interval(sample_mean=0, sample_std=1, sample_size=1,
                        sig_level=0.05):
    """Returns the confidence interval as a tuple"""
    z = z_val(sig_level)

    left = sample_mean - z * sample_std / np.sqrt(sample_size)
    right = sample_mean + z * sample_std / np.sqrt(sample_size)

    return (left, right)


def z_val(sig_level=0.05, two_tailed=True):
    """Returns the z value for a given significance level"""
    z_dist = scs.norm()
    if two_tailed:
        sig_level = sig_level / 2
        area = 1 - sig_level
    else:
        area = 1 - sig_level

    z = z_dist.ppf(area)

    return z


def ab_dist(stderr, d_hat=0, group_type='control'):
    """Returns a distribution object depending on group type

    Examples:

    Parameters:
        stderr (float): pooled standard error of two independent samples
        d_hat (float): the mean difference between two independent samples
        group_type (string): 'control' and 'test' are supported

    Returns:
        dist (scipy.stats distribution object)
    """
    if group_type == 'control':
        sample_mean = 0

    elif group_type == 'test':
        sample_mean = d_hat

    # create a normal distribution which is dependent on mean and std dev
    dist = scs.norm(sample_mean, stderr)
    return dist


def min_sample_size(bcr, mde, power=0.8, sig_level=0.05):
    """Returns the minimum sample size to set up a split test

    Arguments:
        bcr (float): probability of success for control, sometimes
            referred to as baseline conversion rate

        mde (float): minimum change in measurement between control
            group and test group if alternative hypothesis is true, sometimes
            referred to as minimum detectable effect

        power (float): probability of rejecting the null hypothesis when the
            null hypothesis is false, typically 0.8

        sig_level (float): significance level often denoted as alpha,
            typically 0.05

    Returns:
        min_N: minimum sample size (float)

    References:
        Stanford lecture on sample sizes
        http://statweb.stanford.edu/~susan/courses/s141/hopower.pdf
    """
    # standard normal distribution to determine z-values
    standard_norm = scs.norm(0, 1)

    # find Z_beta from desired power
    Z_beta = standard_norm.ppf(power)

    # find Z_alpha
    Z_alpha = standard_norm.ppf(1 - sig_level / 2)

    # average of probabilities from both groups
    pooled_prob = (bcr + bcr + mde) / 2

    min_N = (2 * pooled_prob * (1 - pooled_prob) * (Z_beta + Z_alpha)**2
             / mde**2)

    return min_N


def p_val(N_A, N_B, p_A, p_B):
    """Returns the p-value for an A/B test"""
    return scs.binom(N_A, p_A).pmf(p_B * N_B)

--------------------------------------------------------------------------------
/ds/airflow/README.md:
--------------------------------------------------------------------------------
## About

This is my revised code of the tutorial at: https://medium.com/abn-amro-developer/data-pipeline-orchestration-on-steroids-apache-airflow-tutorial-part-1-87361905db6d

## Setup

I assume you have Airflow (2.1.0) running locally on your Mac by running the commands at https://airflow.apache.org/docs/apache-airflow/stable/start/local.html

```
export AIRFLOW_HOME=~/airflow
pip install apache-airflow
airflow db init
airflow users create \
    --username admin \
    --firstname Harry \
    --lastname Wang \
    --role Admin \
    --email harryjwang@gmail.com

# start the web server, default port is 8080
airflow webserver --port 8080

# start the scheduler
# open a new terminal or else run webserver with ``-D`` option to run it as a daemon
airflow scheduler
```

Now, you can access airflow at http://localhost:8080

Open `/Users/harrywang/airflow/airflow.cfg` to find the path that holds your DAG Python files: `dags_folder = /Users/harrywang/airflow/dags` - you may need to create this folder.

`simple_bash_dag.py` is a simple DAG that creates an empty txt file and then renames it - two tasks in a sequence.

Copy the DAG files to the dags folder, then run `airflow scheduler` again (Airflow does not auto-refresh - you have to run this command manually to see the newly added DAG file). You should then be able to see the DAG via the UI.

You can trigger the DAG as follows:

Screen Shot 2021-06-01 at 11 37 32 AM

--------------------------------------------------------------------------------
/ds/airflow/requirements.txt:
--------------------------------------------------------------------------------
jupyter

--------------------------------------------------------------------------------
/ds/airflow/simple_bash_dag.py:
--------------------------------------------------------------------------------
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator


default_args = {
    'owner': 'Harry Wang',
    'depends_on_past': False,
    'start_date': datetime(2021, 6, 1),
    'email': ['harryjwang@gmail.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    # In case of errors, do one retry
    'retries': 1,
    # Do the retry with 30 seconds delay after the error
    'retry_delay': timedelta(seconds=30),
    # Run once every 15 minutes (note: Airflow ignores schedule_interval
    # inside default_args - the schedule is set on the DAG itself below)
    'schedule_interval': '*/15 * * * *'
}

with DAG(
    dag_id='simple_bash_dag',
    default_args=default_args,
    schedule_interval=None,
    tags=['my_dags'],
) as dag:
    t1 = BashOperator(
        bash_command="touch ~/my_bash_file.txt",
        task_id="create_file"
    )
    t2 = BashOperator(
        bash_command="mv ~/my_bash_file.txt ~/my_bash_file_changed.txt",
        task_id="change_file_name"
    )
    t1 >> t2  # t2 depends on t1

--------------------------------------------------------------------------------
/ds/aws-pyspark/README.md:
--------------------------------------------------------------------------------
Code for the tutorial at https://towardsdatascience.com/getting-started-with-pyspark-on-amazon-emr-c85154b6b921

--------------------------------------------------------------------------------
/ds/aws-pyspark/emr_bootstrap.sh:
--------------------------------------------------------------------------------
#!/bin/bash
sudo pip install -U matplotlib pandas

--------------------------------------------------------------------------------
/ds/cohort-analysis/README.md:
--------------------------------------------------------------------------------
## About

Revised version of https://www.kaggle.com/mahmoudelfahl/cohort-analysis-customer-segmentation-with-rfm

## Setup

Go to the tutorial folder and do the following:

```
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.

--------------------------------------------------------------------------------
/ds/cohort-analysis/requirements.txt:
--------------------------------------------------------------------------------
jupyter
matplotlib
numpy
seaborn

--------------------------------------------------------------------------------
/ds/dask/.gitignore:
--------------------------------------------------------------------------------
data/

--------------------------------------------------------------------------------
/ds/dask/README.md:
--------------------------------------------------------------------------------
## Dask tutorial

This is my code of the tutorial at https://docs.dask.org/en/stable/10-minutes-to-dask.html

## Setup

```
brew install graphviz
```

within the tutorial folder:

```
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks.
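
To verify the environment works before starting the notebooks, a minimal sketch from the linked "10 minutes to Dask" material - build a lazy chunked array, then call `.compute()`:

```
import dask.array as da

x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))
print(x.mean().compute())  # the mean is only computed here
```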

--------------------------------------------------------------------------------
/ds/dask/requirements.txt:
--------------------------------------------------------------------------------
jupyterlab
dask[distributed]
pandas
bokeh>=2.1.1
graphviz
s3fs

--------------------------------------------------------------------------------
/ds/data-driven-growth/.gitignore:
--------------------------------------------------------------------------------
/data/*.csv

--------------------------------------------------------------------------------
/ds/data-driven-growth/README.md:
--------------------------------------------------------------------------------
## Data-driven growth with Python

This is my revised code of the tutorial at https://towardsdatascience.com/data-driven-growth-with-python-part-1-know-your-metrics-812781e66a5b.

## Setup

Go to this tutorial folder (`ds/data-driven-growth`) and do the following:

```
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.

--------------------------------------------------------------------------------
/ds/data-driven-growth/requirements.txt:
--------------------------------------------------------------------------------
jupyter
matplotlib
pandas
scipy
seaborn

--------------------------------------------------------------------------------
/ds/data-driven-growth/utils/data.py:
--------------------------------------------------------------------------------
import scipy.stats as scs
import pandas as pd
# import numpy as np


def generate_data(N_A, N_B, p_A, p_B, days=None, control_label='A',
                  test_label='B'):
    """Returns a pandas dataframe with fake CTR data

    Example:

    Parameters:
        N_A (int): sample size for control group
        N_B (int): sample size for test group
            Note: final sample size may not match N_A provided because the
            group at each row is chosen at random (50/50).
        p_A (float): conversion rate; conversion rate of control group
        p_B (float): conversion rate; conversion rate of test group
        days (int): optional; if provided, a column for 'ts' will be included
            to divide the data in chunks of time
            Note: overflow data will be included in an extra day
        control_label (str)
        test_label (str)

    Returns:
        df (df)
    """

    # initiate empty container
    data = []

    # total amount of rows in the data
    N = N_A + N_B

    # distribute events based on proportion of group size
    group_bern = scs.bernoulli(N_A / (N_A + N_B))

    # initiate bernoulli distributions from which to randomly sample
    A_bern = scs.bernoulli(p_A)
    B_bern = scs.bernoulli(p_B)

    for idx in range(N):
        # initiate empty row
        row = {}
        # for 'ts' column
        if days is not None:
            if type(days) == int:
                row['ts'] = idx // (N // days)
            else:
                raise ValueError("Provide an integer for the days parameter.")
        # assign group based on 50/50 probability
        row['group'] = group_bern.rvs()

        if row['group'] == 0:
            # assign conversion based on provided parameters
            row['converted'] = A_bern.rvs()
        else:
            row['converted'] = B_bern.rvs()
        # collect row into data container
        data.append(row)

    # convert data into pandas dataframe
    df = pd.DataFrame(data)

    # transform group labels of 0s and 1s to user-defined group labels
    df['group'] = df['group'].apply(
        lambda x: control_label if x == 0 else test_label)

    return df

--------------------------------------------------------------------------------
/ds/data-driven-growth/utils/stats.py:
--------------------------------------------------------------------------------
import numpy as np
import scipy.stats as scs


def pooled_prob(N_A, N_B, X_A, X_B):
    """Returns pooled probability for two samples"""
    return (X_A + X_B) / (N_A + N_B)


def pooled_SE(N_A, N_B, X_A, X_B):
    """Returns the pooled standard error for two samples"""
    p_hat = pooled_prob(N_A, N_B, X_A, X_B)
    SE = np.sqrt(p_hat * (1 - p_hat) * (1 / N_A + 1 / N_B))
    return SE


def confidence_interval(sample_mean=0, sample_std=1, sample_size=1,
                        sig_level=0.05):
    """Returns the confidence interval as a tuple"""
    z = z_val(sig_level)

    left = sample_mean - z * sample_std / np.sqrt(sample_size)
    right = sample_mean + z * sample_std / np.sqrt(sample_size)

    return (left, right)


def z_val(sig_level=0.05, two_tailed=True):
    """Returns the z value for a given significance level"""
    z_dist = scs.norm()
    if two_tailed:
        sig_level = sig_level / 2
        area = 1 - sig_level
    else:
        area = 1 - sig_level

    z = z_dist.ppf(area)

    return z


def ab_dist(stderr, d_hat=0, group_type='control'):
    """Returns a distribution object depending on group type

    Examples:

    Parameters:
        stderr (float): pooled standard error of two independent samples
        d_hat (float): the mean difference between two independent samples
        group_type (string): 'control' and 'test' are supported

    Returns:
        dist (scipy.stats distribution object)
    """
    if group_type == 'control':
        sample_mean = 0

    elif group_type == 'test':
        sample_mean = d_hat

    # create a normal distribution which is dependent on mean and std dev
    dist = scs.norm(sample_mean, stderr)
    return dist


def min_sample_size(bcr, mde, power=0.8, sig_level=0.05):
    """Returns the minimum sample size to set up a split test

    Arguments:
        bcr (float): probability of success for control, sometimes
            referred to as baseline conversion rate

        mde (float): minimum change in measurement between control
            group and test group if alternative hypothesis is true, sometimes
            referred to as minimum detectable effect

        power (float): probability of rejecting the null hypothesis when the
            null hypothesis is false, typically 0.8

        sig_level (float): significance level often denoted as alpha,
            typically 0.05

    Returns:
        min_N: minimum sample size (float)

    References:
        Stanford lecture on sample sizes
        http://statweb.stanford.edu/~susan/courses/s141/hopower.pdf
    """
    # standard normal distribution to determine z-values
    standard_norm = scs.norm(0, 1)

    # find Z_beta from desired power
    Z_beta = standard_norm.ppf(power)

    # find Z_alpha
    Z_alpha = standard_norm.ppf(1 - sig_level / 2)

    # average of probabilities from both groups
    pooled_prob = (bcr + bcr + mde) / 2

    min_N = (2 * pooled_prob * (1 - pooled_prob) * (Z_beta + Z_alpha)**2
             / mde**2)

    return min_N


def p_val(N_A, N_B, p_A, p_B):
    """Returns the p-value for an A/B test"""
    return scs.binom(N_A, p_A).pmf(p_B * N_B)
/ds/diff-in-diff/README.md: -------------------------------------------------------------------------------- 1 | ## Python diff-in-diff tutorial 2 | 3 | This is my code for the tutorial at https://medium.com/@sadhaverajasekar/diff-in-diff-testing-python-f24835330bc8 4 | 5 | ## Setup 6 | 7 | Within the tutorial folder: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks. 15 | 16 | -------------------------------------------------------------------------------- /ds/diff-in-diff/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | scikit-learn 3 | statsmodels 4 | -------------------------------------------------------------------------------- /ds/dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /data -------------------------------------------------------------------------------- /ds/dvc/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This tutorial needs a separate repo, so visit https://github.com/harrywang/dvc for the code for the tutorial at 4 | https://realpython.com/python-data-version-control/ -------------------------------------------------------------------------------- /ds/hypo-testing/README.md: -------------------------------------------------------------------------------- 1 | ## Python hypothesis testing tutorial 2 | 3 | This is my code for the tutorial at https://towardsdatascience.com/hypothesis-testing-in-machine-learning-using-python-a0dc89e169ce 4 | 5 | ## Setup 6 | 7 | Within the tutorial folder: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks.
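As a concrete example, the `blood-pressure.csv` file below records each patient's blood pressure before and after treatment, which is exactly the shape a paired t-test expects. A minimal sketch (SciPy comes in as a statsmodels dependency; the CSV is assumed to sit next to the notebook):

```python
# Minimal paired t-test sketch on blood-pressure.csv (same folder assumed).
import pandas as pd
from scipy import stats

df = pd.read_csv('blood-pressure.csv')
t_stat, p_value = stats.ttest_rel(df['bp_before'], df['bp_after'])
print(f"t = {t_stat:.3f}, p = {p_value:.4f}")  # p < 0.05 suggests a real change
```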
15 | 16 | -------------------------------------------------------------------------------- /ds/hypo-testing/blood-pressure.csv: -------------------------------------------------------------------------------- 1 | patient,sex,agegrp,bp_before,bp_after 2 | 1,Male,30-45,143,153 3 | 2,Male,30-45,163,170 4 | 3,Male,30-45,153,168 5 | 4,Male,30-45,153,142 6 | 5,Male,30-45,146,141 7 | 6,Male,30-45,150,147 8 | 7,Male,30-45,148,133 9 | 8,Male,30-45,153,141 10 | 9,Male,30-45,153,131 11 | 10,Male,30-45,158,125 12 | 11,Male,30-45,149,164 13 | 12,Male,30-45,173,159 14 | 13,Male,30-45,165,135 15 | 14,Male,30-45,145,159 16 | 15,Male,30-45,143,153 17 | 16,Male,30-45,152,126 18 | 17,Male,30-45,141,162 19 | 18,Male,30-45,176,134 20 | 19,Male,30-45,143,136 21 | 20,Male,30-45,162,150 22 | 21,Male,46-59,149,168 23 | 22,Male,46-59,156,155 24 | 23,Male,46-59,151,136 25 | 24,Male,46-59,159,132 26 | 25,Male,46-59,164,160 27 | 26,Male,46-59,154,160 28 | 27,Male,46-59,152,136 29 | 28,Male,46-59,142,183 30 | 29,Male,46-59,162,152 31 | 30,Male,46-59,155,162 32 | 31,Male,46-59,175,151 33 | 32,Male,46-59,184,139 34 | 33,Male,46-59,167,175 35 | 34,Male,46-59,148,184 36 | 35,Male,46-59,170,151 37 | 36,Male,46-59,159,171 38 | 37,Male,46-59,149,157 39 | 38,Male,46-59,140,159 40 | 39,Male,46-59,185,140 41 | 40,Male,46-59,160,174 42 | 41,Male,60+,157,167 43 | 42,Male,60+,158,158 44 | 43,Male,60+,162,168 45 | 44,Male,60+,160,159 46 | 45,Male,60+,180,153 47 | 46,Male,60+,155,164 48 | 47,Male,60+,172,169 49 | 48,Male,60+,157,148 50 | 49,Male,60+,171,185 51 | 50,Male,60+,170,163 52 | 51,Male,60+,175,146 53 | 52,Male,60+,175,160 54 | 53,Male,60+,172,175 55 | 54,Male,60+,173,163 56 | 55,Male,60+,170,185 57 | 56,Male,60+,164,146 58 | 57,Male,60+,147,176 59 | 58,Male,60+,154,147 60 | 59,Male,60+,172,161 61 | 60,Male,60+,162,164 62 | 61,Female,30-45,152,149 63 | 62,Female,30-45,147,142 64 | 63,Female,30-45,144,146 65 | 64,Female,30-45,144,138 66 | 65,Female,30-45,158,131 67 | 66,Female,30-45,147,145 68 | 67,Female,30-45,154,134 69 | 68,Female,30-45,151,135 70 | 69,Female,30-45,149,131 71 | 70,Female,30-45,138,135 72 | 71,Female,30-45,162,133 73 | 72,Female,30-45,157,135 74 | 73,Female,30-45,141,168 75 | 74,Female,30-45,167,144 76 | 75,Female,30-45,147,147 77 | 76,Female,30-45,143,151 78 | 77,Female,30-45,142,149 79 | 78,Female,30-45,166,147 80 | 79,Female,30-45,147,149 81 | 80,Female,30-45,142,135 82 | 81,Female,46-59,157,127 83 | 82,Female,46-59,170,150 84 | 83,Female,46-59,150,138 85 | 84,Female,46-59,150,147 86 | 85,Female,46-59,167,157 87 | 86,Female,46-59,154,146 88 | 87,Female,46-59,143,148 89 | 88,Female,46-59,157,136 90 | 89,Female,46-59,149,146 91 | 90,Female,46-59,161,132 92 | 91,Female,46-59,142,145 93 | 92,Female,46-59,162,132 94 | 93,Female,46-59,144,157 95 | 94,Female,46-59,142,140 96 | 95,Female,46-59,159,137 97 | 96,Female,46-59,140,154 98 | 97,Female,46-59,144,169 99 | 98,Female,46-59,142,145 100 | 99,Female,46-59,145,137 101 | 100,Female,46-59,145,143 102 | 101,Female,60+,168,178 103 | 102,Female,60+,142,141 104 | 103,Female,60+,147,149 105 | 104,Female,60+,148,148 106 | 105,Female,60+,162,138 107 | 106,Female,60+,170,143 108 | 107,Female,60+,173,167 109 | 108,Female,60+,151,158 110 | 109,Female,60+,155,152 111 | 110,Female,60+,163,154 112 | 111,Female,60+,183,161 113 | 112,Female,60+,159,143 114 | 113,Female,60+,148,159 115 | 114,Female,60+,151,177 116 | 115,Female,60+,165,142 117 | 116,Female,60+,152,152 118 | 117,Female,60+,161,152 119 | 118,Female,60+,165,174 120 | 119,Female,60+,149,151 121 | 
120,Female,60+,185,163 122 | -------------------------------------------------------------------------------- /ds/hypo-testing/chi-test.csv: -------------------------------------------------------------------------------- 1 | Gender,Shopping 2 | Male,No 3 | Female,Yes 4 | Male,Yes 5 | Female,Yes 6 | Female,Yes 7 | Male,Yes 8 | Male,No 9 | Female,No 10 | Female,No 11 | -------------------------------------------------------------------------------- /ds/hypo-testing/crop-yield.csv: -------------------------------------------------------------------------------- 1 | Fert,Water,Yield 2 | A,High,27.4 3 | A,High,33.6 4 | A,High,29.8 5 | A,High,35.2 6 | A,High,33 7 | B,High,34.8 8 | B,High,27 9 | B,High,30.2 10 | B,High,30.8 11 | B,High,26.4 12 | A,Low,32 13 | A,Low,32.2 14 | A,Low,26 15 | A,Low,33.4 16 | A,Low,26.4 17 | B,Low,26.8 18 | B,Low,23.2 19 | B,Low,29.4 20 | B,Low,19.4 21 | B,Low,23.8 -------------------------------------------------------------------------------- /ds/hypo-testing/plant-growth.csv: -------------------------------------------------------------------------------- 1 | "","weight","group" 2 | "1",4.17,"ctrl" 3 | "2",5.58,"ctrl" 4 | "3",5.18,"ctrl" 5 | "4",6.11,"ctrl" 6 | "5",4.5,"ctrl" 7 | "6",4.61,"ctrl" 8 | "7",5.17,"ctrl" 9 | "8",4.53,"ctrl" 10 | "9",5.33,"ctrl" 11 | "10",5.14,"ctrl" 12 | "11",4.81,"trt1" 13 | "12",4.17,"trt1" 14 | "13",4.41,"trt1" 15 | "14",3.59,"trt1" 16 | "15",5.87,"trt1" 17 | "16",3.83,"trt1" 18 | "17",6.03,"trt1" 19 | "18",4.89,"trt1" 20 | "19",4.32,"trt1" 21 | "20",4.69,"trt1" 22 | "21",6.31,"trt2" 23 | "22",5.12,"trt2" 24 | "23",5.54,"trt2" 25 | "24",5.5,"trt2" 26 | "25",5.37,"trt2" 27 | "26",5.29,"trt2" 28 | "27",4.92,"trt2" 29 | "28",6.15,"trt2" 30 | "29",5.8,"trt2" 31 | "30",5.26,"trt2" 32 | -------------------------------------------------------------------------------- /ds/hypo-testing/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | statsmodels 3 | -------------------------------------------------------------------------------- /ds/inside-airbnb/.idea/inside-airbnb.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /ds/inside-airbnb/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /ds/inside-airbnb/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /ds/inside-airbnb/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | Code to download the images by using the image URLs provided in the NYC data from 4 | http://insideairbnb.com/get-the-data.html and save those images into local folders. 5 | 6 | # Setup and Run 7 | 8 | Python 3 9 | 10 | - create virtual environment: `$virtualenv -p python3 venv` 11 | - activate virtual env: `$source venv/bin/activate` 12 | - install required packages: `pip3 install -r requirements.txt` 13 | 14 | To run (NOTE: for the NYC dataset, it took about 4.5 hours to download the ~ 45,000 images): 15 | 16 | 1. copy the real listing csv file to /data/ and comment out the `# listings = pd.read_csv('data/listings.csv')` 17 | 2. 
`python3 get-photos.py` will download and save the images to the /data/images/ folder 18 | 19 | 20 | 21 | Use Katalon Recorder (Selenium IDE for Chrome) to help test the css selector: 22 | 23 | Install at: https://chrome.google.com/webstore/detail/katalon-recorder-selenium/ 24 | 25 | Then enter the css selector as below and search to see whether you can find the element you need: 26 | 27 | screen shot 2018-02-14 at 1 32 32 pm 28 | 29 | Example: open https://www.airbnb.com/rooms/18461891 in Chrome and search css=button[data-veloute='hero-view-photos-button'], and the View Photo button should be highlighted -------------------------------------------------------------------------------- /ds/inside-airbnb/add-columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | listings = pd.read_csv('data/nyc-listings.csv') # testing data 4 | # listings = pd.read_csv('data/listings.csv') 5 | listings.info() 6 | 7 | listings["total_photos"] = 0 8 | 9 | # the original list used string not boolean 10 | listings["photo_downloaded"] = "f" 11 | listings["host_photo_downloaded"] = "f" 12 | 13 | listings.to_csv('data/nyc-listings_new.csv', encoding='utf-8', index=False) 14 | 15 | print("csv file processed") -------------------------------------------------------------------------------- /ds/inside-airbnb/get-one-photo.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from urllib.error import HTTPError 3 | from urllib.request import urlretrieve 4 | 5 | listings = pd.read_csv('data/nyc-listings.csv') # testing data 6 | # listings = pd.read_csv('data/listings.csv') 7 | listings.info() 8 | count = 0 # count of successful downloads 9 | err_count = 0 # count of errors 10 | for index, row in listings.iterrows(): 11 | # print(row["id"], row["xl_picture_url"]) 12 | # prefer the XL picture URL when it exists 13 | if pd.isnull(row["xl_picture_url"]): 14 | url = row["picture_url"] 15 | else: 16 | url = row["xl_picture_url"] 17 | 18 | try: 19 | urlretrieve(url, "./data/images/" + str(row["id"]) + ".jpg") 20 | count += 1 21 | print("downloading " + str(row["id"])) 22 | except FileNotFoundError as err: 23 | err_count += 1 24 | print(err) # something wrong with the local path 25 | except HTTPError as err: 26 | err_count += 1 27 | print(err) # something wrong with the url 28 | except Exception as err: 29 | err_count += 1 30 | print(err) # anything else: log it and skip this row 31 | 32 | 33 | print("downloading complete with " + str(count) + " images and " + str(err_count) + " errors.") -------------------------------------------------------------------------------- /ds/inside-airbnb/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | selenium -------------------------------------------------------------------------------- /ds/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | Grouped bar chart with percentage change using matplotlib. Excel version is at https://www.excelcampus.com/charts/column-chart-percentage-change/ 4 | 5 | ## Setup 6 | 7 | Go to the tutorial folder and do the following: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
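The core trick in the notebook is plotting two bar series at offset x positions; a minimal sketch with toy numbers (not the notebook's data):

```python
# Toy grouped bar chart: two series side by side via offset x positions.
import numpy as np
import matplotlib.pyplot as plt

labels = ['Q1', 'Q2', 'Q3']
last_year, this_year = [25, 32, 34], [30, 35, 27]
x = np.arange(len(labels))
width = 0.35  # offsets of +/- width/2 keep each pair centered on its tick

fig, ax = plt.subplots()
ax.bar(x - width / 2, last_year, width, label='Last year')
ax.bar(x + width / 2, this_year, width, label='This year')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
plt.show()
```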
15 | 16 | -------------------------------------------------------------------------------- /ds/matplotlib/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter>=1.0.0 2 | matplotlib 3 | numpy 4 | seaborn -------------------------------------------------------------------------------- /ds/multi-armed-bandit/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | My revised code based on: 4 | 5 | https://towardsdatascience.com/solving-multiarmed-bandits-a-comparison-of-epsilon-greedy-and-thompson-sampling-d97167ca9a50 6 | 7 | https://github.com/conormm/bandit_algorithms/blob/master/bandits_post_code.py 8 | 9 | 10 | # Setup 11 | 12 | Setup virtual environment and install packages: 13 | ``` 14 | python3 -m venv venv 15 | source venv/bin/activate 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | then, open the notebook 20 | -------------------------------------------------------------------------------- /ds/multi-armed-bandit/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | matplotlib 3 | numpy 4 | pandas 5 | scipy 6 | tqdm -------------------------------------------------------------------------------- /ds/pymongo/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | https://realpython.com/introduction-to-mongodb-and-python/ 4 | 5 | ## Setup 6 | 7 | Install MongoDB: 8 | 9 | ``` 10 | brew tap mongodb/brew 11 | brew install mongodb-community@5.0 12 | ``` 13 | 14 | run/stop as a service 15 | 16 | ``` 17 | brew services start mongodb-community@5.0 18 | brew services stop mongodb-community@5.0 19 | ``` 20 | 21 | connect: 22 | 23 | ``` 24 | mongosh 25 | ``` 26 | 27 | create a new db 28 | 29 | ``` 30 | use rptutorials 31 | show dbs 32 | db 33 | ``` 34 | 35 | create a collection (table) using dot notation 36 | 37 | ``` 38 | db.tutorial 39 | ``` 40 | 41 | document (table row) 42 | 43 | When you’re building a MongoDB database application, probably your most important decision is about the structure of documents. In other words, you’ll have to decide which fields and values your documents will have. 
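The shell commands below can also be reproduced from Python with PyMongo (the only package in this folder's `requirements.txt`); a minimal sketch, assuming the local server started above is still running on the default port:

```python
# Minimal PyMongo sketch mirroring the mongosh commands in this README.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.rptutorials                     # same database as `use rptutorials`
tutorial = {
    "title": "Reading and Writing CSV Files in Python",
    "author": "Jon",
    "url": "https://realpython.com/python-csv/",
}
result = db.tutorial.insert_one(tutorial)   # shell: db.tutorial.insertOne(...)
print(result.inserted_id)
for doc in db.tutorial.find({"author": "Jon"}):  # shell: db.tutorial.find(...)
    print(doc)
```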
44 | 45 | insert a document: 46 | 47 | ``` 48 | db.tutorial.insertOne( 49 | { 50 | "title": "Reading and Writing CSV Files in Python", 51 | "author": "Jon", 52 | "contributors": [ 53 | "Aldren", 54 | "Geir Arne", 55 | "Joanna", 56 | "Jason" 57 | ], 58 | "url": "https://realpython.com/python-csv/" 59 | } 60 | ) 61 | 62 | db.tutorial.insertOne( 63 | { 64 | "title": "Python 3's f-Strings: An Improved String Formatting Syntax", 65 | "author": "Joanna", 66 | "contributors": [ 67 | "Adriana", 68 | "David", 69 | "Dan", 70 | "Jim", 71 | "Pavel" 72 | ], 73 | "url": "https://realpython.com/python-f-strings/" 74 | } 75 | ) 76 | ``` 77 | 78 | find 79 | ``` 80 | db.tutorial.find() 81 | db.tutorial.find({author: "Joanna"}) 82 | ``` 83 | -------------------------------------------------------------------------------- /ds/pymongo/requirements.txt: -------------------------------------------------------------------------------- 1 | pymongo 2 | -------------------------------------------------------------------------------- /ds/seaborn/README.md: -------------------------------------------------------------------------------- 1 | ## Seaborn Basics with Python 3 2 | 3 | This is my revision of the tutorials at 4 | 5 | - The Ultimate Python Seaborn Tutorial: https://elitedatascience.com/python-seaborn-tutorial 6 | - Styling plots with Seaborn: http://jose-coto.com/styling-with-seaborn 7 | 8 | ## Setup 9 | 10 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages: 11 | 12 | ``` 13 | $ cd path_to_this folder 14 | $ virtualenv -p python3 venv 15 | $ source venv/bin/activate 16 | $ pip3 install -r requirements.txt 17 | ``` 18 | 19 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 20 | -------------------------------------------------------------------------------- /ds/seaborn/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | matplotlib 3 | pandas 4 | seaborn 5 | scikit-learn 6 | 7 | -------------------------------------------------------------------------------- /ds/spark-basics/datacamp-notes.md: -------------------------------------------------------------------------------- 1 | https://www.datacamp.com/community/tutorials/apache-spark-tutorial-machine-learning 2 | 3 | ## installation on Mac 4 | https://medium.com/beeranddiapers/installing-apache-spark-on-mac-os-ce416007d79f 5 | ``` 6 | brew upgrade && brew update 7 | 8 | brew install --cask java 9 | 10 | java -version 11 | openjdk version "11.0.1" 2018-10-16 12 | OpenJDK Runtime Environment 18.9 (build 11.0.1+13) 13 | OpenJDK 64-Bit Server VM 18.9 (build 11.0.1+13, mixed mode) 14 | 15 | xcode-select --install 16 | 17 | brew install scala 18 | 19 | scala -version 20 | Scala code runner version 2.13.5 -- Copyright 2002-2020, LAMP/EPFL and Lightbend, Inc. 
21 | 22 | brew install apache-spark 23 | 24 | spark-shell 25 | 26 | pyspark 27 | ``` 28 | 29 | 30 | ``` 31 | pip install pyspark 32 | pip install findspark 33 | ``` -------------------------------------------------------------------------------- /ds/spark-basics/datacamp-spark.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/spark-basics/datacamp-spark.ipynb -------------------------------------------------------------------------------- /ds/statsmodels-tutorial/README.md: -------------------------------------------------------------------------------- 1 | ## Python statsmodels tutorial 2 | 3 | This is my code for the tutorial at https://www.statsmodels.org/stable/gettingstarted.html 4 | 5 | ## Setup 6 | 7 | Within the tutorial folder: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 15 | 16 | -------------------------------------------------------------------------------- /ds/statsmodels-tutorial/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | linearmodels 3 | matplotlib 4 | statsmodels 5 | -------------------------------------------------------------------------------- /ds/streamlit/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This is my revised code for the tutorial at https://towardsdatascience.com/streamlit-101-an-in-depth-introduction-fc8aad9492f2 4 | 5 | Changes: 6 | 7 | - changed the data file to make the repo self-contained 8 | - added requirements.txt and virtual environment setup 9 | 10 | ## Local Setup 11 | 12 | Python 3 is required; see my tutorial on setting up Python 3: https://bit.ly/2uX6wAX 13 | 14 | Clone the repo, go to the repo folder, set up the virtual environment, and install the required packages: 15 | 16 | 17 | ```shell 18 | $ python3 -m venv venv 19 | $ source venv/bin/activate 20 | $ pip install -r requirements.txt 21 | ``` 22 | 23 | Run the app locally (Local URL: http://localhost:8501) using the terminal: `streamlit run airbnb.py` 24 | 25 | Stop the app by using ctrl + C or closing the terminal. 26 | 27 | Deploy the app to the cloud for public access via services such as Streamlit sharing, Heroku, or AWS by following my tutorial at https://github.com/harrywang/streamlit-basics.
You can see an example at: https://st-demo-harrywang.herokuapp.com/ 28 | -------------------------------------------------------------------------------- /ds/streamlit/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=3.4.2 2 | pandas>=1.2.4 3 | plotly>=5.0.0 4 | streamlit>=0.82.0 -------------------------------------------------------------------------------- /ds/superset/README.md: -------------------------------------------------------------------------------- 1 | Install a local copy 2 | 3 | https://superset.apache.org/docs/installation/installing-superset-using-docker-compose 4 | 5 | I made some changes to the instructions found in the doc above: 6 | 7 | - make sure the local PostgreSQL is stopped, otherwise Superset runs into a port conflict 8 | - ran into this problem https://github.com/apache/superset/issues/12723 with Docker Desktop at 2G memory - increased it to 7.5G 9 | 10 | I did the following: 11 | 12 | - get the code: `git clone https://github.com/apache/superset.git` 13 | - make sure to use the master branch: `git checkout master` 14 | - change redis from 3.2 to latest in `docker-compose-non-dev.yml` 15 | Screen Shot 2021-03-23 at 9 51 25 PM 16 | - use `docker-compose -f docker-compose-non-dev.yml up` to start the server 17 | - wait some time; `superset_init` exiting with 0 is expected - it does not affect the server: 18 | Screen Shot 2021-03-23 at 9 47 56 PM 19 | - log in at http://localhost:8088/ using admin/admin -------------------------------------------------------------------------------- /ds/time-series-additive-model/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This tutorial was originally published by William Koehrsen at https://towardsdatascience.com/time-series-analysis-in-python-an-introduction-70d5a5b1d52a 3 | 4 | # Setup (Mac) 5 | 6 | - Install python 3 using Homebrew if not done yet: `$ brew install python3` 7 | 8 | - Create a python3 virtualenv: `virtualenv -p python3 venv` 9 | 10 | - Activate it: `source venv/bin/activate` 11 | 12 | - Install packages: `$ pip3 install -r requirements.txt`, which includes quandl, seaborn, matplotlib, numpy, pandas, scipy, scikit-learn, and fbprophet 13 | 14 | - Change the API key for Quandl: We will access financial data using the Quandl library. Please go to https://www.quandl.com/ and register to get your api_key. You will need to use your own api_key to pull data from the quandl financial library. **You should never put your real API key in the code and push it to GitHub.** We use a local environment variable for the API key: `quandl.ApiConfig.api_key = os.environ.get('QUANDL_KEY')`.
You need to add one line `export QUANDL_KEY='your_real_api_key'` to the `~/.bash_profile` file (use `vim` to edit, `source` to execute it, then use `env` to double check): 15 | ``` 16 | $ vim ~/.bash_profile 17 | $ source ~/.bash_profile 18 | $ env 19 | ``` 20 | **NOTE: You may need to close the Terminal window and restart it for Jupyter Notebook to read the new QUANDL_KEY you just added.** 21 | 22 | # Run 23 | 24 | - Start Virtual Env: 25 | ``` 26 | $ virtualenv -p python3 venv 27 | $ source venv/bin/activate 28 | ``` 29 | - Run Jupyter: `jupyter notebook` 30 | - Run additive_models.ipynb 31 | 32 | ### TODO: Get rid of the Deprecation Warnings 33 | -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/Workbook1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/Workbook1.xlsx -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/gm_sales.csv: -------------------------------------------------------------------------------- 1 | Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total 2 | 2017,195909,237388,256224,244406,237364,243151,226107,275552,279397,252813,245387,308539,3002237 3 | 2016,203745,227825,252128,259557,240449,255209,267258,256429,249795,258626,252644,319108,3042773 4 | 2015,202786,231378,249875,269055,293097,259346,272512,270480,251310,262993,229296,290230,3082358 5 | 2014,171486,222104,256047,254076,284694,267461,256160,272422,223437,226819,225818,274483,2935007 6 | 2013,194699,224314,245950,237646,252894,264843,234071,275847,187195,226402,212060,230157,2786078 7 | 2012,167962,209306,231052,213387,245256,248750,201237,240520,210245,195764,186505,245733,2595717 8 | 2011,178896,207028,206621,232538,221192,215358,214915,218479,207145,186895,180402,234351,2503820 9 | 2010,145098,138849,185406,183091,222305,194828,199432,184921,172969,183392,168704,223932,2202927 -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/gm_sales.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/gm_sales.xlsx -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/recessions.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/recessions.csv -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/recessions.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/recessions.xlsx -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/tesla_search_terms.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:699ee134222a9b87c1434bd28d7e66ab0a163788d22c84af35386459961ac202 3 | 
size 880 4 | -------------------------------------------------------------------------------- /ds/time-series-additive-model/requirements.txt: -------------------------------------------------------------------------------- 1 | quandl 2 | seaborn 3 | matplotlib 4 | numpy 5 | pandas 6 | scipy 7 | scikit-learn 8 | pystan 9 | fbprophet 10 | jupyter 11 | -------------------------------------------------------------------------------- /ds/time-series-basics/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | Time Series Analysis with Pandas: https://www.dataquest.io/blog/tutorial-time-series-analysis-with-pandas/ 4 | 5 | ## Setup 6 | 7 | Tested with Python 3.6 via virtual environment. Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages: 8 | 9 | 10 | ```shell 11 | $ python3.6 -m venv venv 12 | $ source venv/bin/activate 13 | $ pip install -r requirements.txt 14 | ``` 15 | 16 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 17 | -------------------------------------------------------------------------------- /ds/time-series-basics/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter==1.0.0 2 | pandas==1.0.0 3 | matplotlib==3.1.2 4 | seaborn==0.10.0 5 | -------------------------------------------------------------------------------- /ml/attention/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1v_S1r-iPuuVAkqVo8hAL6-OsQ-hZhGx8) 4 | 5 | 6 | I combined and revised the following tutorials: 7 | - https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/ 8 | - https://towardsdatascience.com/light-on-math-ml-attention-with-keras-dc8dbc1fad39 9 | 10 | ## Setup 11 | 12 | within the tutorial folder: 13 | 14 | ``` 15 | python3 -m venv venv 16 | source venv/bin/activate 17 | pip install -r requirements.txt 18 | ``` 19 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 20 | 21 | -------------------------------------------------------------------------------- /ml/attention/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | matplotlib==3.1.0 # latest version breaks the seaborn heatmap 3 | seaborn 4 | -------------------------------------------------------------------------------- /ml/autogluon/README.md: -------------------------------------------------------------------------------- 1 | ## Kaggle Kernel 2 | 3 | You can run this kernel directly at Kaggle.com: https://www.kaggle.com/harrywang/housing-price-prediction 4 | 5 | ## Run Locally 6 | 7 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages: 8 | 9 | ``` 10 | $ cd path_to_this folder 11 | $ virtualenv -p python3 venv 12 | $ source venv/bin/activate 13 | $ pip3 install -r requirements.txt 14 | ``` 15 | 16 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 17 | 18 | ## Source 19 | 20 | This is the dataset used in this book: https://github.com/ageron/handson-ml/tree/master/datasets/housing to illustrate a sample end-to-end ML project workflow (pipeline). This is a great book - I highly recommend! 21 | 22 | The data is based on California Census in 1990. 
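For reference, the AutoGluon side of this folder compresses most of the workflow steps below into a few calls; a hedged sketch against the bundled `input/housing.csv`, assuming a recent `autogluon.tabular` API (the target column in this dataset is `median_house_value`):

```python
# Hedged sketch: AutoGluon regression on the bundled housing data.
# Assumes a recent autogluon.tabular API; adjust for the installed version.
from autogluon.tabular import TabularDataset, TabularPredictor

data = TabularDataset('input/housing.csv')
train = data.sample(frac=0.8, random_state=0)  # simple random split
test = data.drop(train.index)

predictor = TabularPredictor(label='median_house_value',
                             path='agModels-housing').fit(train)
print(predictor.leaderboard(test))  # compares every model AutoGluon trained
```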
23 | 24 | ### About the Data (from the book): 25 | 26 | "This dataset is a modified version of the California Housing dataset available from Luís Torgo's page (University of Porto). Luís Torgo obtained it from the StatLib repository (which is closed now). The dataset may also be downloaded from StatLib mirrors. 27 | 28 | The following is the description from the book author: 29 | 30 | This dataset appeared in a 1997 paper titled Sparse Spatial Autoregressions by Pace, R. Kelley and Ronald Barry, published in the Statistics and Probability Letters journal. They built it using the 1990 California census data. It contains one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people). 31 | 32 | The dataset in this directory is almost identical to the original, with two differences: 33 | 207 values were randomly removed from the total_bedrooms column, so we can discuss what to do with missing data. 34 | An additional categorical attribute called ocean_proximity was added, indicating (very roughly) whether each block group is near the ocean, near the Bay area, inland or on an island. This allows discussing what to do with categorical data. 35 | Note that the block groups are called "districts" in the Jupyter notebooks, simply because in some contexts the name "block group" was confusing." 36 | 37 | ### About the Data (From Luís Torgo page): 38 | http://www.dcc.fc.up.pt/%7Eltorgo/Regression/cal_housing.html 39 | 40 | This is a dataset obtained from the StatLib repository. Here is the included description: 41 | 42 | "We collected information on the variables using all the block groups in California from the 1990 Census. In this sample a block group on average includes 1425.5 individuals living in a geographically compact area. Naturally, the geographical area included varies inversely with the population density. We computed distances among the centroids of each block group as measured in latitude and longitude. We excluded all the block groups reporting zero entries for the independent and dependent variables. The final data contained 20,640 observations on 9 variables. The dependent variable is ln(median house value)." 43 | 44 | 45 | ### End-to-End ML Project Steps (Chapter 2 of the book) 46 | 47 | 1. Look at the big picture 48 | 2. Get the data 49 | 3. Discover and visualize the data to gain insights 50 | 4. Prepare the data for Machine Learning algorithms 51 | 5. Select a model and train it 52 | 6. Fine-tune your model 53 | 7. Present your solution 54 | 8. Launch, monitor, and maintain your system 55 | 56 | ## The 10-Step Machine Learning Project Workflow (My Version) 57 | 58 | 1. Define the business objective 59 | 2. Make sense of the data from a high level 60 | - data types (number, text, object, etc.) 61 | - continuous/discrete 62 | - basic stats (min, max, std, median, etc.) using boxplot 63 | - frequency via histogram 64 | - scales and distributions of different features 65 | 3. Create the training and test sets using proper sampling methods, e.g., random vs. stratified 66 | 4. Correlation analysis (pair-wise and attribute combinations) 67 | 5. Data cleaning (missing data, outliers, data errors) 68 | 6. Data transformation via pipelines (categorical text to number using one hot encoding, feature scaling via normalization/standardization, feature combinations) 69 | 7.
Train and cross validate different models and select the most promising one (Linear Regression, Decision Tree, and Random Forest were tried in this tutorial) 70 | 8. Fine tune the model using trying different combinations of hyperparameters 71 | 9. Evaluate the model with best estimators in the test set 72 | 10. Launch, monitor, and refresh the model and system 73 | -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/learner.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/learner.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/CatBoost/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/CatBoost/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/ExtraTreesEntr/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/ExtraTreesEntr/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/ExtraTreesGini/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/ExtraTreesGini/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/KNeighborsDist/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/KNeighborsDist/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/KNeighborsUnif/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/KNeighborsUnif/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/LightGBM/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBM/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/LightGBMLarge/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBMLarge/model.pkl 
-------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/LightGBMXT/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBMXT/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model-internals.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model-internals.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/RandomForestEntr/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/RandomForestEntr/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/RandomForestGini/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/RandomForestGini/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/model_template.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/model_template.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/oof.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/oof.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/XGBoost/model.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/XGBoost/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/trainer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/trainer.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/predictor.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/predictor.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/utils/data/X.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/X.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/utils/data/X_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/X_val.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/utils/data/y.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/y.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/utils/data/y_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/y_val.pkl -------------------------------------------------------------------------------- /ml/autogluon/input/anscombe.csv: -------------------------------------------------------------------------------- 1 | dataset,x,y 2 | I,10.0,8.04 3 | I,8.0,6.95 4 | I,13.0,7.58 5 | I,9.0,8.81 6 | I,11.0,8.33 7 | I,14.0,9.96 8 | I,6.0,7.24 9 | I,4.0,4.26 10 | I,12.0,10.84 11 | I,7.0,4.82 12 | I,5.0,5.68 13 | II,10.0,9.14 14 | II,8.0,8.14 15 | II,13.0,8.74 16 | II,9.0,8.77 17 | II,11.0,9.26 18 | II,14.0,8.1 19 | II,6.0,6.13 20 | II,4.0,3.1 21 | II,12.0,9.13 22 | II,7.0,7.26 23 | II,5.0,4.74 24 | III,10.0,7.46 25 | III,8.0,6.77 26 | III,13.0,12.74 27 | III,9.0,7.11 28 | III,11.0,7.81 29 | III,14.0,8.84 30 | III,6.0,6.08 31 | III,4.0,5.39 32 | III,12.0,8.15 33 | III,7.0,6.42 34 | III,5.0,5.73 35 | IV,8.0,6.58 36 | IV,8.0,5.76 37 | IV,8.0,7.71 38 | IV,8.0,8.84 39 | IV,8.0,8.47 40 | IV,8.0,7.04 41 | IV,8.0,5.25 42 | IV,19.0,12.5 43 | IV,8.0,5.56 44 | IV,8.0,7.91 45 | IV,8.0,6.89 46 | -------------------------------------------------------------------------------- /ml/autogluon/requirements.txt: 
-------------------------------------------------------------------------------- 1 | jupyter 2 | autogluon 3 | -------------------------------------------------------------------------------- /ml/clearml-server/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | Set up the ClearML server locally on Mac: https://allegro.ai/docs/deploying_trains/trains_server_linux_mac/ 4 | 5 | For how to use ClearML, check out https://github.com/harrywang/tutorial-buffet/tree/master/clearml 6 | 7 | Note: I set Docker's memory limit to 6G 8 | 9 | - Make sure Docker is running correctly: `docker run hello-world` 10 | - Create the mounting folder: `sudo mkdir /opt/trains`, then open the Docker app. On the File Sharing tab, add `/opt/trains`. 11 | 12 | **NOTE: you have to restart the Docker app after this step!!** 13 | Screen Shot 2020-12-24 at 10 16 29 AM 14 | 15 | - By default, Elasticsearch is mounted at `/opt/trains/data/elastic_7`; you need to create the folder and then give it write permission as follows: 16 | 17 | ``` 18 | $ sudo mkdir -p /opt/trains/data/elastic_7 19 | $ chmod 777 /opt/trains/data/elastic_7 20 | ``` 21 | 22 | - Grant the Docker containers access to the folder (macOS shown): 23 | ``` 24 | sudo chown -R $(whoami):staff /opt/trains 25 | ``` 26 | 27 | - Download `docker-compose.yml` to the `/opt/trains` folder: 28 | ``` 29 | sudo curl https://raw.githubusercontent.com/allegroai/trains-server/master/docker-compose.yml -o /opt/trains/docker-compose.yml 30 | ``` 31 | - Start the server: `docker-compose -f /opt/trains/docker-compose.yml up -d` 32 | 33 | Then go to http://localhost:8080/ to log in 34 | 35 | Screen Shot 2020-12-25 at 11 26 45 AM 36 | 37 | - Restart: 38 | 39 | ``` 40 | docker-compose -f /opt/trains/docker-compose.yml down 41 | docker-compose -f /opt/trains/docker-compose.yml up -d 42 | ``` 43 | 44 | -------------------------------------------------------------------------------- /ml/clearml/matplotlib/matplotlib_example.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of Matplotlib and Seaborn integration and reporting 2 | # 3 | import matplotlib 4 | matplotlib.use('agg') # use agg instead of tkinter 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | from clearml import Task 9 | 10 | 11 | task = Task.init(project_name='examples', task_name='Matplotlib example by Harry') 12 | 13 | # Create a plot 14 | N = 50 15 | x = np.random.rand(N) 16 | y = np.random.rand(N) 17 | colors = np.random.rand(N) 18 | area = (30 * np.random.rand(N))**2 # 0 to 15 point radii 19 | plt.scatter(x, y, s=area, c=colors, alpha=0.5) 20 | # Plot will be reported automatically 21 | plt.show() 22 | 23 | # Alternatively, in order to report the plot with a more meaningful title/series and iteration number 24 | area = (40 * np.random.rand(N))**2 25 | plt.scatter(x, y, s=area, c=colors, alpha=0.5) 26 | task.logger.report_matplotlib_figure(title="My Plot Title", series="My Plot Series", iteration=10, figure=plt) 27 | plt.show() 28 | 29 | # Create another plot - with a name 30 | x = np.linspace(0, 10, 30) 31 | y = np.sin(x) 32 | plt.plot(x, y, 'o', color='black') 33 | # Plot will be reported automatically 34 | plt.show() 35 | 36 | # Create image plot 37 | m = np.eye(256, 256, dtype=np.uint8) 38 | plt.imshow(m) 39 | # Plot will be reported automatically 40 | plt.show() 41 | 42 | # Create image plot - with a name 43 | m = np.eye(256, 256, dtype=np.uint8) 44 | plt.imshow(m) 45 |
plt.title('Image Title') 46 | # Plot will be reported automatically 47 | plt.show() 48 | 49 | sns.set(style="darkgrid") 50 | # Load an example dataset with long-form data 51 | fmri = sns.load_dataset("fmri") 52 | # Plot the responses for different events and regions 53 | sns.lineplot(x="timepoint", y="signal", 54 | hue="region", style="event", 55 | data=fmri) 56 | # Plot will be reported automatically 57 | plt.show() 58 | 59 | print('This is a Matplotlib & Seaborn example') 60 | -------------------------------------------------------------------------------- /ml/clearml/matplotlib/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib >= 3.1.1 ; python_version >= '3.6' 2 | matplotlib >= 2.2.4 ; python_version < '3.6' 3 | seaborn 4 | clearml -------------------------------------------------------------------------------- /ml/clearml/matplotlib/mlp_grouped_errorbar.py: -------------------------------------------------------------------------------- 1 | # Grouped bar chart with percentage change bars and labels 2 | import matplotlib 3 | matplotlib.use('agg') # use agg instead of tkinter 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | plt.style.use('seaborn') 7 | 8 | from clearml import Task 9 | task = Task.init(project_name='examples', task_name='Matplotlib GroupedBar example by Harry') 10 | 11 | men_means = np.array([20, 35, 30, 35, 27]) 12 | women_means = np.array([25, 32, 34, 20, 25]) 13 | 14 | ind = np.arange(len(men_means)) # the x locations for the groups 15 | width = 0.35 # the width of the bars 16 | 17 | fig, ax = plt.subplots() 18 | 19 | rects1 = ax.bar(ind - width/2, men_means, width, 20 | label='Men') 21 | rects2 = ax.bar(ind + width/2, women_means, width, 22 | label='Women') 23 | 24 | # Add some text for labels, title and custom x-axis tick labels, etc. 25 | ax.set_ylabel('Scores') 26 | ax.set_title('Scores by group and gender') 27 | ax.set_xticks(ind) 28 | ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5')) 29 | ax.legend() 30 | 31 | 32 | def autolabel(rects, xpos='center'): 33 | """ 34 | Attach a text label above each bar in *rects*, displaying its height. 35 | 36 | *xpos* indicates which side to place the text w.r.t. the center of 37 | the bar. It can be one of the following {'center', 'right', 'left'}.
38 | 39 | ha: horizontal alignment 40 | """ 41 | 42 | ha = {'center': 'center', 'right': 'left', 'left': 'right'} 43 | offset = {'center': 0, 'right': 1, 'left': -1} 44 | 45 | for rect in rects: 46 | height = rect.get_height() 47 | ax.annotate('{}'.format(height), 48 | xy=(rect.get_x() + rect.get_width() / 2, height), 49 | xytext=(offset[xpos]*3, 3), # use 3 points offset 50 | textcoords="offset points", # in both directions 51 | ha=ha[xpos], va='bottom') 52 | 53 | 54 | autolabel(rects1) 55 | autolabel(rects2) 56 | 57 | # custom error bar 58 | diff = (men_means - women_means)/2 59 | change_percentage = np.abs((men_means - women_means)/men_means) 60 | errorbar_y = men_means - diff # the y of the error bar 61 | errorbar_x_offset = 0.1 62 | 63 | 64 | # show the small caps on error bar ends: 65 | # capsize=3 (cap length) AND markeredgewidth=1 (cap line width - default is 0) 66 | # elinewidth=1 is the error bar line width 67 | ax.errorbar(ind + width + errorbar_x_offset, errorbar_y, 68 | yerr=diff, fmt='none', elinewidth=1, 69 | capsize=3, markeredgewidth=1) 70 | 71 | # show the change percentage labels 72 | 73 | errorbar_text_offset = 0.625 # the offset from the men's bar x location 74 | 75 | for i in range(len(rects1)): 76 | # find the higher bar to determine label height 77 | height1 = rects1[i].get_height() 78 | height2 = rects2[i].get_height() 79 | height = height1 if height1 > height2 else height2 80 | 81 | # add the percentage change labels 82 | ax.annotate(f'{change_percentage[i]:.1%}', # the text 83 | xy=(ind[i] - width/2 + errorbar_text_offset, height), # x y for the text 84 | xytext=(0, 3), # 0 points horizontal and 3 points vertical offsets 85 | textcoords="offset points", # in both directions 86 | ha='center', # horizontal alignment 87 | va='bottom') # vertical alignment 88 | 89 | fig.tight_layout() 90 | 91 | plt.show() -------------------------------------------------------------------------------- /ml/clearml/matplotlib/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib >= 3.1.1 ; python_version >= '3.6' 2 | matplotlib >= 2.2.4 ; python_version < '3.6' 3 | seaborn 4 | clearml -------------------------------------------------------------------------------- /ml/clearml/pytorch/manual_model_upload.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of manual model configuration and uploading 2 | # 3 | import os 4 | from tempfile import gettempdir 5 | 6 | import torch 7 | from trains import Task 8 | 9 | 10 | task = Task.init(project_name='examples', task_name='Model configuration and upload') 11 | 12 | # create a (stub) model instance 13 | model = torch.nn.Module() 14 | 15 | # Connect a local configuration file 16 | config_file = os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json') 17 | config_file = task.connect_configuration(config_file) 18 | # then read configuration as usual; the backend will contain a copy of it.
19 | # later when executing remotely, the returned `config_file` will be a temporary file 20 | # containing a new copy of the configuration retrieved from the backend 21 | # model_config_dict = json.load(open(config_file, 'rt')) 22 | 23 | # Or store a dictionary defining a specific network design 24 | model_config_dict = { 25 | 'value': 13.37, 26 | 'dict': {'sub_value': 'string', 'sub_integer': 11}, 27 | 'list_of_ints': [1, 2, 3, 4], 28 | } 29 | model_config_dict = task.connect_configuration(model_config_dict) 30 | 31 | # We now update the dictionary after connecting it, and the changes will be tracked as well. 32 | model_config_dict['new value'] = 10 33 | model_config_dict['value'] *= model_config_dict['new value'] 34 | 35 | # store the label enumeration of the training model 36 | labels = {'background': 0, 'cat': 1, 'dog': 2} 37 | task.connect_label_enumeration(labels) 38 | 39 | # store the model; it will have the task's network configuration and label enumeration 40 | print('Any model stored from this point onwards will contain both model_config and label_enumeration') 41 | 42 | torch.save(model, os.path.join(gettempdir(), "model.pt")) 43 | print('Model saved') 44 | -------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/audio/README.md: -------------------------------------------------------------------------------- 1 | The `audio_classifier_UrbanSound8K.ipynb` example uses a small dataset based on the [UrbanSound8K dataset](https://urbansounddataset.weebly.com/urbansound8k.html). -------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/audio/audio_preprocessing_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "! pip install -U pip\n", 12 | "! pip install -U torch==1.5.1\n", 13 | "! pip install -U torchaudio==0.5.1\n", 14 | "! pip install -U matplotlib==3.2.1\n", 15 | "! pip install -U trains>=0.16.1\n", 16 | "! 
pip install -U tensorboard==2.2.1" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import torch\n", 27 | "import torchaudio\n", 28 | "from torch.utils.tensorboard import SummaryWriter\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "\n", 31 | "from trains import Task\n", 32 | "\n", 33 | "%matplotlib inline" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "task = Task.init(project_name='Audio Example', task_name='data pre-processing')\n", 43 | "configuration_dict = {'number_of_samples': 3}\n", 44 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n", 45 | "print(configuration_dict) # printing actual configuration (after override in remote mode)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "tensorboard_writer = SummaryWriter('./tensorboard_logs')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "scrolled": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "if not os.path.isdir('./data'):\n", 66 | " os.mkdir('./data')\n", 67 | "yesno_data = torchaudio.datasets.YESNO('./data', download=True)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def plot_signal(signal, title, cmap=None):\n", 77 | " plt.figure()\n", 78 | " if signal.ndim == 1:\n", 79 | " plt.plot(signal)\n", 80 | " else:\n", 81 | " plt.imshow(signal, cmap=cmap) \n", 82 | " plt.title(title)\n", 83 | " plt.show()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "pycharm": { 91 | "name": "#%%\n" 92 | }, 93 | "scrolled": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "fixed_sample_rate = 22050\n", 98 | "for n in range(configuration_dict.get('number_of_samples', 3)):\n", 99 | " audio, sample_rate, labels = yesno_data[n]\n", 100 | " tensorboard_writer.add_audio('Audio samples/{}'.format(n), audio, n, sample_rate)\n", 101 | " \n", 102 | " resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=fixed_sample_rate)\n", 103 | " melspectogram_transform = torchaudio.transforms.MelSpectrogram(sample_rate=fixed_sample_rate, n_mels=128)\n", 104 | " \n", 105 | " audio_mono = torch.mean(resample_transform(audio), dim=0, keepdim=True)\n", 106 | " plot_signal(audio_mono[0,:], 'Original waveform')\n", 107 | " \n", 108 | " melspectogram = melspectogram_transform(audio_mono)\n", 109 | " plot_signal(melspectogram.squeeze().numpy(), 'Mel spectogram', 'hot')\n", 110 | " plot_signal(torchaudio.transforms.AmplitudeToDB()(melspectogram).squeeze().numpy(), 'Mel spectogram DB', 'hot')" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.4" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | 
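Stripped of the ClearML/TensorBoard logging, the transform chain in the notebook above reduces to a few torchaudio calls; a sketch against the same 0.5-era API pinned in the first cell (`some_clip.wav` is a hypothetical input file):

```python
# Stripped-down sketch of the notebook's resample -> mel spectrogram chain.
import torch
import torchaudio

waveform, sample_rate = torchaudio.load('some_clip.wav')  # hypothetical file
resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=22050)
to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=22050, n_mels=128)

mono = torch.mean(resample(waveform), dim=0, keepdim=True)  # collapse channels
mel_db = torchaudio.transforms.AmplitudeToDB()(to_mel(mono))
print(mel_db.shape)  # (1, n_mels, time_frames)
```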
-------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/table/download_and_split.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "! pip install -U pip\n", 10 | "! pip install -U trains==0.16.2rc0\n", 11 | "! pip install -U pandas==1.0.4\n", 12 | "! pip install -U scikit-learn==0.23.1\n", 13 | "! pip install -U pathlib2==2.3.5" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "from pathlib2 import Path\n", 24 | "from sklearn.model_selection import train_test_split\n", 25 | "\n", 26 | "from trains import Task" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "task = Task.init(project_name='Tabular Example', task_name='Download and split tabular dataset')\n", 36 | "logger = task.get_logger()\n", 37 | "configuration_dict = {'test_size': 0.1, 'split_random_state': 0}\n", 38 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n", 39 | "print(configuration_dict) # printing actual configuration (after override in remote mode)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# **Downloading**" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes)\n", 56 | "# and save it to your cloud storage or your mounted local storage\n", 57 | "# If the data is on your cloud storage, you can use trains' storage manager to get a local copy of it:\n", 58 | "# from trains.storage import StorageManager\n", 59 | "# path_to_ShelterAnimal = StorageManager.get_local_copy(\"https://allegro-datasets.s3.amazonaws.com/trains/UrbanSound8K.zip\", \n", 60 | "# extract_archive=True)\n", 61 | "path_to_ShelterAnimal = '/home/sam/Datasets/shelter-animal-outcomes'" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "train_set = pd.read_csv(Path(path_to_ShelterAnimal) / 'train.csv')\n", 71 | "logger.report_table(title='Trainset - raw',series='pandas DataFrame',iteration=0, table_plot=train_set.head())" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "# **Splitting to train and val**" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "X = train_set.drop(columns= ['OutcomeType'])\n", 88 | "Y = train_set['OutcomeType']\n", 89 | "X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=configuration_dict.get('test_size', 0.1), \n", 90 | " random_state=configuration_dict.get('split_random_state', 0))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "train_df = X_train.join(Y_train)\n", 100 | "val_df = X_val.join(Y_val)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | 
"task.upload_artifact('train_data', artifact_object=train_df)\n", 110 | "task.upload_artifact('val_data', artifact_object=val_df)" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.4" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/table/pick_best_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "! pip install -U pip\n", 10 | "! pip install -U trains==0.16.2rc0" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from trains import Task, OutputModel" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "task = Task.init(project_name='Tabular Example', task_name='pick best model')\n", 29 | "configuration_dict = {'train_tasks_ids': ['c9bff3d15309487a9e5aaa00358ff091', 'c9bff3d15309487a9e5aaa00358ff091']}\n", 30 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n", 31 | "print(configuration_dict) # printing actual configuration (after override in remote mode)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "results = {}\n", 41 | "for task_id in configuration_dict.get('train_tasks_ids'):\n", 42 | " train_task = Task.get_task(task_id)\n", 43 | " results[task_id] = train_task.get_last_scalar_metrics()['accuracy']['total']['last']" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "print(results)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "best_model_task_id = max(results.items(), key=lambda x: x[1])[0]\n", 62 | "best_model_id = Task.get_task(best_model_task_id).output_model_id" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "OutputModel(base_model_id=best_model_id)" 72 | ] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.7.4" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 4 96 | } 97 | -------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/table/tabular_ml_pipeline.ipynb: -------------------------------------------------------------------------------- 
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# pip install with locked versions\n", 10 | "! pip install -U pip\n", 11 | "! pip install -U trains==0.16.2rc0" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from trains import Task\n", 21 | "from trains.automation.controller import PipelineController" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "task = Task.init(project_name='Tabular Example', task_name='tabular training pipeline', task_type=Task.TaskTypes.controller)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "pipe = PipelineController(default_execution_queue='dan_queue', add_pipeline_tags=True)\n", 40 | "pipe.add_step(name='preprocessing_1', base_task_project='Tabular Example', base_task_name='tabular preprocessing',\n", 41 | " parameter_override={'General/data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n", 42 | " 'General/fill_categorical_NA': 'True',\n", 43 | " 'General/fill_numerical_NA': 'True'})\n", 44 | "pipe.add_step(name='preprocessing_2', base_task_project='Tabular Example', base_task_name='tabular preprocessing',\n", 45 | " parameter_override={'General/data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n", 46 | " 'General/fill_categorical_NA': 'False',\n", 47 | " 'General/fill_numerical_NA': 'True'})\n", 48 | " \n", 49 | "pipe.add_step(name='train_1', parents=['preprocessing_1'],\n", 50 | " base_task_project='Tabular Example', base_task_name='tabular prediction',\n", 51 | " parameter_override={'General/data_task_id': '${preprocessing_1.id}'})\n", 52 | "pipe.add_step(name='train_2', parents=['preprocessing_2'],\n", 53 | " base_task_project='Tabular Example', base_task_name='tabular prediction',\n", 54 | " parameter_override={'General/data_task_id': '${preprocessing_2.id}'})\n", 55 | " \n", 56 | "pipe.add_step(name='pick_best', parents=['train_1', 'train_2'],\n", 57 | " base_task_project='Tabular Example', base_task_name='pick best model',\n", 58 | " parameter_override={'General/train_tasks_ids': '[${train_1.id}, ${train_2.id}]'}) " 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Starting the pipeline (in the background)\n", 68 | "pipe.start()\n", 69 | "# Wait until pipeline terminates\n", 70 | "pipe.wait()\n", 71 | "# cleanup everything\n", 72 | "pipe.stop()" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.7.4" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 4 97 | } 98 | -------------------------------------------------------------------------------- /ml/clearml/pytorch/pytorch_tensorboardx.py: -------------------------------------------------------------------------------- 1 | ../tensorboardx/pytorch_tensorboardX.py -------------------------------------------------------------------------------- 
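Stepping back to the tabular pipeline notebook above: each `pipe.add_step` clones an existing base task and rewrites its `General/...` parameters, so a step only receives the task ID of its parent. A minimal sketch of the consuming side, assuming the referenced 'tabular preprocessing' task follows the same `task.connect` pattern as the other notebooks in this folder (the parameter name is taken from the `parameter_override` calls above; the artifact name comes from `download_and_split.ipynb`):

```
from trains import Task

task = Task.init(project_name='Tabular Example', task_name='tabular preprocessing')
configuration_dict = {'data_task_id': ''}  # filled in by the controller via General/data_task_id
configuration_dict = task.connect(configuration_dict)

# fetch the artifact uploaded by the parent task
data_task = Task.get_task(task_id=configuration_dict['data_task_id'])
train_df = data_task.artifacts['train_data'].get()
```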
/ml/clearml/pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | tensorboardX 3 | tensorboard>=1.14.0 4 | torch>=1.1.0 5 | torchvision>=0.3.0 6 | clearml -------------------------------------------------------------------------------- /ml/clearml/pytorch/tensorboard_toy_pytorch.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tempfile import gettempdir 3 | 4 | import numpy as np 5 | from PIL import Image 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from trains import Task 9 | task = Task.init(project_name='examples', task_name='pytorch tensorboard toy example') 10 | 11 | 12 | writer = SummaryWriter(log_dir=os.path.join(gettempdir(), 'tensorboard_logs')) 13 | 14 | # convert to 4d [batch, col, row, RGB-channels] 15 | image_open = Image.open(os.path.join("..", "..", "reporting", "data_samples", "picasso.jpg")) 16 | image = np.asarray(image_open) 17 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis] 18 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2) 19 | image_rgba = image_rgba[np.newaxis, :, :, :] 20 | image = image[np.newaxis, :, :, :] 21 | 22 | writer.add_image("test/first", image[0], dataformats='HWC') 23 | writer.add_image("test_gray/second", image_gray[0], dataformats='HWC') 24 | writer.add_image("test_rgba/third", image_rgba[0], dataformats='HWC') 25 | # writer.add_image("image/first_series", image, max_outputs=10) 26 | # writer.add_image("image_gray/second_series", image_gray, max_outputs=10) 27 | # writer.add_image("image_rgba/third_series", image_rgba, max_outputs=10) 28 | 29 | print('Done!') 30 | -------------------------------------------------------------------------------- /ml/clearml/requirements.txt: -------------------------------------------------------------------------------- 1 | PyJWT==1.7.1 2 | -------------------------------------------------------------------------------- /ml/clearml/scikit-learn/model-harry.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/clearml/scikit-learn/model-harry.pkl -------------------------------------------------------------------------------- /ml/clearml/scikit-learn/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/clearml/scikit-learn/model.pkl -------------------------------------------------------------------------------- /ml/clearml/scikit-learn/requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.13.2 2 | matplotlib >= 3.1.1 ; python_version >= '3.6' 3 | matplotlib >= 2.2.4 ; python_version < '3.6' 4 | scikit-learn 5 | clearml -------------------------------------------------------------------------------- /ml/clearml/scikit-learn/sklearn_joblib_example.py: -------------------------------------------------------------------------------- 1 | try: 2 | import joblib 3 | except ImportError: 4 | from sklearn.externals import joblib 5 | 6 | from sklearn import datasets 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.model_selection import train_test_split 9 | import numpy as np 10 | import matplotlib 11 | matplotlib.use('agg') # use agg instead of tkinter 
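# (note) the non-interactive 'agg' backend renders figures without a display, so this
# script can run on headless machines or remote agents; the backend must be selected
# before pyplot is imported below.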
12 | import matplotlib.pyplot as plt 13 | 14 | 15 | 16 | from clearml import Task 17 | 18 | task = Task.init(project_name="examples", task_name="scikit-learn joblib example") 19 | 20 | iris = datasets.load_iris() 21 | X = iris.data 22 | y = iris.target 23 | 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 25 | 26 | model = LogisticRegression(solver='liblinear', multi_class='auto') # sklearn LogisticRegression class 27 | model.fit(X_train, y_train) 28 | 29 | joblib.dump(model, 'model-harry.pkl', compress=True) 30 | 31 | loaded_model = joblib.load('model.pkl') 32 | result = loaded_model.score(X_test, y_test) 33 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 34 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 35 | h = .02 # step size in the mesh 36 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 37 | plt.figure(1, figsize=(4, 3)) 38 | 39 | plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired) 40 | plt.xlabel('Sepal length') 41 | plt.ylabel('Sepal width') 42 | 43 | plt.xlim(xx.min(), xx.max()) 44 | plt.ylim(yy.min(), yy.max()) 45 | plt.xticks(()) 46 | plt.yticks(()) 47 | 48 | plt.show() 49 | -------------------------------------------------------------------------------- /ml/clearml/tensorflow/legacy/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard>=1.14.0 2 | tensorflow>=1.14.0 3 | -------------------------------------------------------------------------------- /ml/clearml/tensorflow/legacy/tensorboard_toy.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of tensorboard with tensorflow (without any actual training) 2 | # 3 | import os 4 | from tempfile import gettempdir 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | from PIL import Image 9 | 10 | from trains import Task 11 | task = Task.init(project_name='examples', task_name='tensorboard toy example') 12 | 13 | 14 | k = tf.placeholder(tf.float32) 15 | 16 | # Make a normal distribution, with a shifting mean 17 | mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1) 18 | # Record that distribution into a histogram summary 19 | tf.summary.histogram("normal/moving_mean", mean_moving_normal) 20 | tf.summary.scalar("normal/value", mean_moving_normal[-1]) 21 | 22 | # Make a normal distribution with shrinking variance 23 | variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k)) 24 | # Record that distribution too 25 | tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal) 26 | tf.summary.scalar("normal/variance_shrinking_normal", variance_shrinking_normal[-1]) 27 | 28 | # Let's combine both of those distributions into one dataset 29 | normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0) 30 | # We add another histogram summary to record the combined distribution 31 | tf.summary.histogram("normal/bimodal", normal_combined) 32 | tf.summary.scalar("normal/normal_combined", normal_combined[0]) 33 | 34 | # Add a gamma distribution 35 | gamma = tf.random_gamma(shape=[1000], alpha=k) 36 | tf.summary.histogram("gamma", gamma) 37 | 38 | # And a poisson distribution 39 | poisson = tf.random_poisson(shape=[1000], lam=k) 40 | tf.summary.histogram("poisson", poisson) 41 | 42 | # And a uniform distribution 43 | uniform = tf.random_uniform(shape=[1000], maxval=k*10) 44 | tf.summary.histogram("uniform", uniform) 45 | 46 | # Finally, 
combine everything together! 47 | all_distributions = [mean_moving_normal, variance_shrinking_normal, gamma, poisson, uniform] 48 | all_combined = tf.concat(all_distributions, 0) 49 | tf.summary.histogram("all_combined", all_combined) 50 | 51 | # Log text value 52 | tf.summary.text("this is a test", tf.make_tensor_proto("This is the content", dtype=tf.string)) 53 | 54 | # convert to 4d [batch, col, row, RGB-channels] 55 | image_open = Image.open(os.path.join("..", "..", "..", "reporting", "data_samples", "picasso.jpg")) 56 | image = np.asarray(image_open) 57 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis] 58 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2) 59 | image_rgba = image_rgba[np.newaxis, :, :, :] 60 | image = image[np.newaxis, :, :, :] 61 | 62 | tf.summary.image("test", image, max_outputs=10) 63 | tf.summary.image("test_gray", image_gray, max_outputs=10) 64 | tf.summary.image("test_rgba", image_rgba, max_outputs=10) 65 | 66 | # Set up a session and summary writer 67 | summaries = tf.summary.merge_all() 68 | sess = tf.Session() 69 | 70 | logger = task.get_logger() 71 | 72 | # Use the original FileWriter for comparison, run: 73 | # % tensorboard --logdir=/tmp/histogram_example 74 | writer = tf.summary.FileWriter(os.path.join(gettempdir(), "histogram_example")) 75 | 76 | # Set up a loop and write the summaries to disk 77 | N = 40 78 | for step in range(N): 79 | k_val = step/float(N) 80 | summ = sess.run(summaries, feed_dict={k: k_val}) 81 | writer.add_summary(summ, global_step=step) 82 | 83 | print('Done!') 84 | -------------------------------------------------------------------------------- /ml/clearml/tensorflow/manual_model_upload.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of manual model configuration and uploading 2 | # 3 | import os 4 | import tempfile 5 | 6 | import tensorflow as tf 7 | from trains import Task 8 | 9 | task = Task.init(project_name='examples', task_name='Model configuration and upload') 10 | 11 | model = tf.Module() 12 | 13 | # Connect a local configuration file 14 | config_file = os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json') 15 | config_file = task.connect_configuration(config_file) 16 | # then read the configuration as usual; the backend will keep a copy of it. 17 | # later when executing remotely, the returned `config_file` will be a temporary file 18 | # containing a new copy of the configuration retrieved from the backend 19 | # # model_config_dict = json.load(open(config_file, 'rt')) 20 | 21 | # Or store a dictionary that defines a specific network design 22 | model_config_dict = { 23 | 'value': 13.37, 24 | 'dict': {'sub_value': 'string', 'sub_integer': 11}, 25 | 'list_of_ints': [1, 2, 3, 4], 26 | } 27 | model_config_dict = task.connect_configuration(model_config_dict) 28 | 29 | # We now update the dictionary after connecting it, and the changes will be tracked as well. 
30 | model_config_dict['new value'] = 10 31 | model_config_dict['value'] *= model_config_dict['new value'] 32 | 33 | # store the label enumeration of the training model 34 | labels = {'background': 0, 'cat': 1, 'dog': 2} 35 | task.connect_label_enumeration(labels) 36 | 37 | # any model stored from this point on will include the task's network configuration and label enumeration 38 | print('Any model stored from this point onwards will contain both model_config and label_enumeration') 39 | 40 | tempdir = tempfile.mkdtemp() 41 | tf.saved_model.save(model, os.path.join(tempdir, "model")) 42 | print('Model saved') 43 | -------------------------------------------------------------------------------- /ml/clearml/tensorflow/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard>=2.0 2 | tensorflow>=2.0 3 | trains -------------------------------------------------------------------------------- /ml/clearml/tensorflow/tensorboard_toy.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of tensorboard with tensorflow (without any actual training) 2 | # 3 | import os 4 | import tensorflow as tf 5 | import numpy as np 6 | from tempfile import gettempdir 7 | from PIL import Image 8 | 9 | from trains import Task 10 | 11 | 12 | def generate_summary(k, step): 13 | # Make a normal distribution, with a shifting mean 14 | mean_moving_normal = tf.random.normal(shape=[1000], mean=(5 * k), stddev=1) 15 | # Record that distribution into a histogram summary 16 | tf.summary.histogram("normal/moving_mean", mean_moving_normal, step=step) 17 | tf.summary.scalar("normal/value", mean_moving_normal[-1], step=step) 18 | 19 | # Make a normal distribution with shrinking variance 20 | variance_shrinking_normal = tf.random.normal(shape=[1000], mean=0, stddev=1-k) 21 | # Record that distribution too 22 | tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal, step=step) 23 | tf.summary.scalar("normal/variance_shrinking_normal", variance_shrinking_normal[-1], step=step) 24 | 25 | # Let's combine both of those distributions into one dataset 26 | normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0) 27 | # We add another histogram summary to record the combined distribution 28 | tf.summary.histogram("normal/bimodal", normal_combined, step=step) 29 | tf.summary.scalar("normal/normal_combined", normal_combined[0], step=step) 30 | 31 | # Add a gamma distribution 32 | gamma = tf.random.gamma(shape=[1000], alpha=k) 33 | tf.summary.histogram("gamma", gamma, step=step) 34 | 35 | # And a poisson distribution 36 | poisson = tf.random.poisson(shape=[1000], lam=k) 37 | tf.summary.histogram("poisson", poisson, step=step) 38 | 39 | # And a uniform distribution 40 | uniform = tf.random.uniform(shape=[1000], maxval=k*10) 41 | tf.summary.histogram("uniform", uniform, step=step) 42 | 43 | # Finally, combine everything together! 
44 | all_distributions = [mean_moving_normal, variance_shrinking_normal, gamma, poisson, uniform] 45 | all_combined = tf.concat(all_distributions, 0) 46 | tf.summary.histogram("all_combined", all_combined, step=step) 47 | 48 | # Log text value 49 | tf.summary.text("this is a test", "This is the content", step=step) 50 | 51 | # convert to 4d [batch, col, row, RGB-channels] 52 | image_open = Image.open(os.path.join('..', '..', 'reporting', 'data_samples', 'picasso.jpg')) 53 | image = np.asarray(image_open) 54 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis] 55 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2) 56 | image_rgba = image_rgba[np.newaxis, :, :, :] 57 | image = image[np.newaxis, :, :, :] 58 | 59 | tf.summary.image("test", image, max_outputs=10, step=step) 60 | tf.summary.image("test_gray", image_gray, max_outputs=10, step=step) 61 | tf.summary.image("test_rgba", image_rgba, max_outputs=10, step=step) 62 | 63 | 64 | task = Task.init(project_name='examples', task_name='tensorboard toy example') 65 | 66 | # create the tensorboard file writer in a temp folder 67 | writer = tf.summary.create_file_writer(os.path.join(gettempdir(), "toy_tb_example")) 68 | 69 | # Setup a loop and write the summaries to disk 70 | N = 40 71 | for step in range(N): 72 | k_val = step/float(N) 73 | with writer.as_default(): 74 | generate_summary(k_val, tf.cast(step, tf.int64)) 75 | 76 | print('Tensorboard toy example done') 77 | -------------------------------------------------------------------------------- /ml/clearml/wandb/latest-run: -------------------------------------------------------------------------------- 1 | run-20210201_173509-jrmpee7z -------------------------------------------------------------------------------- /ml/clearml/wandb/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | tensorboardX 3 | tensorboard>=1.14.0 4 | torch>=1.1.0 5 | torchvision>=0.3.0 6 | clearml 7 | wandb -------------------------------------------------------------------------------- /ml/clearml/xgboost/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib >= 3.1.1 ; python_version >= '3.6' 2 | matplotlib >= 2.2.4 ; python_version < '3.6' 3 | sklearn 4 | trains 5 | xgboost>=0.90 ; python_version >= '3' 6 | xgboost>=0.82 ; python_version < '3' 7 | # sudo apt-get install graphviz 8 | graphviz>=0.8 9 | -------------------------------------------------------------------------------- /ml/clearml/xgboost/xgboost_sample.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import xgboost as xgb 3 | from sklearn import datasets 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.model_selection import train_test_split 6 | from xgboost import plot_tree 7 | 8 | from trains import Task 9 | 10 | task = Task.init(project_name='examples', task_name='XGBoost simple example') 11 | iris = datasets.load_iris() 12 | X = iris.data 13 | y = iris.target 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 15 | dtrain = xgb.DMatrix(X_train, label=y_train) 16 | dtest = xgb.DMatrix(X_test, label=y_test) 17 | param = { 18 | 'max_depth': 3, # the maximum depth of each tree 19 | 'eta': 0.3, # the training step for each iteration 20 | 'silent': 1, # logging mode - quiet 21 | 'objective': 'multi:softprob', # error evaluation for 
multiclass training 22 | 'num_class': 3} # the number of classes that exist in this dataset 23 | num_round = 20 # the number of training iterations 24 | 25 | # noinspection PyBroadException 26 | try: 27 | # try to load a model 28 | bst = xgb.Booster(params=param, model_file='xgb.01.model') 29 | bst.load_model('xgb.01.model') 30 | except Exception: 31 | bst = None 32 | 33 | # if we don't have one, train a model 34 | if bst is None: 35 | bst = xgb.train(param, dtrain, num_round) 36 | 37 | # store trained model v1 38 | bst.save_model('xgb.01.model') 39 | bst.dump_model('xgb.01.raw.txt') 40 | 41 | # build classifier 42 | model = xgb.XGBClassifier() 43 | model.fit(X_train, y_train) 44 | 45 | # store trained classifier model 46 | model.save_model('xgb.02.model') 47 | 48 | # make predictions for test data 49 | y_pred = model.predict(X_test) 50 | predictions = [round(value) for value in y_pred] 51 | 52 | # evaluate predictions 53 | accuracy = accuracy_score(y_test, predictions) 54 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) 55 | labels = dtest.get_label() 56 | 57 | # plot results 58 | xgb.plot_importance(model) 59 | plot_tree(model) 60 | plt.show() 61 | -------------------------------------------------------------------------------- /ml/document-clustering/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | Document Clustering with Python 3 4 | 5 | This is my revision of the great tutorial at http://brandonrose.org/clustering - many thanks to the author. 6 | 7 | ## TL;DR 8 | **Data**: Top 100 movies (http://www.imdb.com/list/ls055592025/) with title, genre, and synopsis (IMDB and Wiki) 9 | 10 | **Goal**: Put 100 movies into 5 clusters by text-mining their synopses and plot the result as follows 11 | 12 | (screenshot: the 100 movies plotted in 5 clusters) 13 | 14 | ## Setup 15 | 16 | First, clone the repo, go to the repo folder, set up the virtual environment, and install the required packages: 17 | 18 | ``` 19 | $ cd path_to_document-clustering 20 | $ virtualenv -p python3 venv 21 | $ source venv/bin/activate 22 | $ pip3 install -r requirements.txt 23 | ``` 24 | Second, use nltk.download() to download all nltk packages (a GUI will open and you can choose to install all packages: ~3.5G), which are saved to /Users/your_mac_username/nltk_data 25 | 26 | ``` 27 | ipython 28 | import nltk 29 | nltk.download() 30 | ``` 31 | 32 | Lastly, run `$ jupyter notebook` to go over the tutorial step-by-step. 33 | 34 | ## Key Steps 35 | 1. **Read data**: read titles, genres, synopses, rankings into four arrays 36 | 2. **Tokenize and stem**: break paragraphs into sentences, then into words, and stem the words (without removing stopwords) - each synopsis essentially becomes a bag of stemmed words. 37 | 3. **Generate tf-idf matrix**: each row is a term (unigram, bigram, trigram...generated from the bag of words in 2.), each column is a synopsis. 38 | 4. **Generate clusters**: based on the tf-idf matrix, 5 (or any number of) clusters are generated using k-means. The top key terms are selected for each cluster. 39 | 5. **Calculate similarity**: generate the cosine similarity matrix using the tf-idf matrix (100x100), then generate the distance matrix (1 - similarity matrix), so each pair of synopses has a distance number between 0 and 1. 40 | 6. **Plot clusters**: use multidimensional scaling (MDS) to convert the distance matrix to a 2-dimensional array; each synopsis gets an (x, y) that represents its relative location based on the distance matrix. 
Plot the 100 points with their (x, y) using matplotlib (I added an example on using plotly.js). 41 | -------------------------------------------------------------------------------- /ml/document-clustering/data/genres_list.txt: -------------------------------------------------------------------------------- 1 | [u' Crime', u' Drama'] 2 | [u' Crime', u' Drama'] 3 | [u' Biography', u' Drama', u' History'] 4 | [u' Biography', u' Drama', u' Sport'] 5 | [u' Drama', u' Romance', u' War'] 6 | [u' Drama'] 7 | [u' Drama', u' Romance', u' War'] 8 | [u' Drama', u' Mystery'] 9 | [u' Adventure', u' Family', u' Fantasy', u' Musical'] 10 | [u' Drama', u' Romance'] 11 | [u' Adventure', u' Biography', u' Drama', u' History', u' War'] 12 | [u' Crime', u' Drama'] 13 | [u' Horror', u' Mystery', u' Thriller'] 14 | [u' Drama', u' Film-Noir'] 15 | [u' Mystery', u' Romance', u' Thriller'] 16 | [u' Crime', u' Drama'] 17 | [u' Drama', u' Romance'] 18 | [u' Biography', u' Drama', u' Family', u' Musical', u' Romance'] 19 | [u' Crime', u' Drama', u' Musical', u' Romance', u' Thriller'] 20 | [u' Action', u' Adventure', u' Fantasy', u' Sci-Fi'] 21 | [u' Adventure', u' Family', u' Sci-Fi'] 22 | [u' Mystery', u' Sci-Fi'] 23 | [u' Crime', u' Drama', u' Thriller'] 24 | [u' Drama', u' Mystery', u' Thriller'] 25 | [u' Adventure', u' Drama', u' War'] 26 | [u' Comedy', u' Musical', u' Romance'] 27 | [u' Drama', u' Family', u' Fantasy'] 28 | [u' Comedy'] 29 | [u' Drama'] 30 | [u' Comedy', u' War'] 31 | [u' Biography', u' Drama', u' Music'] 32 | [u' Drama', u' War'] 33 | [u' Biography', u' Drama', u' History'] 34 | [u' Adventure', u' Fantasy'] 35 | [u' Action', u' Drama'] 36 | [u' Drama', u' Romance', u' War'] 37 | [u' Action', u' Drama', u' War'] 38 | [u' Western'] 39 | [u' Action', u' Adventure'] 40 | [u' Drama', u' Sport'] 41 | [u' Drama'] 42 | [u' Comedy', u' Romance'] 43 | [u' Drama'] 44 | [u' Musical', u' Romance'] 45 | [u' Drama', u' Romance', u' War'] 46 | [u' Drama', u' Family', u' Musical', u' Romance'] 47 | [u' Adventure', u' Drama'] 48 | [u' Drama', u' Romance', u' War'] 49 | [u' Biography', u' Drama', u' War'] 50 | [u' Drama', u' Thriller'] 51 | [u' Action', u' Biography', u' Drama', u' History', u' War'] 52 | [u' Western'] 53 | [u' Biography', u' Crime', u' Western'] 54 | [u' Action', u' Adventure', u' Drama', u' Western'] 55 | [u' Comedy', u' Drama', u' Romance'] 56 | [u' Drama', u' War'] 57 | [u' Western'] 58 | [u' Adventure', u' Drama', u' Western'] 59 | [u' Biography', u' Drama', u' War'] 60 | [u' Biography', u' Crime', u' Drama'] 61 | [u' Horror'] 62 | [u' Drama', u' War'] 63 | [u' Drama', u' War'] 64 | [u' Action', u' Crime', u' Thriller'] 65 | [u' Comedy', u' Drama', u' Romance'] 66 | [u' Biography', u' Drama', u' History'] 67 | [u' Comedy', u' Romance'] 68 | [u' Drama', u' Romance'] 69 | [u' Drama'] 70 | [u' Drama'] 71 | [u' Drama'] 72 | [u' Comedy', u' Drama', u' Romance'] 73 | [u' Biography', u' Drama', u' Romance'] 74 | [u' Drama'] 75 | [u' Comedy', u' Drama'] 76 | [u' Comedy', u' Drama', u' Romance'] 77 | [u' Crime', u' Drama', u' Thriller'] 78 | [u' Drama', u' Romance'] 79 | [u' Drama'] 80 | [u' Drama', u' Romance', u' Western'] 81 | [u' Crime', u' Drama', u' Fantasy', u' Mystery'] 82 | [u' Drama', u' Sci-Fi'] 83 | [u' Drama'] 84 | [u' Drama', u' Music'] 85 | [u' Comedy', u' Drama', u' Romance'] 86 | [u' Comedy', u' Drama'] 87 | [u' Crime', u' Drama', u' Thriller'] 88 | [u' Adventure', u' Romance', u' War'] 89 | [u' Adventure', u' Western'] 90 | [u' Adventure', u' Drama', u' History'] 91 | [u' Drama', 
u' Film-Noir', u' Mystery'] 92 | [u' Crime', u' Drama', u' Sci-Fi'] 93 | [u' Crime', u' Drama'] 94 | [u' Drama', u' Romance'] 95 | [u' Crime', u' Drama', u' Film-Noir', u' Thriller'] 96 | [u' Drama'] 97 | [u' Mystery', u' Thriller'] 98 | [u' Film-Noir', u' Mystery', u' Thriller'] 99 | [u' Mystery', u' Thriller'] 100 | [u' Biography', u' Drama', u' Musical'] 101 | -------------------------------------------------------------------------------- /ml/document-clustering/data/title_list.txt: -------------------------------------------------------------------------------- 1 | The Godfather 2 | The Shawshank Redemption 3 | Schindler's List 4 | Raging Bull 5 | Casablanca 6 | One Flew Over the Cuckoo's Nest 7 | Gone with the Wind 8 | Citizen Kane 9 | The Wizard of Oz 10 | Titanic 11 | Lawrence of Arabia 12 | The Godfather: Part II 13 | Psycho 14 | Sunset Blvd. 15 | Vertigo 16 | On the Waterfront 17 | Forrest Gump 18 | The Sound of Music 19 | West Side Story 20 | Star Wars 21 | E.T. the Extra-Terrestrial 22 | 2001: A Space Odyssey 23 | The Silence of the Lambs 24 | Chinatown 25 | The Bridge on the River Kwai 26 | Singin' in the Rain 27 | It's a Wonderful Life 28 | Some Like It Hot 29 | 12 Angry Men 30 | Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb 31 | Amadeus 32 | Apocalypse Now 33 | Gandhi 34 | The Lord of the Rings: The Return of the King 35 | Gladiator 36 | From Here to Eternity 37 | Saving Private Ryan 38 | Unforgiven 39 | Raiders of the Lost Ark 40 | Rocky 41 | A Streetcar Named Desire 42 | The Philadelphia Story 43 | To Kill a Mockingbird 44 | An American in Paris 45 | The Best Years of Our Lives 46 | My Fair Lady 47 | Ben-Hur 48 | Doctor Zhivago 49 | Patton 50 | Jaws 51 | Braveheart 52 | The Good, the Bad and the Ugly 53 | Butch Cassidy and the Sundance Kid 54 | The Treasure of the Sierra Madre 55 | The Apartment 56 | Platoon 57 | High Noon 58 | Dances with Wolves 59 | The Pianist 60 | Goodfellas 61 | The Exorcist 62 | The Deer Hunter 63 | All Quiet on the Western Front 64 | The French Connection 65 | City Lights 66 | The King's Speech 67 | It Happened One Night 68 | A Place in the Sun 69 | Midnight Cowboy 70 | Mr. 
Smith Goes to Washington 71 | Rain Man 72 | Annie Hall 73 | Out of Africa 74 | Good Will Hunting 75 | Terms of Endearment 76 | Tootsie 77 | Fargo 78 | Giant 79 | The Grapes of Wrath 80 | Shane 81 | The Green Mile 82 | Close Encounters of the Third Kind 83 | Network 84 | Nashville 85 | The Graduate 86 | American Graffiti 87 | Pulp Fiction 88 | The African Queen 89 | Stagecoach 90 | Mutiny on the Bounty 91 | The Maltese Falcon 92 | A Clockwork Orange 93 | Taxi Driver 94 | Wuthering Heights 95 | Double Indemnity 96 | Rebel Without a Cause 97 | Rear Window 98 | The Third Man 99 | North by Northwest 100 | Yankee Doodle Dandy 101 | -------------------------------------------------------------------------------- /ml/document-clustering/requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | jupyter 3 | matplotlib 4 | nltk 5 | pandas 6 | scikit-learn 7 | scipy 8 | -------------------------------------------------------------------------------- /ml/feature-importance/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is my revised code of the tutorial at: 4 | - https://machinelearningmastery.com/calculate-feature-importance-with-python/ 5 | - https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/ 6 | 7 | ## Setup 8 | 9 | For xgboost to work, run `brew install libomp` on Mac first. 10 | 11 | ``` 12 | python3 -m venv venv 13 | source venv/bin/activate 14 | pip install -r requirements.txt 15 | ``` 16 | Then, you can run `jupyter notebook` or use VS Code (`code .`) to open the notebooks. 17 | 18 | -------------------------------------------------------------------------------- /ml/feature-importance/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | matplotlib 3 | pandas 4 | scikit-learn 5 | xgboost 6 | -------------------------------------------------------------------------------- /ml/few-shot-learning/.gitignore: -------------------------------------------------------------------------------- 1 | /datasets/omniglot/data 2 | /model 3 | /logs -------------------------------------------------------------------------------- /ml/few-shot-learning/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is my revised code of the tutorial at: https://medium.com/@barnrang/re-implementation-of-the-prototypical-network-for-few-shot-learning-using-tensorflow-2-0-keras-b2adac8e49e0 4 | 5 | The related paper is: Jake Snell, Kevin Swersky, and Richard S. Zemel (2017). Prototypical Networks for Few-shot Learning. CoRR, abs/1703.05175. https://arxiv.org/abs/1703.05175 6 | ## Setup 7 | 8 | Set up the virtual environment and install the packages: 9 | 10 | ``` 11 | python3 -m venv venv 12 | source venv/bin/activate 13 | pip install -r requirements.txt 14 | ``` 15 | Then, you can run `jupyter notebook` or use VS Code (`code .`) to open the notebooks. 16 | 17 | ## Prepare Datasets 18 | Download the dataset at https://drive.google.com/file/d/1UQEdAv4g_Mh2t15YtorNHkoHfQZJfmoE/view?usp=sharing 19 | 20 | For the omniglot dataset: 21 | 22 | ``` 23 | cd datasets/omniglot 24 | mkdir data 25 | unzip images_background.zip -d data/ 26 | unzip images_evaluation.zip -d data/ 27 | mv data/images_evaluation/* data/images_background/ 28 | python dataloader_omniglot.py 29 | ``` 30 | 31 | Note that we split 1200 * 4 (each class rotated in 4 directions) classes for training and the rest for the test set. 
The datasets are collected into numpy (.npy) files. 32 | 33 | ## Train and Test 34 | 35 | To train: 36 | 37 | In the root folder of this repo, run `python train_omniglot.py` to train 2 epochs by default (about 10 minutes on a MacBook Pro). 38 | 39 | You can use different arguments: 40 | 41 | - `python train_omniglot.py --epochs 100` 42 | - `python train_omniglot.py --train_way 60 --train_query 5 --val_way 20 --shot 1 --gpu 0` (the `--gpu` flag specifies which GPU to use) 43 | 44 | Temporary checkpoints (with the format `omniglot_conv_{epoch}_{shot}_{val_way}`) and the final model `omniglot_conv` are saved in the `/model` folder (ignored by git). 45 | 46 | To visualize training with TensorBoard, run `tensorboard --logdir=./logs --port=6006` 47 | 48 | Then, you can access TensorBoard at http://localhost:6006/ 49 | 50 | (screenshot: TensorBoard training dashboard) 51 | 52 | To test: 53 | 54 | `python test_omniglot.py --model model/omniglot_conv --shot 1 --test_way 20` 55 | 56 | 57 | -------------------------------------------------------------------------------- /ml/few-shot-learning/datasets/mini_imagenet/dataloader_mini_imagenet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from skimage.io import imread 4 | from skimage.transform import resize as imresize 5 | import os 6 | 7 | train_label = pd.read_csv('train.csv') 8 | val_label = pd.read_csv('val.csv') 9 | test_label = pd.read_csv('test.csv') 10 | 11 | train_images = [] 12 | 13 | PATH = 'images' 14 | 15 | for name, df in train_label['filename'].groupby(train_label['label']): 16 | images = [] 17 | for image_name in df.values: 18 | image = imread(os.path.join(PATH, image_name)) 19 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8) 20 | images.append(image) 21 | 22 | train_images.append(images) 23 | 24 | val_images = [] 25 | 26 | PATH = 'images' 27 | 28 | for name, df in val_label['filename'].groupby(val_label['label']): 29 | images = [] 30 | for image_name in df.values: 31 | image = imread(os.path.join(PATH, image_name)) 32 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8) 33 | images.append(image) 34 | 35 | val_images.append(images) 36 | 37 | test_images = [] 38 | 39 | PATH = 'images' 40 | 41 | for name, df in test_label['filename'].groupby(test_label['label']): 42 | images = [] 43 | for image_name in df.values: 44 | image = imread(os.path.join(PATH, image_name)) 45 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8) 46 | images.append(image) 47 | 48 | test_images.append(images) 49 | 50 | train_images = np.array(train_images) 51 | 52 | val_images = np.array(val_images) 53 | test_images = np.array(test_images) 54 | 55 | np.save('mini_train', train_images) 56 | np.save('mini_val', val_images) 57 | np.save('mini_test', test_images) 58 | 59 | -------------------------------------------------------------------------------- /ml/few-shot-learning/datasets/omniglot/dataloader_omniglot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import matplotlib.pyplot as plt 4 | from tqdm import tqdm 5 | from skimage.transform import resize as imresize 6 | from skimage.transform import rotate 7 | BASE_PATH = "data/images_background" 8 | TRAIN_CLASS = 1200 9 | 10 | 11 | def loader(path=None): 12 | index = 0 13 | train_images = [] 14 | eval_images = [] 15 | current_save = train_images 16 | if path is None: 17 | path = BASE_PATH 18 | folders_list = os.listdir(path) 19 | folders_list.sort() 20 | count = 
0 21 | loading_eval = False 22 | for folder in tqdm(folders_list): 23 | path1 = os.path.join(path, folder) 24 | try: #In case of invalid folder 25 | for char_type in os.listdir(path1): 26 | if not loading_eval and count >= 1200: 27 | loading_eval = True 28 | current_save = eval_images 29 | print("Start to collect eval") 30 | 31 | path2 = os.path.join(path1, char_type) 32 | try: 33 | for rot in [0,90,180,270]: 34 | class_image = [] 35 | for image_name in os.listdir(path2): 36 | image = plt.imread(os.path.join(path2, image_name)) 37 | image = imresize(image,(28,28), anti_aliasing=False) 38 | image = rotate(image, rot) 39 | image = np.expand_dims(image, axis=-1) 40 | class_image.append(image) 41 | current_save.append(class_image) 42 | count += 1 43 | except NotADirectoryError: 44 | print(f"Cannot load from {path2}") 45 | except NotADirectoryError: 46 | print(f"cannot load from {path1}") 47 | continue 48 | 49 | np.save(f"./data/train_omniglot.npy", (np.array(train_images) * 255).astype(np.uint8)) 50 | np.save(f"./data/test_omniglot.npy", (np.array(eval_images) * 255).astype(np.uint8)) 51 | 52 | 53 | if __name__ == "__main__": 54 | images = loader() 55 | -------------------------------------------------------------------------------- /ml/few-shot-learning/loader_omniglot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow 4 | from tensorflow import keras 5 | 6 | 7 | class DataGenerator(tensorflow.keras.utils.Sequence): 8 | 'Generates data for Keras' 9 | def __init__(self, data_type='train', dim=(28,28), n_channels=1, 10 | way=20, shot=1, query=1, num_batch=500): 11 | 'Initialization' 12 | self.type = data_type 13 | # if self.type == 'train': 14 | # self.is_training = np.array([True for _ in range(batch_size)]) 15 | # else: 16 | # self.is_training = np.array([False for _ in range(batch_size)]) 17 | self.dim = dim 18 | #self.batch_size = batch_size 19 | self.n_channels = n_channels 20 | self.num_per_class = 20 21 | self.num_batch = num_batch 22 | #self.y_target = np.zeros(self.batch_size) 23 | self.build_data(self.type) 24 | self.on_epoch_end() 25 | self.way = way 26 | self.shot = shot 27 | self.query = query 28 | #TODO!!!! 
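        # (descriptive note) each __getitem__ call yields one few-shot episode:
        # `way` classes are sampled, and for each class `shot` support images and
        # `query` query images are drawn, giving X_sample of shape (way, shot, 28, 28, 1),
        # X_query of shape (way, query, 28, 28, 1), and one-hot labels over the `way` classes.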
29 | #self.hard_batch = np.zeros(batch_size, *dim, n_channels) 30 | 31 | def build_data(self, data_type): 32 | if data_type == 'train': 33 | self.class_data = np.load('datasets/omniglot/data/train_omniglot.npy') 34 | else: 35 | self.class_data = np.load('datasets/omniglot/data/test_omniglot.npy') 36 | 37 | self.n_classes = len(self.class_data) 38 | 39 | def __len__(self): 40 | 'Denotes the number of batches per epoch' 41 | return self.num_batch 42 | 43 | def __getitem__(self, index): 44 | 'Generate one batch of data' 45 | # Generate data 46 | X_sample, X_query, label = self.__data_generation() 47 | #way = np.ones((self.way * self.shot, 1)) * self.way 48 | 49 | 50 | return [X_sample, X_query], label 51 | 52 | def on_epoch_end(self): 53 | 'Updates indexes after each epoch' 54 | pass 55 | 56 | def __data_generation(self): 57 | 'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels) 58 | # Initialization 59 | X_sample = np.empty((self.way, self.shot, *self.dim, self.n_channels)) 60 | X_query = np.empty((self.way, self.query, *self.dim, self.n_channels)) 61 | chosen_class = random.sample(range(self.n_classes), self.way) 62 | label = np.empty(self.way * self.query) 63 | # print(pos, neg) 64 | # print(self.class_data[pos][0].shape) 65 | # Generate data 66 | for i in range(self.way): 67 | sample_idx = random.sample(range(self.num_per_class), self.shot + self.query) 68 | sample_data = self.class_data[chosen_class[i]][sample_idx]/255. 69 | X_sample[i] = sample_data[:self.shot] 70 | X_query[i] = sample_data[self.shot:self.shot + self.query] 71 | label[i * self.query: (i+1) * self.query] = i 72 | return X_sample, X_query, keras.utils.to_categorical(label) 73 | #return X, keras.utils.to_categorical(y, num_classes=self.n_classes) 74 | -------------------------------------------------------------------------------- /ml/few-shot-learning/mini_imagenet/mini_proto_model.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, Conv2D, Lambda, Dense, Flatten,MaxPooling2D, Activation 2 | from tensorflow.keras.layers import BatchNormalization 3 | from tensorflow.keras.models import Model, Sequential 4 | from tensorflow.keras.regularizers import l2 5 | from tensorflow.keras import backend as K 6 | from tensorflow.keras.optimizers import SGD,Adam 7 | from tensorflow.keras.losses import binary_crossentropy 8 | import tensorflow as tf 9 | import numpy.random as rng 10 | import numpy as np 11 | import os 12 | import matplotlib.pyplot as plt 13 | eps = 1e-12 14 | 15 | def W_init(shape,name=None): 16 | """Initialize weights as in paper""" 17 | values = rng.normal(loc=0,scale=1e-2,size=shape) 18 | return K.variable(values,name=name) 19 | #//TODO: figure out how to initialize layer biases in tensorflow.keras. 
20 | def b_init(shape,name=None): 21 | """Initialize bias as in paper""" 22 | values=rng.normal(loc=0.5,scale=1e-2,size=shape) 23 | return K.variable(values,name=name) 24 | 25 | input_shape = (84,84, 3) 26 | 27 | #build convnet to use in each siamese 'leg' 28 | def conv_net(): 29 | convnet = Sequential() 30 | for i in range(4): 31 | convnet.add(Conv2D(64,(3,3),padding='same',input_shape=input_shape)) 32 | convnet.add(BatchNormalization()) 33 | convnet.add(Activation('relu')) 34 | convnet.add(MaxPooling2D()) 35 | convnet.add(Flatten()) 36 | return convnet 37 | 38 | def l1_distance(x,y): 39 | return tf.reduce_sum(tf.maximum(tf.abs(x-y),eps), axis=1, keep_dims=True) 40 | 41 | def l2_distance(x,y): 42 | return tf.sqrt(tf.reduce_sum(tf.maximum(tf.square(x-y),eps), axis=1, keep_dims=True)) 43 | 44 | def hinge_loss(target, pred, h=1.): 45 | loss = tf.reduce_mean(tf.maximum(pred + h, 0.)) 46 | return loss 47 | 48 | def acc(target, pred): 49 | result = tf.cast(tf.less(pred, target), dtype=tf.float32) 50 | return tf.reduce_mean(result) 51 | -------------------------------------------------------------------------------- /ml/few-shot-learning/mini_imagenet/mini_proto_test.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | def parser(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--test_way', dest='test_way', type=int, default=5) 7 | parser.add_argument('--shot', dest='shot', type=int, default=1) 8 | parser.add_argument('--gpu', dest='gpu', type=int, default=0) 9 | parser.add_argument('--model', dest='model') 10 | 11 | return parser.parse_args() 12 | 13 | args = parser() 14 | os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu) 15 | 16 | from tensorflow.keras import callbacks as cb 17 | from tensorflow.keras.optimizers import Adam 18 | from tensorflow.keras.models import load_model, Model, save_model 19 | from tensorflow.keras.layers import * 20 | from tensorflow.keras.models import Sequential 21 | from tensorflow.keras import regularizers as rg 22 | from tensorflow.keras.preprocessing.image import ImageDataGenerator 23 | from tensorflow.keras.applications.xception import Xception 24 | from tensorflow.keras import backend as K 25 | 26 | 27 | import numpy.random as rng 28 | 29 | import numpy as np 30 | import matplotlib.pyplot as plt 31 | import matplotlib.image as img 32 | import random 33 | from python.dataloader import loader 34 | from mini_protoloader import DataGenerator 35 | from mini_proto_model import conv_net, hinge_loss, l2_distance, acc, l1_distance 36 | #from transform import transform_gate 37 | from util.tensor_op import * 38 | from util.loss import * 39 | input_shape = (None,84,84,3) 40 | batch_size = 20 41 | test_way = args.test_way 42 | shot = args.shot 43 | model_path = args.model 44 | lr = 0.002 45 | 46 | def scheduler(epoch): 47 | global lr 48 | if epoch % 15 == 0: 49 | lr /= 2 50 | return lr 51 | 52 | class SaveConv(tf.keras.callbacks.Callback): 53 | def on_epoch_end(self, epoch, logs=None): 54 | if epoch % 5 == 0: 55 | save_model(conv, f"model/miniimage_conv_{epoch}_{shot}_{val_way}") 56 | 57 | 58 | if __name__ == "__main__": 59 | #conv = conv_net() 60 | conv = load_model(model_path) 61 | sample = Input(input_shape) 62 | conv_5d = TimeDistributed(conv) 63 | out_feature = conv_5d(sample) 64 | out_feature = Lambda(reduce_tensor)(out_feature) 65 | inp = Input(input_shape) 66 | map_feature = conv_5d(inp) 67 | map_feature = Lambda(reshape_query)(map_feature) 68 | pred = 
Lambda(proto_dist)([out_feature, map_feature]) #negative distance 69 | combine = Model([sample, inp], pred) 70 | 71 | optimizer = Adam(0.001) 72 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer, 73 | metrics=['categorical_accuracy']) 74 | test_loader = DataGenerator(data_type='test',way=test_way, shot=shot, num_batch=10000) 75 | 76 | combine.evaluate(test_loader) 77 | 78 | 79 | -------------------------------------------------------------------------------- /ml/few-shot-learning/mini_imagenet/mini_protoloader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.utils import np_utils 3 | from tensorflow.keras.preprocessing.image import ImageDataGenerator 4 | import tensorflow 5 | import keras 6 | import random 7 | from python.dataloader import loader 8 | 9 | class DataGenerator(tensorflow.keras.utils.Sequence): 10 | 'Generates data for Keras' 11 | def __init__(self, data_type='train', dim=(84,84), n_channels=3, 12 | way=5, shot=1, query=5, num_batch=500): 13 | 'Initialization' 14 | self.type = data_type 15 | # if self.type == 'train': 16 | # self.is_training = np.array([True for _ in range(batch_size)]) 17 | # else: 18 | # self.is_training = np.array([False for _ in range(batch_size)]) 19 | self.dim = dim 20 | #self.batch_size = batch_size 21 | self.n_channels = n_channels 22 | self.num_per_class = 600 23 | self.num_batch = num_batch 24 | #self.y_target = np.zeros(self.batch_size) 25 | self.build_data(self.type) 26 | self.on_epoch_end() 27 | self.way = way 28 | self.shot = shot 29 | self.query = query 30 | self.transformer = ImageDataGenerator( 31 | width_shift_range=0.1, 32 | height_shift_range=0.1, 33 | zoom_range=0.2, 34 | rotation_range=30, 35 | horizontal_flip=True, 36 | shear_range=0.1 37 | 38 | ) 39 | #TODO!!!! 40 | #self.hard_batch = np.zeros(batch_size, *dim, n_channels) 41 | 42 | def build_data(self, data_type): 43 | if data_type == 'train': 44 | self.class_data = np.load('python/mini_train.npy') 45 | elif data_type == 'val': 46 | self.class_data = np.load('python/mini_val.npy') 47 | else: 48 | self.class_data = np.load('python/mini_test.npy') 49 | 50 | self.n_classes = len(self.class_data) 51 | 52 | def __len__(self): 53 | 'Denotes the number of batches per epoch' 54 | return self.num_batch 55 | 56 | def __getitem__(self, index): 57 | 'Generate one batch of data' 58 | # Generate data 59 | X_sample, X_query, label = self.__data_generation() 60 | #way = np.ones((self.way * self.shot, 1)) * self.way 61 | 62 | 63 | return [X_sample, X_query], label 64 | 65 | def on_epoch_end(self): 66 | 'Updates indexes after each epoch' 67 | pass 68 | 69 | def __data_generation(self): 70 | 'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels) 71 | # Initialization 72 | X_sample = np.empty((self.way, self.shot, *self.dim, self.n_channels)) 73 | X_query = np.empty((self.way, self.query, *self.dim, self.n_channels)) 74 | chosen_class = random.sample(range(self.n_classes), self.way) 75 | label = np.empty(self.way * self.query) 76 | # print(pos, neg) 77 | # print(self.class_data[pos][0].shape) 78 | # Generate data 79 | for i in range(self.way): 80 | sample_idx = random.sample(range(self.num_per_class), self.shot + self.query) 81 | sample_data = self.class_data[chosen_class[i]][sample_idx]/255. 
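            # (descriptive note) the `if True:` below always takes the first branch, so the
            # ImageDataGenerator augmentations in the unreachable else-branch are effectively
            # disabled and the support/query images are used unaugmented.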
82 | if True: 83 | #if self.type != 'train': 84 | X_sample[i] = sample_data[:self.shot] 85 | X_query[i] = sample_data[self.shot:self.shot + self.query] 86 | else: 87 | for j in range(self.shot): 88 | params = self.transformer.get_random_transform(self.dim + (self.n_channels,)) 89 | x = self.transformer.apply_transform(sample_data[j], params) 90 | X_sample[i][j] = x 91 | 92 | for j in range(self.shot, self.shot + self.query): 93 | params = self.transformer.get_random_transform(self.dim + (self.n_channels,)) 94 | x = self.transformer.apply_transform(sample_data[j], params) 95 | X_query[i][j-self.shot] = x 96 | 97 | label[i * self.query: (i+1) * self.query] = i 98 | return X_sample, X_query, np_utils.to_categorical(label) 99 | #return X, keras.utils.to_categorical(y, num_classes=self.n_classes) 100 | -------------------------------------------------------------------------------- /ml/few-shot-learning/model_omniglot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.layers import Input, Conv2D, Lambda, Dense, Flatten, MaxPooling2D, Activation, BatchNormalization 4 | from tensorflow.keras.models import Model, Sequential 5 | 6 | 7 | eps = 1e-12 8 | 9 | def W_init(shape,name=None): 10 | """Initialize weights as in paper""" 11 | values = np.random.normal(loc=0, scale=1e-2, size=shape) 12 | return tf.variable(values, name=name) 13 | 14 | 15 | #//TODO: figure out how to initialize layer biases in tensorflow.keras. 16 | def b_init(shape, name=None): 17 | """Initialize bias as in paper""" 18 | values=np.random.normal(loc=0.5, scale=1e-2, size=shape) 19 | return tf.variable(values, name=name) 20 | 21 | input_shape = (28, 28, 1) 22 | 23 | 24 | #build convnet to use in each siamese 'leg' 25 | def conv_net(): 26 | convnet = Sequential() 27 | for i in range(4): 28 | convnet.add(Conv2D(64,(3,3),padding='same', input_shape=input_shape)) 29 | convnet.add(BatchNormalization()) 30 | convnet.add(Activation('relu')) 31 | convnet.add(MaxPooling2D()) 32 | convnet.add(Flatten()) 33 | return convnet 34 | 35 | 36 | def l1_distance(x,y): 37 | return tf.reduce_sum(tf.maximum(tf.abs(x-y),eps), axis=1, keep_dims=True) 38 | 39 | 40 | def l2_distance(x,y): 41 | return tf.sqrt(tf.reduce_sum(tf.maximum(tf.square(x-y),eps), axis=1, keep_dims=True)) 42 | 43 | 44 | def hinge_loss(target, pred, h=1.): 45 | loss = tf.reduce_mean(tf.maximum(pred + h, 0.)) 46 | return loss 47 | 48 | 49 | def acc(target, pred): 50 | result = tf.cast(tf.less(pred, target), dtype=tf.float32) 51 | return tf.reduce_mean(result) 52 | -------------------------------------------------------------------------------- /ml/few-shot-learning/notebooks/dataloader_notebook/images_background_small2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/few-shot-learning/notebooks/dataloader_notebook/images_background_small2.zip -------------------------------------------------------------------------------- /ml/few-shot-learning/notebooks/dataloader_notebook/loss_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/barnrang/.conda/envs/chatbot/lib/python3.6/importlib/_bootstrap.py:219: 
RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import tensorflow as tf\n", 19 | "from util.loss import prior_dist\n", 20 | "%load_ext autoreload\n", 21 | "%autoreload 2" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "x = tf.placeholder(shape=[None, 2], dtype=tf.float32)\n", 33 | "y = tf.placeholder(shape=[None, 2], dtype=tf.float32)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 6, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Tensor(\"Sum_4:0\", shape=(?, 1), dtype=float32) Tensor(\"Sum_5:0\", shape=(?, 1), dtype=float32)\n", 46 | "Tensor(\"MatMul_2:0\", shape=(?, ?), dtype=float32)\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "z = prior_dist([x,y])" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "[[ 9. 17. 29.]\n", 64 | " [ 1. 1. 5.]]\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "with tf.Session() as sess:\n", 70 | " print(sess.run(z, feed_dict={x:[[1,3],[2,4],[3,5]],\n", 71 | " y:[[1,0],[1,4]]}))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.6.3" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 2 105 | } 106 | -------------------------------------------------------------------------------- /ml/few-shot-learning/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | numpy 3 | scikit-image 4 | tensorflow >= 2.0 5 | tqdm -------------------------------------------------------------------------------- /ml/few-shot-learning/test_omniglot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tensorflow.keras import callbacks as cb 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.models import load_model, Model, save_model 7 | from tensorflow.keras.layers import * 8 | 9 | from loader_omniglot import DataGenerator 10 | from model_omniglot import conv_net 11 | from util.tensor_op import * 12 | from util.loss import * 13 | 14 | 15 | def parser(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--test_way', dest='test_way', type=int, default=5) 18 | parser.add_argument('--shot', dest='shot', type=int, default=1) 19 | parser.add_argument('--gpu', dest='gpu', type=int, default=0) 20 | parser.add_argument('--model', dest='model') 21 | 22 | return parser.parse_args() 23 | 24 | args = parser() 25 | os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu) 26 | test_way = args.test_way 27 | shot = args.shot 28 | model_path = args.model 29 | 30 | 
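# the leading None axis below is the per-class image axis (shot support images, or the query images); TimeDistributed applies the convnet across it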
input_shape = (None, 28, 28, 1) 31 | batch_size = 20 32 | lr = 0.002 33 | 34 | 35 | def scheduler(epoch): 36 | global lr 37 | if epoch % 15 == 0: 38 | lr /= 2 39 | return lr 40 | 41 | 42 | class SaveConv(tf.keras.callbacks.Callback): 43 | def on_epoch_end(self, epoch, logs=None): 44 | if epoch % 5 == 0: 45 | save_model(conv, f"model/omniglot_conv_{epoch}_{shot}_{test_way}") 46 | 47 | 48 | if __name__ == "__main__": 49 | conv = load_model(model_path) 50 | sample = Input(input_shape) 51 | conv_5d = TimeDistributed(conv) 52 | out_feature = conv_5d(sample) 53 | out_feature = Lambda(reduce_tensor)(out_feature) # average support features into one prototype per class 54 | inp = Input(input_shape) 55 | map_feature = conv_5d(inp) 56 | map_feature = Lambda(reshape_query)(map_feature) # flatten query features to (n_queries, d) 57 | pred = Lambda(proto_dist)([out_feature, map_feature]) #negative distance 58 | combine = Model([sample, inp], pred) 59 | 60 | optimizer = Adam(0.001) 61 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer, 62 | metrics=['categorical_accuracy']) 63 | test_loader = DataGenerator(data_type='test', way=test_way, shot=shot, num_batch=10000) 64 | 65 | combine.evaluate(test_loader) -------------------------------------------------------------------------------- /ml/few-shot-learning/train_omniglot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tensorflow.keras import callbacks as cb 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.models import load_model, Model, save_model 7 | from tensorflow.keras.layers import * 8 | 9 | # import from custom modules 10 | from loader_omniglot import DataGenerator 11 | from model_omniglot import conv_net 12 | from util.tensor_op import * 13 | from util.loss import * 14 | 15 | # command line argument parser 16 | def parser(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--train_way', dest='train_way', type=int, default=60) 19 | parser.add_argument('--train_query', dest='train_query', type=int, default=5) 20 | parser.add_argument('--val_way', dest='val_way', type=int, default=20) 21 | parser.add_argument('--shot', dest='shot', type=int, default=1) 22 | parser.add_argument('--gpu', dest='gpu', type=int, default=0) 23 | parser.add_argument('--epochs', dest='epochs', type=int, default=2) 24 | 25 | return parser.parse_args() 26 | 27 | args = parser() 28 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) 29 | 30 | # get values from the command line arguments 31 | train_way = args.train_way 32 | train_query = args.train_query 33 | val_way = args.val_way 34 | shot = args.shot 35 | epochs = args.epochs 36 | 37 | # specify model parameters 38 | input_shape = (None, 28, 28, 1) 39 | batch_size = 20 40 | lr = 0.002 41 | 42 | def scheduler(epoch): 43 | global lr 44 | if epoch % 100 == 0: 45 | lr /= 2 46 | return lr 47 | 48 | class SaveConv(tf.keras.callbacks.Callback): 49 | def on_epoch_end(self, epoch, logs=None): 50 | if epoch % 50 == 0: 51 | save_model(conv, f"model/omniglot_conv_{epoch}_{shot}_{val_way}") 52 | 53 | if __name__ == "__main__": 54 | conv = conv_net() 55 | sample = Input(input_shape) 56 | conv_5d = TimeDistributed(conv) 57 | out_feature = conv_5d(sample) 58 | out_feature = Lambda(reduce_tensor)(out_feature) 59 | inp = Input(input_shape) 60 | map_feature = conv_5d(inp) 61 | map_feature = Lambda(reshape_query)(map_feature) 62 | # proto_dist is from util/loss.py 63 | pred = Lambda(proto_dist)([out_feature, map_feature]) #negative distance 64 | combine = Model([sample, inp], pred) 65 | 66 | 
optimizer = Adam(0.001) 67 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy']) 68 | 69 | train_loader = DataGenerator(way=train_way, query=train_query, shot=shot, num_batch=1000) 70 | val_loader = DataGenerator(data_type='val', way=val_way, shot=shot) 71 | 72 | (x, y), z = train_loader[0] 73 | print(x.shape, y.shape, z.shape) 74 | print(combine.summary()) 75 | 76 | save_conv = SaveConv() 77 | reduce_lr = cb.ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=2, min_lr=1e-8) 78 | lr_sched = cb.LearningRateScheduler(scheduler) 79 | tensorboard = cb.TensorBoard() 80 | 81 | combine.fit( # fit handles Sequence generators directly in TF 2.x 82 | train_loader, 83 | epochs=epochs, 84 | validation_data=val_loader, 85 | use_multiprocessing=False, 86 | workers=4, 87 | shuffle=False, 88 | callbacks=[save_conv, lr_sched, tensorboard] 89 | ) 90 | 91 | save_model(conv, "model/omniglot_conv") 92 | -------------------------------------------------------------------------------- /ml/few-shot-learning/util/loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | # proto_dist: softmax over negative Euclidean distances from query features to class prototypes 3 | def proto_dist(x): 4 | feature, pred = x 5 | pred_dist = tf.reduce_sum(pred ** 2, axis=1, keepdims=True) 6 | feature_dist = tf.reduce_sum(feature ** 2, axis=1, keepdims=True) 7 | dot = tf.matmul(pred, tf.transpose(feature)) 8 | return tf.nn.softmax(-(tf.sqrt(pred_dist + tf.transpose(feature_dist) - 2 * dot))) # uses ||q - p||^2 = ||q||^2 + ||p||^2 - 2 q.p 9 | 10 | -------------------------------------------------------------------------------- /ml/few-shot-learning/util/tensor_op.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def slice_tensor_and_sum(x, way=20): 4 | sliced = tf.split(x, num_or_size_splits=way, axis=0) 5 | return tf.reduce_mean(sliced, axis=1) # note: despite the name, this averages each split 6 | 7 | def reduce_tensor(x): 8 | return tf.reduce_mean(x, axis=1) 9 | 10 | def reshape_query(x): 11 | return tf.reshape(x, [-1, tf.shape(x)[-1]]) 12 | -------------------------------------------------------------------------------- /ml/fine-tune-pegasus/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | Fine-tune pegasus-large using the XSUM dataset 3 | 4 | Adapted from https://towardsdatascience.com/how-to-perform-abstractive-summarization-with-pegasus-3dd74e48bafb 5 | 6 | Colab version (slightly different from this notebook: it includes the pip installs and uses a batch size of 2): https://colab.research.google.com/drive/1RyUsYDAo6bA1RZICMb-FxYLszBcDY81X?usp=sharing 7 | 8 | ## Setup 9 | 10 | ``` 11 | $ python3 -m venv venv 12 | $ source venv/bin/activate 13 | $ pip install -r requirements.txt 14 | ``` -------------------------------------------------------------------------------- /ml/fine-tune-pegasus/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | SentencePiece 3 | transformers>=4.3.3 4 | datasets>=1.4.1 5 | torch>=1.8.0 6 | -------------------------------------------------------------------------------- /ml/graph/.gitignore: -------------------------------------------------------------------------------- 1 | /logs -------------------------------------------------------------------------------- /ml/graph/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | https://www.analyticsvidhya.com/blog/2020/01/link-prediction-how-to-predict-your-future-connections-on-facebook/ 4 | 
https://www.analyticsvidhya.com/blog/2019/11/graph-feature-extraction-deepwalk/ 5 | https://www.tensorflow.org/tutorials/text/word2vec 6 | 7 | 8 | 9 | ## Setup 10 | 11 | ``` 12 | $ python3 -m venv venv 13 | $ source venv/bin/activate 14 | $ pip install -r requirements.txt 15 | ``` -------------------------------------------------------------------------------- /ml/graph/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas 3 | matplotlib 4 | tensorflow 5 | tqdm 6 | sklearn 7 | networkx 8 | node2vec 9 | lightgbm -------------------------------------------------------------------------------- /ml/greedy-layer-wise-pretraning/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is my revised code of the tutorial at https://machinelearningmastery.com/greedy-layer-wise-pretraining-tutorial/ 4 | 5 | ## Setup 6 | 7 | within the tutorial folder: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 15 | 16 | -------------------------------------------------------------------------------- /ml/greedy-layer-wise-pretraning/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | matplotlib 3 | pandas 4 | scikit-learn 5 | tensorflow 6 | -------------------------------------------------------------------------------- /ml/house-price-prediction/README.md: -------------------------------------------------------------------------------- 1 | ## Kaggle Kernel 2 | 3 | You can run this kernel directly at Kaggle.com: https://www.kaggle.com/harrywang/housing-price-prediction 4 | 5 | ## Run Locally 6 | 7 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages: 8 | 9 | ``` 10 | $ cd path_to_this folder 11 | $ virtualenv -p python3 venv 12 | $ source venv/bin/activate 13 | $ pip3 install -r requirements.txt 14 | ``` 15 | 16 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 17 | 18 | ## Source 19 | 20 | This is the dataset used in this book: https://github.com/ageron/handson-ml/tree/master/datasets/housing to illustrate a sample end-to-end ML project workflow (pipeline). This is a great book - I highly recommend! 21 | 22 | The data is based on California Census in 1990. 23 | 24 | ### About the Data (from the book): 25 | 26 | "This dataset is a modified version of the California Housing dataset available from Luís Torgo's page (University of Porto). Luís Torgo obtained it from the StatLib repository (which is closed now). The dataset may also be downloaded from StatLib mirrors. 27 | 28 | The following is the description from the book author: 29 | 30 | This dataset appeared in a 1997 paper titled Sparse Spatial Autoregressions by Pace, R. Kelley and Ronald Barry, published in the Statistics and Probability Letters journal. They built it using the 1990 California census data. It contains one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people). 31 | 32 | The dataset in this directory is almost identical to the original, with two differences: 33 | 207 values were randomly removed from the total_bedrooms column, so we can discuss what to do with missing data. 
34 | An additional categorical attribute called ocean_proximity was added, indicating (very roughly) whether each block group is near the ocean, near the Bay area, inland or on an island. This allows discussing what to do with categorical data. 35 | Note that the block groups are called "districts" in the Jupyter notebooks, simply because in some contexts the name "block group" was confusing." 36 | 37 | ### About the Data (From Luís Torgo's page): 38 | http://www.dcc.fc.up.pt/%7Eltorgo/Regression/cal_housing.html 39 | 40 | This is a dataset obtained from the StatLib repository. Here is the included description: 41 | 42 | "We collected information on the variables using all the block groups in California from the 1990 Census. In this sample a block group on average includes 1425.5 individuals living in a geographically compact area. Naturally, the geographical area included varies inversely with the population density. We computed distances among the centroids of each block group as measured in latitude and longitude. We excluded all the block groups reporting zero entries for the independent and dependent variables. The final data contained 20,640 observations on 9 variables. The dependent variable is ln(median house value)." 43 | 44 | 45 | ### End-to-End ML Project Steps (Chapter 2 of the book) 46 | 47 | 1. Look at the big picture 48 | 2. Get the data 49 | 3. Discover and visualize the data to gain insights 50 | 4. Prepare the data for Machine Learning algorithms 51 | 5. Select a model and train it 52 | 6. Fine-tune your model 53 | 7. Present your solution 54 | 8. Launch, monitor, and maintain your system 55 | 56 | ## The 10-Step Machine Learning Project Workflow (My Version) 57 | 58 | 1. Define the business objective 59 | 2. Make sense of the data from a high level 60 | - data types (number, text, object, etc.) 61 | - continuous/discrete 62 | - basic stats (min, max, std, median, etc.) using boxplot 63 | - frequency via histogram 64 | - scales and distributions of different features 65 | 3. Create the training and test sets using proper sampling methods, e.g., random vs. stratified 66 | 4. Correlation analysis (pair-wise and attribute combinations) 67 | 5. Data cleaning (missing data, outliers, data errors) 68 | 6. Data transformation via pipelines (categorical text to number using one hot encoding, feature scaling via normalization/standardization, feature combinations) 69 | 7. Train and cross-validate different models and select the most promising one (Linear Regression, Decision Tree, and Random Forest were tried in this tutorial) 70 | 8. Fine-tune the model by trying different combinations of hyperparameters 71 | 9. Evaluate the model with the best estimators on the test set 72 | 10. 
Launch, monitor, and refresh the model and system 73 | -------------------------------------------------------------------------------- /ml/house-price-prediction/input/anscombe.csv: -------------------------------------------------------------------------------- 1 | dataset,x,y 2 | I,10.0,8.04 3 | I,8.0,6.95 4 | I,13.0,7.58 5 | I,9.0,8.81 6 | I,11.0,8.33 7 | I,14.0,9.96 8 | I,6.0,7.24 9 | I,4.0,4.26 10 | I,12.0,10.84 11 | I,7.0,4.82 12 | I,5.0,5.68 13 | II,10.0,9.14 14 | II,8.0,8.14 15 | II,13.0,8.74 16 | II,9.0,8.77 17 | II,11.0,9.26 18 | II,14.0,8.1 19 | II,6.0,6.13 20 | II,4.0,3.1 21 | II,12.0,9.13 22 | II,7.0,7.26 23 | II,5.0,4.74 24 | III,10.0,7.46 25 | III,8.0,6.77 26 | III,13.0,12.74 27 | III,9.0,7.11 28 | III,11.0,7.81 29 | III,14.0,8.84 30 | III,6.0,6.08 31 | III,4.0,5.39 32 | III,12.0,8.15 33 | III,7.0,6.42 34 | III,5.0,5.73 35 | IV,8.0,6.58 36 | IV,8.0,5.76 37 | IV,8.0,7.71 38 | IV,8.0,8.84 39 | IV,8.0,8.47 40 | IV,8.0,7.04 41 | IV,8.0,5.25 42 | IV,19.0,12.5 43 | IV,8.0,5.56 44 | IV,8.0,7.91 45 | IV,8.0,6.89 46 | -------------------------------------------------------------------------------- /ml/house-price-prediction/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas 3 | sklearn 4 | matplotlib 5 | seaborn 6 | -------------------------------------------------------------------------------- /ml/imbalanced-multi-classification/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is my revised code of the tutorial at: https://machinelearningmastery.com/multi-class-imbalanced-classification/ 4 | 5 | ## Setup 6 | 7 | 8 | ``` 9 | python3 -m venv venv 10 | source venv/bin/activate 11 | pip install -r requirements.txt 12 | ``` 13 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 14 | 15 | -------------------------------------------------------------------------------- /ml/imbalanced-multi-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | imbalanced-learn 3 | matplotlib 4 | pandas 5 | scikit-learn -------------------------------------------------------------------------------- /ml/openml-csv-arff/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This is the pre-processing script to convert a csv to arff for openml data upload at https://www.openml.org/d/42634 4 | 5 | After a csv is created, you need to use Weka to load the csv, save it as an arff file, and then upload to openml.org: 6 | 7 | (screenshot: saving the csv as an arff file in Weka) 8 | 9 | 10 | NOTE: https://pypi.org/project/csv2arff/ does not work - lots of errors. 11 | 12 | 13 | 14 | ## Setup 15 | 16 | Tested with Python 3.6 via virtual environment: 17 | ```shell 18 | $ python3.6 -m venv venv 19 | $ source venv/bin/activate 20 | $ jupyter notebook 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /ml/process-mining/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A script to convert a txt log into process mining log format. The key idea in this notebook is using a pre-defined time gap to identify user sessions. 4 | 5 | ## Data 6 | 7 | The log file is very big; the following commands can help get a small sample of the file for EDA and other tasks. 
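The time-gap idea from the About section reduces to a few pandas operations once the log is in a dataframe: sort by user and timestamp, compute the gap to each user's previous event, and start a new session whenever the gap exceeds a threshold. A minimal sketch (the file name, the `user`/`timestamp` column names, and the 30-minute threshold are illustrative assumptions, not taken from the notebooks):

```python
import pandas as pd

# hypothetical sample file and column names; adjust to the actual log schema
df = pd.read_csv('search_sample.csv', parse_dates=['timestamp'])
df = df.sort_values(['user', 'timestamp'])

gap = df.groupby('user')['timestamp'].diff()                 # time since the user's previous event
new_session = gap.isna() | (gap > pd.Timedelta(minutes=30))  # first event, or a long pause
df['session_id'] = new_session.cumsum()                      # running count = global session id
```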
8 | 9 | ### Utility Commands 10 | 11 | Run `$ wc -l search.txt` to find out the total number of lines in the file. When the file is too big for efficient pandas analysis, you can split it into smaller files of 10,000 lines each using the GNU coreutils version of `split` (`gsplit` on macOS): 12 | 13 | ``` 14 | $ brew install coreutils 15 | $ gsplit -a 4 -d -l 10000 file.txt search_ 16 | ``` 17 | 18 | ## Setup 19 | 20 | Tested with Python 3.6 via virtual environment: 21 | ```shell 22 | $ python3.6 -m venv venv 23 | $ source venv/bin/activate 24 | $ pip install -r requirements.txt 25 | $ jupyter notebook 26 | ``` 27 | 28 | - log2csv notebook converts the text into a csv. 29 | - log-eda notebook converts the log into a process mining log 30 | -------------------------------------------------------------------------------- /ml/process-mining/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter==1.0.0 2 | pandas==0.25.2 3 | matplotlib==3.1.2 4 | seaborn==0.9.0 5 | -------------------------------------------------------------------------------- /ml/tf-serving/.gitignore: -------------------------------------------------------------------------------- 1 | models/ -------------------------------------------------------------------------------- /ml/tf-serving/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | My revised code based on: 4 | 5 | - https://thelongrun.blog/2020/01/12/rest-api-tensorflow-serving-pt1/ 6 | - https://thelongrun.blog/2020/01/26/rest-api-tensorflow-serving-pt2/ 7 | 8 | 9 | # Setup 10 | 11 | Set up the virtual environment and install packages: 12 | ``` 13 | python3 -m venv venv 14 | source venv/bin/activate 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | Pull the TF Serving Docker image (this assumes you have installed Docker: https://docs.docker.com/get-docker/): 19 | ``` 20 | docker pull tensorflow/serving:latest 21 | ``` 22 | 23 | Create TF Serving servables from tf functions and pre-trained models; two servables will be generated and saved in the `models/` folder: 24 | 25 | ``` 26 | python make_servables.py 27 | ``` 28 | 29 | # Start TF Serving Servers 30 | 31 | Serve the two models on different host ports (8501 and 8502) by running the following commands **in the repo root folder**: 32 | ``` 33 | docker run -t --rm -p 8501:8501 -v "$(pwd)/models/mobilenet_v2_test:/models/mobilenet_v2_test" -e MODEL_NAME=mobilenet_v2_test tensorflow/serving & 34 | 35 | docker run -t --rm -p 8502:8501 -v "$(pwd)/models/add_two:/models/add_two" -e MODEL_NAME=add_two tensorflow/serving & 36 | ``` 37 | 38 | You should see two Docker containers running: 39 | 40 | (screenshot: the two tensorflow/serving containers running in Docker) 41 | 42 | # Use REST APIs for Computing/Inference 43 | 44 | Call the `AddTwo()` function using `curl`, which will add 2 to each number in the tensor: 45 | ``` 46 | curl -H "Content-Type: application/json" -d '{"instances":[1.0, 5.0, 4.0]}' http://localhost:8502/v1/models/add_two:predict 47 | ``` 48 | 49 | Call the MobileNet classifier using `curl`: 50 | ``` 51 | chmod +x client_curl.sh 52 | ./client_curl.sh ./images/animal.jpg 53 | ``` 54 | 55 | Call the MobileNet classifier via Python: 56 | 57 | ``` 58 | python client.py 59 | ``` 60 | 61 | 62 | -------------------------------------------------------------------------------- /ml/tf-serving/client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import base64 4 | 5 | data = {} 6 | 
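# TF Serving's REST API represents binary inputs as base64 strings wrapped in {"b64": ...} objects; build that payload from the raw image bytes below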
with open('images/animal.jpg', mode='rb') as file: 7 | img = file.read() 8 | data = {"inputs":[{"b64":base64.encodebytes(img).decode("utf-8")}]} 9 | 10 | # Making the request 11 | r = requests.post("http://localhost:8501/v1/models/mobilenet_v2_test:predict", data=json.dumps(data)) 12 | print(r.content) 13 | # And returns: 14 | # b'{\n "outputs": [\n "giant panda"\n ]\n}' -------------------------------------------------------------------------------- /ml/tf-serving/client_curl.sh: -------------------------------------------------------------------------------- 1 | # $1 refers to the path where the image file is located 2 | ENCODED_IMG="$(base64 $1)" 3 | (echo '{"inputs": [{"b64": "'; echo "$ENCODED_IMG"; echo '"}]}') | curl -H "Content-Type: application/json" -d @- http://localhost:8501/v1/models/mobilenet_v2_test:predict -------------------------------------------------------------------------------- /ml/tf-serving/images/animal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/animal.jpg -------------------------------------------------------------------------------- /ml/tf-serving/images/clear.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/clear.jpg -------------------------------------------------------------------------------- /ml/tf-serving/images/ponds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/ponds.png -------------------------------------------------------------------------------- /ml/tf-serving/make_servables.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_hub as hub 3 | 4 | 5 | class AddTwo(tf.Module): 6 | @tf.function(input_signature=[tf.TensorSpec(shape=[None, 3], dtype=tf.float32, name='x')]) 7 | def add_two(self, x): 8 | return x + 2 9 | 10 | 11 | class CustomMobileNet(tf.keras.Model): 12 | model_handler = "https://tfhub.dev/google/imagenet/mobilenet_v2_035_224/classification/4" 13 | 14 | def __init__(self): 15 | super(CustomMobileNet, self).__init__() 16 | self.model = hub.load(self.__class__.model_handler) 17 | self.labels = None 18 | 19 | @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.string)]) 20 | def call(self, input_img): 21 | def _preprocess(img_file): 22 | img_bytes = tf.reshape(img_file, []) 23 | img = tf.io.decode_jpeg(img_bytes, channels=3) 24 | img = tf.image.convert_image_dtype(img, tf.float32) 25 | return tf.image.resize(img, (224, 224)) 26 | 27 | labels = tf.io.read_file(self.labels) 28 | labels = tf.strings.split(labels, sep='\n') 29 | img = _preprocess(input_img)[tf.newaxis,:] 30 | logits = self.model(img) 31 | get_class = lambda x: labels[tf.argmax(x)] 32 | class_text = tf.map_fn(get_class, logits, fn_output_signature=tf.string) 33 | return class_text # the predicted class label as text 34 | 35 | # create a servable from a tf function 36 | tf_func_servable = AddTwo() 37 | tf.saved_model.save(tf_func_servable, "models/add_two/1") 38 | 39 | # create a servable from a pre-trained model downloaded from tf hub 40 | tf_model_servable = CustomMobileNet() 41 | tf_model_servable.labels = 
tf.saved_model.Asset("data/ImageNetLabels.txt") # save labels txt as an asset 42 | tf.saved_model.save(tf_model_servable, "models/mobilenet_v2_test/1/") 43 | -------------------------------------------------------------------------------- /ml/tf-serving/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=2.3.1 2 | tensorflow_hub>=0.10.0 -------------------------------------------------------------------------------- /ml/tfidf-bm25/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | tfidf and bm25 examples for document retrieval using the Cranfield dataset 4 | 5 | "The Cranfield collection. This was the pioneering test collection in allowing precise quantitative measures of information retrieval effectiveness, but is nowadays too small for anything but the most elementary pilot experiments. Collected in the United Kingdom starting in the late 1950s, it contains 1398 abstracts of aerodynamics journal articles, a set of 225 queries, and exhaustive relevance judgments of all (query, document) pairs." - 6 | https://nlp.stanford.edu/IR-book/html/htmledition/standard-test-collections-1.html 7 | 8 | ## Setup 9 | 10 | ``` 11 | $ python3 -m venv venv 12 | $ source venv/bin/activate 13 | $ pip install -r requirements.txt 14 | ``` 15 | ## Data 16 | 17 | The data is in the `data` folder in JSON format: 18 | - `cranfield_docs.json`: information about 1400 documents (aerodynamics paper abstracts), each with author, bibliography, body (abstract), and title fields: 19 | ``` 20 | { 21 | 22 | "id" : 1, 23 | "author" : "brenckman,m.", 24 | "bibliography" : "j. ae. scs. 25, 1958, 324.", 25 | "body" : "experimental investigation of the aerodynamics of a wing in a slipstream . an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios . the results were intended in part as an evaluation basis for different theoretical treatments of this problem . the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary-layer-control effect . the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory . an empirical evaluation of the destalling effects was made for the specific configuration of the experiment .", 26 | "title" : "experimental investigation of the aerodynamics of a wing in a slipstream ." 27 | 28 | } 29 | ``` 30 | - `cranfield_queries.json`: 225 queries representing users' information needs. 31 | ``` 32 | { 33 | "query_id": 1, 34 | "query": "what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft ." 35 | } 36 | ``` 37 | 38 | - `cranfield_relevance.json`: the relevance score (1, 2, 3, or 4, with 1 being the highest relevance) of each query and related documents. 
39 | ``` 40 | {"query_id": "1", "r_score": 2, "doc_id": "184"}, 41 | {"query_id": "2", "r_score": 1, "doc_id": "12"}, 42 | ``` 43 | - 1 : the document is the complete answer to the query 44 | - 2 : the document has a high degree of relevance to the query 45 | - 3 : the document is useful to the query as general background information 46 | - 4 : the document is of minimum interest to the query 47 | 48 | 49 | 50 | ## Evaluation Metrics 51 | 52 | Precision and Recall are used in the examples. See https://nlp.stanford.edu/IR-book/html/htmledition/information-retrieval-system-evaluation-1.html for more evaluation metrics. 53 | -------------------------------------------------------------------------------- /ml/tfidf-bm25/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab>=2.2.9 2 | matplotlib>=3.2.1 3 | nltk>=3.5 4 | pandas>=1.0.3 5 | scikit-learn>=0.22.2 6 | rank-bm25 7 | -------------------------------------------------------------------------------- /ml/topic-modeling/.gitignore: -------------------------------------------------------------------------------- 1 | /data 2 | -------------------------------------------------------------------------------- /ml/topic-modeling/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | My revised code for 4 | 5 | https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb 6 | https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24 7 | 8 | and 9 | 10 | LDA from Scratch 11 | My revised tutorial based on https://www.depends-on-the-definition.com/lda-from-scratch/ 12 | 13 | I also found another similar tutorial at https://gist.github.com/umbertogriffo/5041b9e4ec6c3478cef99b8653530032 14 | 15 | ## Setup 16 | 17 | within the tutorial folder: 18 | 19 | ``` 20 | python3 -m venv venv 21 | source venv/bin/activate 22 | pip install -r requirements.txt 23 | ``` 24 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 25 | 26 | -------------------------------------------------------------------------------- /ml/topic-modeling/requirements.txt: -------------------------------------------------------------------------------- 1 | gensim 2 | jupyter 3 | matplotlib 4 | nltk 5 | pandas 6 | scikit-learn -------------------------------------------------------------------------------- /ml/tweet-sentiment-analysis/README.md: -------------------------------------------------------------------------------- 1 | ## Tweet Sentiment Analysis with Python 3 2 | 3 | This is my revision of the tutorial at https://dev.to/rodolfoferro/sentiment-analysis-on-trumpss-tweets-using-python - many thanks to the author. The original repo is at https://github.com/RodolfoFerro/pandas_twitter 4 | 5 | The original author provides a markdown version of his tutorial. I combined all files into one (tutorial.md) and created an English version of the Jupyter notebook (the author only had a Spanish version). 
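The scoring itself is a one-liner with TextBlob; the percentages in the Summary below come from mapping each tweet's polarity to a label. A minimal sketch of that mapping (the helper name and the exact thresholds are illustrative, not copied from the notebook):

```python
from textblob import TextBlob

def classify(tweet_text):
    # TextBlob polarity is in [-1, 1]; zero is treated as neutral
    polarity = TextBlob(tweet_text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

print(classify('What a great rally!'))  # positive
```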
6 | 7 | ## Summary 8 | **Data**: 200 Tweets from Donald Trump: https://twitter.com/realDonaldTrump 9 | 10 | **Goal**: Conduct a sentiment analysis of the tweets, with a sample result: 11 | 12 | - Percentage of positive tweets: 53.5% 13 | - Percentage of neutral tweets: 23.0% 14 | - Percentage of negative tweets: 23.5% 15 | 16 | Python packages used: jupyter, pandas, numpy, tweepy, textblob 17 | 18 | ## API Keys 19 | To get API keys for Twitter: in order to extract tweets for later analysis, we need to access our Twitter account and create an app. The website to do this is https://apps.twitter.com/. (If you don't know how to do this, you can follow this tutorial video https://www.youtube.com/watch?v=BOA7SD_09Qk to create an account and an application.) 20 | 21 | 22 | - Consumer Key (API Key) 23 | - Consumer Secret (API Secret) 24 | - Access Token 25 | - Access Token Secret 26 | 27 | **You should never put your real API keys in the code and push them to GitHub.** We use local environment variables for the API keys: 28 | 29 | ``` 30 | # Get the API key from local environment variable 31 | CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY') 32 | CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET') 33 | ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN') 34 | ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET') 35 | ``` 36 | 37 | You need to add the following lines to the `~/.bash_profile` file: 38 | ``` 39 | export TWITTER_CONSUMER_KEY='yourealkey' 40 | export TWITTER_CONSUMER_SECRET='yourealkey' 41 | export TWITTER_ACCESS_TOKEN='yourealkey' 42 | export TWITTER_ACCESS_SECRET='yourealkey' 43 | ``` 44 | 45 | Then use `vim` to edit the file, `source` to load it, and `env` to double-check: 46 | 47 | ``` 48 | $ vim ~/.bash_profile 49 | $ source ~/.bash_profile 50 | $ env 51 | ``` 52 | **NOTE: You may need to close the Terminal window and restart it for Jupyter Notebook to read the new variables you just added.** 53 | 54 | ## Setup 55 | 56 | Clone the repo, go to the repo folder, set up the virtual environment, and install the required packages: 57 | 58 | ``` 59 | $ cd path_to_this_folder 60 | $ virtualenv -p python3 venv 61 | $ source venv/bin/activate 62 | $ pip3 install -r requirements.txt 63 | ``` 64 | 65 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 66 | -------------------------------------------------------------------------------- /ml/tweet-sentiment-analysis/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas 3 | numpy 4 | tweepy 5 | textblob 6 | -------------------------------------------------------------------------------- /other/chinese-to-pinyin/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | .DS_Store 104 | 105 | # PyCharm project settings 106 | .idea 107 | -------------------------------------------------------------------------------- /other/chinese-to-pinyin/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A script to convert Chinese folder names to pinyin, revised from: http://sunzhen.blogspot.com/2016/05/rename-chinese-filenames-to-pinyin.html 4 | 5 | 6 | ## Setup 7 | 8 | This script converts all file and folder names in the sub folder "data" into Pinyin. 9 | 10 | If different characters map to the same pinyin, e.g., both "利" and "立" map to "li", a "1" is appended to the filename. 11 | 12 | Tested with Python 3.6 via virtual environment: 13 | ```shell 14 | $ python3.6 -m venv venv 15 | $ source venv/bin/activate 16 | $ python ch-to-pinyin.py 17 | ``` 18 | 19 | ## An Example 20 | 21 | Before: 22 | 23 | (screenshot: folder names in Chinese before conversion) 24 | 25 | After: 26 | 27 | (screenshot: folder names in Pinyin after conversion) 28 | -------------------------------------------------------------------------------- /other/chinese-to-pinyin/ch-to-pinyin.py: -------------------------------------------------------------------------------- 1 | # renameCH2Pinyin.py 2 | # Rename filename from Chinese characters to capitalized pinyin using the 3 | # mapping file and taking out the tone numbers 4 | 5 | import os 6 | import re 7 | 8 | # File uni2pinyin is a mapping from hex to Pinyin with a tone number 9 | f = open('uni2pinyin') 10 | wf = f.read() # read the whole mapping file 11 | 12 | os.chdir('data') # to rename all files in sub folder 'data' 13 | filename_list = os.listdir(u'.') # read all file names in unicode mode 14 | print(filename_list) 15 | for filename_unicode in filename_list: # each file name 16 | filename_pinyin = '' 17 | for c in filename_unicode: # each character 18 | if 0x4e00 <= ord(c) <= 0x9fff: # Chinese Character Unicode range 19 | hexCH = (hex(ord(c))[2:]).upper() # strip leading '0x' and change 20 | # to uppercase 21 | p = re.compile(hexCH+'\t([a-z]+)[\d]*') # define the match pattern 22 | mp = p.search(wf) 23 | filename_pinyin+=mp.group(1).title() # get the pinyin without the tone 24 | # number and capitalize it 25 | else: 26 | filename_pinyin+=c 27 | print(filename_unicode, filename_pinyin) 28 | 29 | latest_filename_list = os.listdir(u'.') 30 | while filename_pinyin in latest_filename_list: 31 | filename_pinyin = filename_pinyin + '1' 32 | 
print(filename_pinyin) 33 | os.rename(filename_unicode, filename_pinyin) 34 | os.chdir('..') # go back to the parent folder 35 | -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/.DS_Store11: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store11 -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/.DS_Store111: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store111 -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/.DS_Store111111: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store111111 -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/.DS_Store1111111: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store1111111 -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/白/0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/白/0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg -------------------------------------------------------------------------------- /other/color-palette/test-palette.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/color-palette/test-palette.png -------------------------------------------------------------------------------- /other/color-palette/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/color-palette/test.png -------------------------------------------------------------------------------- /other/csv-to-bert-text/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A script to convert csv to multiple txt based on a label column 4 | 5 | ## Data 6 | 7 | ``` 8 | 1 The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor. 5 POS 9 | 2 The food was definitely more "upscale" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas. 4 NEG 10 | 3 Another small bonus was the hard-to-find Sol beer , which was great with a lime. 4 POS 11 | 4 Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!. 
5 NEU 12 | ``` 13 | ## Setup 14 | 15 | Tested with Python 3.6 via virtual environment: 16 | ```shell 17 | $ python3.6 -m venv venv 18 | $ source venv/bin/activate 19 | $ pip install -r requirements.txt 20 | $ jupyter notebook 21 | ``` 22 | -------------------------------------------------------------------------------- /other/csv-to-bert-text/csv-to-txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[27]: 5 | 6 | 7 | import pandas as pd 8 | import os 9 | import shutil 10 | 11 | 12 | # In[20]: 13 | 14 | 15 | sent = pd.read_csv("real.csv", encoding='utf8') 16 | sent 17 | 18 | 19 | # In[21]: 20 | 21 | 22 | path = os.getcwd() 23 | print ("The current working directory is %s" % path) 24 | 25 | 26 | # In[32]: 27 | 28 | 29 | # clean data folder and create new folders 30 | shutil.rmtree('./data') 31 | os.makedirs("./data/pos") 32 | os.makedirs("./data/neg") 33 | os.makedirs("./data/neu") 34 | 35 | 36 | # In[16]: 37 | 38 | 39 | def write_sent(label, id, sent): 40 | filename = "./data/"+ label +"/" + id +".txt" 41 | file = open(filename,"w") 42 | file.writelines(sent) 43 | file.close() 44 | 45 | 46 | # In[17]: 47 | 48 | 49 | for index, row in sent.iterrows(): 50 | write_sent(row['SentiLabel_food'].lower(), str(row['SentenceID']), row['Sentences']) 51 | print("writing sentence") 52 | -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/neg/43.txt: -------------------------------------------------------------------------------- 1 | I was slightly saddened that either they do not offer chalula here or that they ran out. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/neg/44.txt: -------------------------------------------------------------------------------- 1 | Sometimes I feel that breakfast is just not complete without it and hope that they can stock up because that louisiana hot sauce they carried did not cut it. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/neu/29.txt: -------------------------------------------------------------------------------- 1 | Food was ok , -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/1.txt: -------------------------------------------------------------------------------- 1 | The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/2.txt: -------------------------------------------------------------------------------- 1 | The food was definitely more "upscale" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/20.txt: -------------------------------------------------------------------------------- 1 | We ordered 2 appetizers , 2 entrees , 2 bottles of wine... food was very good not great. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/3.txt: -------------------------------------------------------------------------------- 1 | Another small bonus was the hard-to-find Sol beer , which was great with a lime. 
-------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/4.txt: -------------------------------------------------------------------------------- 1 | Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/40.txt: -------------------------------------------------------------------------------- 1 | The food came out in about 10 minutes and my skillet was cooked very well. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/41.txt: -------------------------------------------------------------------------------- 1 | The sunny side up eggs here are some of the best I have ever had- they were slightly chewy and full of flavor. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/42.txt: -------------------------------------------------------------------------------- 1 | My skillet came with cheddar cheese mushroom , broccoli , and tomtaoes and it was a hearty meal that , when combined with ketchup and salt , delivered very satisfying feelings to my tastebuds and stomach. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/46.txt: -------------------------------------------------------------------------------- 1 | They were hot and had a soft , spongy texture. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/47.txt: -------------------------------------------------------------------------------- 1 | Pretty delicious and satisfied my sweet tooth. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/5.txt: -------------------------------------------------------------------------------- 1 | After shopping around , my husband and I both think they have the best pizza around Highland Park. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/50.txt: -------------------------------------------------------------------------------- 1 | Their sandwich specials looked great. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/8.txt: -------------------------------------------------------------------------------- 1 | It's a homey place with good food. 
-------------------------------------------------------------------------------- /other/csv-to-bert-text/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter==1.0.0 2 | pandas==0.25.2 3 | -------------------------------------------------------------------------------- /other/csv-to-bert-text/sample.csv: -------------------------------------------------------------------------------- 1 | SentenceID,Sentences,SentiScore_food,SentiLabel_food 2 | 1,"The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor.",5,POS 3 | 2,"The food was definitely more ""upscale"" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas.",4,POS 4 | 3,"Another small bonus was the hard-to-find Sol beer , which was great with a lime.",4,POS 5 | 4,"Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!.",5,POS 6 | 5,"After shopping around , my husband and I both think they have the best pizza around Highland Park.",5,POS 7 | 8,It's a homey place with good food.,4,POS 8 | 20,"We ordered 2 appetizers , 2 entrees , 2 bottles of wine... food was very good not great.",4,POS 9 | 29,"Food was ok , ",3,NEU 10 | 40,The food came out in about 10 minutes and my skillet was cooked very well.,5,POS 11 | 41,The sunny side up eggs here are some of the best I have ever had- they were slightly chewy and full of flavor.,5,POS 12 | 42,"My skillet came with cheddar cheese mushroom , broccoli , and tomtaoes and it was a hearty meal that , when combined with ketchup and salt , delivered very satisfying feelings to my tastebuds and stomach.",5,POS 13 | 43,I was slightly saddened that either they do not offer chalula here or that they ran out.,2,NEG 14 | 44,Sometimes I feel that breakfast is just not complete without it and hope that they can stock up because that louisiana hot sauce they carried did not cut it.,2,NEG 15 | 46,"They were hot and had a soft , spongy texture.",4,POS 16 | 47,Pretty delicious and satisfied my sweet tooth.,4,POS 17 | 50,Their sandwich specials looked great.,5,POS -------------------------------------------------------------------------------- /other/list-like-to-list/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /other/list-like-to-list/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /other/list-like-to-list/.idea/movie-genre.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /other/list-like-to-list/README.md: -------------------------------------------------------------------------------- 1 | # Intro 2 | This is a program to do some data transformation. 
3 | 4 | The key challenge is converting the following "list-like" string into a real list: 5 | 6 | ``` 7 | "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]" 8 | ``` 9 | 10 | `ast.literal_eval` does the trick: https://docs.python.org/2/library/ast.html 11 | 12 | Another useful trick is converting a list of list: 13 | 14 | ``` 15 | [[862, 16], [862, 35], [862, 10751], [8844, 12]] 16 | ``` 17 | into a csv file: 18 | 19 | ``` 20 | 862,16 21 | 862,35 22 | 862,10751 23 | 8844,12 24 | ``` 25 | pandas makes it easy: 26 | ``` 27 | my_df = pd.DataFrame(my_list) 28 | my_df.to_csv('my_csv.csv', index=False, header=False) 29 | ``` 30 | 31 | # Run 32 | 33 | Python 2.x 34 | 35 | - create virtual environment: `$virtualenv venv` 36 | - activate virtual env: `$source venv/bin/activate` 37 | - install required packages: `pip install -r requirements.txt` 38 | -------------------------------------------------------------------------------- /other/list-like-to-list/movie.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import ast 3 | import pandas as pd 4 | 5 | movies = [] 6 | # how to read a csv into a list 7 | with open('input.csv', 'rb') as f: 8 | reader = csv.reader(f) 9 | movies = list(reader) 10 | 11 | my_list = [] 12 | 13 | for movie in movies: 14 | # print '****** one movie ******' 15 | for genre in ast.literal_eval(movie[1]): 16 | line = [] 17 | # print '****** one genre ******' 18 | # print movie[0] 19 | # print genre['id'] 20 | # print genre['name'] 21 | line.append(int(movie[0])) 22 | line.append(genre['id']) 23 | # print line 24 | my_list.append(line) 25 | 26 | print(my_list) 27 | my_df = pd.DataFrame(my_list) 28 | my_df.to_csv('movie_genre.csv', index=False, header=False) 29 | print 'See result in movie_genre.csv file' 30 | -------------------------------------------------------------------------------- /other/list-like-to-list/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==0.21.0 2 | -------------------------------------------------------------------------------- /other/list-of-dicts-to-columns/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/list-of-dicts-to-columns/README.md -------------------------------------------------------------------------------- /other/list-of-dicts-to-columns/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas -------------------------------------------------------------------------------- /other/screenshot-gif-generation/.gitignore: -------------------------------------------------------------------------------- 1 | /screenshots/*.jpg 2 | /screenshots/*.gif 3 | -------------------------------------------------------------------------------- /other/screenshot-gif-generation/README.md: -------------------------------------------------------------------------------- 1 | # Generate Screenshots and Gifs via Python 2 | 3 | Code is revised based on: 4 | - https://blog.csdn.net/qq_38161040/article/details/91040640 5 | - https://medium.com/swlh/python-animated-images-6a85b9b68f86 -------------------------------------------------------------------------------- /other/screenshot-gif-generation/gif-generation.ipynb: -------------------------------------------------------------------------------- 1 
| { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3.7.7-final" 14 | }, 15 | "orig_nbformat": 2, 16 | "kernelspec": { 17 | "name": "python_defaultSpec_1600265837621", 18 | "display_name": "Python 3.7.7 64-bit ('venv': venv)" 19 | } 20 | }, 21 | "nbformat": 4, 22 | "nbformat_minor": 2, 23 | "cells": [ 24 | { 25 | "source": [ 26 | "# Generate Screenshots and Gifs via Python\n", 27 | "\n", 28 | "Code is revised based on:\n", 29 | "- https://blog.csdn.net/qq_38161040/article/details/91040640\n", 30 | "- https://medium.com/swlh/python-animated-images-6a85b9b68f86" 31 | ], 32 | "cell_type": "markdown", 33 | "metadata": {} 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 27, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from PIL import ImageGrab\n", 42 | "from PIL import Image\n", 43 | "import time" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 28, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# take a screenshot every 0.1 second, 10 jpg saved\n", 53 | "total_images = 10 # total screenshots\n", 54 | "interval = 0.1 # the interval to take a screenshot\n", 55 | "resize_ratio = 0.3 # the resize ratio to keep the screenshot smaller\n", 56 | "\n", 57 | "for i in range(total_images):\n", 58 | " time.sleep(interval)\n", 59 | " img = ImageGrab.grab()\n", 60 | " width = img.size[0]\n", 61 | " height = img.size[1]\n", 62 | "\n", 63 | " img = img.resize(\n", 64 | " (int(width*resize_ratio), int(height*resize_ratio)), \n", 65 | " Image.ANTIALIAS)\n", 66 | " \n", 67 | " img = img.convert('RGB') # if save to jpg\n", 68 | " img.save(f'./screenshots/screenshot{str(i+1)}.jpg')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 29, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# generate the gif\n", 78 | "import imageio\n", 79 | "\n", 80 | "gif_images = []\n", 81 | "for i in range(total_images):\n", 82 | " gif_images.append(imageio.imread(f'./screenshots/screenshot{str(i+1)}.jpg'))\n", 83 | "\n", 84 | "imageio.mimsave(\"./screenshots/screenshot.gif\", gif_images, fps=5)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 30, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# reduce the gif file size\n", 94 | "from pygifsicle import optimize\n", 95 | "\n", 96 | "gif_orginal = './screenshots/screenshot.gif'\n", 97 | "\n", 98 | "# create a new onegit \n", 99 | "optimize(gif_orginal, './screenshots/screenshot_optimized.gif')\n", 100 | "\n", 101 | "# overwrite the original one if needed\n", 102 | "# optimize(gif_orginal)" 103 | ] 104 | } 105 | ] 106 | } -------------------------------------------------------------------------------- /other/screenshot-gif-generation/git-gen.py: -------------------------------------------------------------------------------- 1 | # # Generate Screenshots and Gifs via Python 2 | # 3 | # Code is revised based on: 4 | # - https://blog.csdn.net/qq_38161040/article/details/91040640 5 | # - https://medium.com/swlh/python-animated-images-6a85b9b68f86 6 | # pip install the following three packages: imageio, pillow, pygifsicle 7 | 8 | # %% 9 | from PIL import ImageGrab 10 | from PIL import Image 11 | import time 12 | 13 | 14 | # %% 15 | # take a screenshot every 0.1 second, 10 jpg saved 16 | total_images = 10 # 
total screenshots 17 | interval = 0.1 # the interval to take a screenshot 18 | resize_ratio = 0.3 # the resize ratio to keep the screenshot smaller 19 | 20 | for i in range(total_images): 21 | time.sleep(interval) 22 | img = ImageGrab.grab() 23 | width = img.size[0] 24 | height = img.size[1] 25 | 26 | img = img.resize( 27 | (int(width*resize_ratio), int(height*resize_ratio)), 28 | Image.LANCZOS) # same filter as the older Image.ANTIALIAS alias 29 | 30 | img = img.convert('RGB') # if save to jpg 31 | img.save(f'./screenshots/screenshot{str(i+1)}.jpg') 32 | 33 | 34 | # %% 35 | import imageio 36 | 37 | gif_images = [] 38 | for i in range(total_images): 39 | gif_images.append(imageio.imread(f'./screenshots/screenshot{str(i+1)}.jpg')) 40 | 41 | imageio.mimsave("./screenshots/screenshot.gif", gif_images, fps=5) 42 | 43 | 44 | # %% 45 | from pygifsicle import optimize 46 | 47 | gif_original = './screenshots/screenshot.gif' 48 | 49 | # create a new optimized gif 50 | optimize(gif_original, './screenshots/screenshot_optimized.gif') 51 | 52 | # overwrite the original one if needed 53 | # optimize(gif_original) 54 | 55 | 56 | -------------------------------------------------------------------------------- /other/screenshot-gif-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | imageio 3 | pillow 4 | pygifsicle -------------------------------------------------------------------------------- /other/screenshot-gif-generation/screenshots/screenshot-folder.md: -------------------------------------------------------------------------------- 1 | Temp files are saved in this folder --------------------------------------------------------------------------------