├── .gitignore
├── LICENSE
├── README.md
├── ai
├── autogen
│ ├── README.md
│ ├── autogen-code-execution.ipynb
│ ├── autogen-tools.ipynb
│ ├── autogent-tutorial.ipynb
│ ├── docker-example.ipynb
│ └── requirements.txt
├── langchain
│ └── langchain-rag-basics
│ │ ├── basics.ipynb
│ │ ├── data
│ │ │ ├── chroma
│ │ │ │ ├── 39b238f5-b82f-42ff-a683-cc8d5aea4747
│ │ │ │ │ ├── data_level0.bin
│ │ │ │ │ ├── header.bin
│ │ │ │ │ ├── length.bin
│ │ │ │ │ └── link_lists.bin
│ │ │ │ └── chroma.sqlite3
│ │ │ ├── getting-real
│ │ │ │ ├── getting-real-01-introduction.pdf
│ │ │ │ ├── getting-real-02-starting-line.pdf
│ │ │ │ ├── getting-real-03-stay-lean.pdf
│ │ │ │ ├── getting-real-04-priorities.pdf
│ │ │ │ ├── getting-real-05-feature-selection.pdf
│ │ │ │ ├── getting-real-06-process.pdf
│ │ │ │ ├── getting-real-07-organization.pdf
│ │ │ │ ├── getting-real-08-staffing.pdf
│ │ │ │ ├── getting-real-09-interface-design.pdf
│ │ │ │ ├── getting-real-10-code.pdf
│ │ │ │ ├── getting-real-11-words.pdf
│ │ │ │ ├── getting-real-12-pricing-signup.pdf
│ │ │ │ ├── getting-real-13-promotion.pdf
│ │ │ │ ├── getting-real-14-support.pdf
│ │ │ │ ├── getting-real-15-post-launch.pdf
│ │ │ │ └── getting-real-full.pdf
│ │ │ └── nba-rules-2023.pdf
│ │ ├── qa.ipynb
│ │ ├── rag-retrieval.ipynb
│ │ └── requirements.txt
└── litellm
│ ├── README.md
│ ├── deepseek_example.py
│ └── requirements.txt
├── ds
├── ab-testing
│ ├── README.md
│ ├── Walkthrough.ipynb
│ ├── ab-testing-math.ipynb
│ ├── requirements.txt
│ └── utils
│ │ ├── data.py
│ │ ├── plot.py
│ │ └── stats.py
├── airflow
│ ├── README.md
│ ├── requirements.txt
│ └── simple_bash_dag.py
├── aws-pyspark
│ ├── README.md
│ └── emr_bootstrap.sh
├── cohort-analysis
│ ├── README.md
│ ├── cohort-analysis.ipynb
│ └── requirements.txt
├── dask
│ ├── .gitignore
│ ├── README.md
│ ├── dask-array.ipynb
│ ├── dask-big-dataset.ipynb
│ ├── dask-intro.ipynb
│ ├── dask-taxi.ipynb
│ ├── dask-worker-space
│ │ ├── global.lock
│ │ ├── purge.lock
│ │ ├── worker-3y9yh5wc.dirlock
│ │ ├── worker-5u5lbrxx.dirlock
│ │ ├── worker-82zb8rgu.dirlock
│ │ ├── worker-9wl7s6m3.dirlock
│ │ ├── worker-_n7kuuyd.dirlock
│ │ ├── worker-bbjm31ih.dirlock
│ │ ├── worker-fwxxmool.dirlock
│ │ ├── worker-l28a891y.dirlock
│ │ ├── worker-l8y7v2oj.dirlock
│ │ ├── worker-lckuq0ub.dirlock
│ │ ├── worker-ofkwc26n.dirlock
│ │ └── worker-wuu54xyo.dirlock
│ ├── mydask.png
│ └── requirements.txt
├── data-driven-growth
│ ├── .gitignore
│ ├── README.md
│ ├── know-your-metrics.ipynb
│ ├── requirements.txt
│ └── utils
│ │ ├── data.py
│ │ ├── plot.py
│ │ └── stats.py
├── diff-in-diff
│ ├── Panel101.dta
│ ├── README.md
│ ├── did-min-wage.ipynb
│ ├── did-panel101.ipynb
│ ├── mini-wage.csv
│ ├── mini-wage.dat
│ ├── panel101.csv
│ └── requirements.txt
├── dvc
│ ├── .gitignore
│ └── README.md
├── hypo-testing
│ ├── README.md
│ ├── blood-pressure.csv
│ ├── chi-test.csv
│ ├── crop-yield.csv
│ ├── hypo-testing.ipynb
│ ├── plant-growth.csv
│ └── requirements.txt
├── inside-airbnb
│ ├── .idea
│ │ ├── inside-airbnb.iml
│ │ ├── misc.xml
│ │ ├── modules.xml
│ │ └── workspace.xml
│ ├── README.md
│ ├── add-columns.py
│ ├── data
│ │ ├── nyc-listings.csv
│ │ └── nyc-listings_new.csv
│ ├── get-one-photo.py
│ ├── get-photos.py
│ └── requirements.txt
├── matplotlib
│ ├── README.md
│ ├── grouped-bar-plot-with-precentage-change-matplotlib.ipynb
│ └── requirements.txt
├── multi-armed-bandit
│ ├── README.md
│ ├── mab.ipynb
│ └── requirements.txt
├── pymongo
│ ├── README.md
│ ├── pymongo.ipynb
│ └── requirements.txt
├── seaborn
│ ├── README.md
│ ├── pokemon.csv
│ ├── requirements.txt
│ └── seaborn_basics.ipynb
├── spark-basics
│ ├── datacamp-notes.md
│ └── datacamp-spark.ipynb
├── statsmodels-tutorial
│ ├── README.md
│ ├── lr-python.ipynb
│ ├── requirements.txt
│ ├── statsmodels.ipynb
│ └── statsmodels_getstarted.ipynb
├── streamlit
│ ├── README.md
│ ├── airbnb.py
│ ├── listings.csv
│ └── requirements.txt
├── superset
│ └── README.md
├── time-series-additive-model
│ ├── README.md
│ ├── additive_models.ipynb
│ ├── data
│ │ ├── Workbook1.xlsx
│ │ ├── gm_sales.csv
│ │ ├── gm_sales.xlsx
│ │ ├── recessions.csv
│ │ ├── recessions.xlsx
│ │ └── tesla_search_terms.csv
│ └── requirements.txt
└── time-series-basics
│ ├── README.md
│ ├── data
│ │ └── opsd_germany_daily.csv
│ ├── requirements.txt
│ └── time_series_basics.ipynb
├── ml
├── attention
│ ├── README.md
│ ├── attention_explained.ipynb
│ └── requirements.txt
├── autogluon
│ ├── README.md
│ ├── agModels-predictClass
│ │ ├── learner.pkl
│ │ ├── models
│ │ │ ├── CatBoost
│ │ │ │ └── model.pkl
│ │ │ ├── ExtraTreesEntr
│ │ │ │ └── model.pkl
│ │ │ ├── ExtraTreesGini
│ │ │ │ └── model.pkl
│ │ │ ├── KNeighborsDist
│ │ │ │ └── model.pkl
│ │ │ ├── KNeighborsUnif
│ │ │ │ └── model.pkl
│ │ │ ├── LightGBM
│ │ │ │ └── model.pkl
│ │ │ ├── LightGBMLarge
│ │ │ │ └── model.pkl
│ │ │ ├── LightGBMXT
│ │ │ │ └── model.pkl
│ │ │ ├── NeuralNetFastAI
│ │ │ │ ├── model-internals.pkl
│ │ │ │ └── model.pkl
│ │ │ ├── RandomForestEntr
│ │ │ │ └── model.pkl
│ │ │ ├── RandomForestGini
│ │ │ │ └── model.pkl
│ │ │ ├── WeightedEnsemble_L2
│ │ │ │ ├── model.pkl
│ │ │ │ └── utils
│ │ │ │ │ ├── model_template.pkl
│ │ │ │ │ └── oof.pkl
│ │ │ ├── XGBoost
│ │ │ │ └── model.pkl
│ │ │ └── trainer.pkl
│ │ ├── predictor.pkl
│ │ └── utils
│ │ │ └── data
│ │ │ │ ├── X.pkl
│ │ │ │ ├── X_val.pkl
│ │ │ │ ├── y.pkl
│ │ │ │ └── y_val.pkl
│ ├── autogluon.ipynb
│ ├── housing-prediction.ipynb
│ ├── input
│ │ ├── anscombe.csv
│ │ └── housing.csv
│ └── requirements.txt
├── clearml-server
│ └── README.md
├── clearml
│ ├── README.md
│ ├── matplotlib
│ │ ├── Allegro_Trains_matplotlib_example.ipynb
│ │ ├── matplotlib_example.py
│ │ ├── mlp_grouped_errorbar.py
│ │ └── requirements.txt
│ ├── pytorch
│ │ ├── manual_model_upload.py
│ │ ├── notebooks
│ │ │ ├── audio
│ │ │ │ ├── README.md
│ │ │ │ ├── audio_classifier_UrbanSound8K.ipynb
│ │ │ │ └── audio_preprocessing_example.ipynb
│ │ │ ├── image
│ │ │ │ ├── hyperparameter_search.ipynb
│ │ │ │ └── image_classification_CIFAR10.ipynb
│ │ │ ├── table
│ │ │ │ ├── download_and_preprocessing.ipynb
│ │ │ │ ├── download_and_split.ipynb
│ │ │ │ ├── pick_best_model.ipynb
│ │ │ │ ├── preprocessing_and_encoding.ipynb
│ │ │ │ ├── tabular_ml_pipeline.ipynb
│ │ │ │ └── train_tabular_predictor.ipynb
│ │ │ └── text
│ │ │ │ └── text_classification_AG_NEWS.ipynb
│ │ ├── pytorch_distributed_example.py
│ │ ├── pytorch_matplotlib.py
│ │ ├── pytorch_mnist.py
│ │ ├── pytorch_tensorboard.py
│ │ ├── pytorch_tensorboardx.py
│ │ ├── requirements.txt
│ │ └── tensorboard_toy_pytorch.py
│ ├── requirements.txt
│ ├── scikit-learn
│ │ ├── model-harry.pkl
│ │ ├── model.pkl
│ │ ├── requirements.txt
│ │ ├── sklearn_joblib_example.py
│ │ └── sklearn_matplotlib_example.py
│ ├── tensorflow
│ │ ├── legacy
│ │ │ ├── requirements.txt
│ │ │ ├── tensorboard_pr_curve.py
│ │ │ ├── tensorboard_toy.py
│ │ │ ├── tensorflow_eager.py
│ │ │ └── tensorflow_mnist_with_summaries.py
│ │ ├── manual_model_upload.py
│ │ ├── requirements.txt
│ │ ├── tensorboard_pr_curve.py
│ │ ├── tensorboard_toy.py
│ │ └── tensorflow_mnist.py
│ ├── wandb
│ │ ├── latest-run
│ │ ├── pytorch_mnist_clearml.py
│ │ ├── pytorch_mnist_wandb.py
│ │ └── requirements.txt
│ └── xgboost
│ │ ├── requirements.txt
│ │ └── xgboost_sample.py
├── clip-image-classification
│ └── clip-img-cls.ipynb
├── document-clustering
│ ├── README.md
│ ├── data
│ │ ├── genres_list.txt
│ │ ├── synopses_list_imdb.txt
│ │ ├── synopses_list_wiki.txt
│ │ └── title_list.txt
│ ├── doc_clustering.ipynb
│ └── requirements.txt
├── feature-importance
│ ├── README.md
│ ├── breast-cancer.csv
│ ├── feature-importance.ipynb
│ ├── feature-selection.ipynb
│ └── requirements.txt
├── few-shot-learning
│ ├── .gitignore
│ ├── README.md
│ ├── datasets
│ │ ├── mini_imagenet
│ │ │ └── dataloader_mini_imagenet.py
│ │ └── omniglot
│ │ │ └── dataloader_omniglot.py
│ ├── loader_omniglot.py
│ ├── mini_imagenet
│ │ ├── mini_proto_model.py
│ │ ├── mini_proto_test.py
│ │ ├── mini_proto_train.py
│ │ └── mini_protoloader.py
│ ├── model_omniglot.py
│ ├── notebooks
│ │ └── dataloader_notebook
│ │ │ ├── Omniglot.ipynb
│ │ │ ├── dataloader.ipynb
│ │ │ ├── images_background_small2.zip
│ │ │ └── loss_test.ipynb
│ ├── requirements.txt
│ ├── test_omniglot.py
│ ├── train_omniglot.py
│ └── util
│ │ ├── loss.py
│ │ └── tensor_op.py
├── fine-tune-pegasus
│ ├── README.md
│ ├── pegasus_finetuning_xsum.ipynb
│ └── requirements.txt
├── graph
│ ├── .gitignore
│ ├── README.md
│ ├── data
│ │ ├── fb-pages-food.edges
│ │ ├── fb-pages-food.nodes
│ │ └── shakespeare.txt
│ ├── deepwalk.ipynb
│ ├── fb-page-link-prediction.ipynb
│ ├── metadata.tsv
│ ├── requirements.txt
│ ├── vectors.tsv
│ └── word2vec.ipynb
├── greedy-layer-wise-pretraning
│ ├── README.md
│ ├── layer-wise-pretrain.ipynb
│ └── requirements.txt
├── house-price-prediction
│ ├── README.md
│ ├── house_price_prediction.ipynb
│ ├── input
│ │ ├── anscombe.csv
│ │ └── housing.csv
│ └── requirements.txt
├── imbalanced-multi-classification
│ ├── README.md
│ ├── glass.csv
│ ├── imbalanced-classification.ipynb
│ └── requirements.txt
├── openml-csv-arff
│ ├── README.md
│ └── news-aggregator.ipynb
├── process-mining
│ ├── README.md
│ ├── log-eda.ipynb
│ ├── log2csv.ipynb
│ ├── pm-log.csv
│ ├── requirements.txt
│ ├── sample.csv
│ └── sample.txt
├── tf-serving
│ ├── .gitignore
│ ├── README.md
│ ├── client.py
│ ├── client_curl.sh
│ ├── data
│ │ └── ImageNetLabels.txt
│ ├── images
│ │ ├── animal.jpg
│ │ ├── clear.jpg
│ │ └── ponds.png
│ ├── make_servables.py
│ └── requirements.txt
├── tfidf-bm25
│ ├── README.md
│ ├── data
│ │ ├── cranfield_docs.json
│ │ ├── cranfield_queries.json
│ │ └── cranfield_relevance.json
│ ├── requirements.txt
│ └── tfidf-bm25.ipynb
├── topic-modeling
│ ├── .gitignore
│ ├── LDA_news_headlines.ipynb
│ ├── README.md
│ ├── abcnews-small.csv
│ ├── lda_from_scratch.ipynb
│ └── requirements.txt
└── tweet-sentiment-analysis
│ ├── README.md
│ ├── requirements.txt
│ ├── trump-tweets.csv
│ ├── tutorial.md
│ └── tweet_sentiment_analysis.ipynb
└── other
├── chinese-to-pinyin
├── .gitignore
├── README.md
├── ch-to-pinyin.py
├── data
│ ├── .DS_Store11
│ ├── .DS_Store111
│ ├── .DS_Store111111
│ ├── .DS_Store1111111
│ └── 白
│ │ └── 0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg
└── uni2pinyin
├── color-palette
├── color-palette.ipynb
├── test-palette.png
└── test.png
├── csv-to-bert-text
├── README.md
├── csv-to-txt.ipynb
├── csv-to-txt.py
├── data
│ ├── neg
│ │ ├── 43.txt
│ │ └── 44.txt
│ ├── neu
│ │ └── 29.txt
│ └── pos
│ │ ├── 1.txt
│ │ ├── 2.txt
│ │ ├── 20.txt
│ │ ├── 3.txt
│ │ ├── 4.txt
│ │ ├── 40.txt
│ │ ├── 41.txt
│ │ ├── 42.txt
│ │ ├── 46.txt
│ │ ├── 47.txt
│ │ ├── 5.txt
│ │ ├── 50.txt
│ │ └── 8.txt
├── requirements.txt
└── sample.csv
├── list-like-to-list
├── .idea
│ ├── misc.xml
│ ├── modules.xml
│ ├── movie-genre.iml
│ └── workspace.xml
├── README.md
├── input.csv
├── movie.py
├── movie_genre.csv
└── requirements.txt
├── list-of-dicts-to-columns
├── README.md
├── example.csv
├── list-to-columns.ipynb
└── requirements.txt
└── screenshot-gif-generation
├── .gitignore
├── README.md
├── gif-generation.ipynb
├── git-gen.py
├── requirements.txt
└── screenshots
└── screenshot-folder.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | .DS_Store
104 |
105 | *.npy
106 | *.pkl
107 |
108 | # vscode
109 |
110 | .vscode
111 |
112 | tmp/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Harry Wang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This repo contains a set of AI and Data Science tutorials in Python curated and revised by me. I modified most of the tutorials to add more instructions and make sure they work well in configured virtual environments. Many thanks to the tutorial authors and other contributors. See the README in each tutorial folder for details.
2 |
3 | I organize the tutorials into four folders:
4 |
5 | - `ai` for AI tutorials
6 | - `ds` for Data Science tutorials
7 | - `ml` for machine learning/deep learning tutorials
8 | - `other` for code on things like data processing, one-off tricks, etc.
9 |
10 | ## Setup
11 |
12 | Each tutorial may have different version requirements for certain packages. So, each tutorial will use a separate virtual environment.
13 |
14 | For some tutorials, you may need to set API keys. You need to add a `.env` file and include the API keys as follows (see my blog post on [Manage Environment Variables in Python Projects](https://harrywang.me/env)):
15 |
16 | ```
17 | OPENAI_API_KEY=sk-proj-xxxx
18 | LANGCHAIN_API_KEY=ls__69650xxxx
19 | REPLICATE_API_TOKEN=r8_W0V3rJxxx
20 | ```
21 |
22 | To run each tutorial, you need to do the following at the root of this project - I use the `document-clustering` tutorial (under the `ml` folder) as an example:
23 |
24 | ```
25 | cd ml/document-clustering
26 | python3 -m venv venv
27 | source venv/bin/activate
28 | pip install -r requirements.txt
29 | ```
30 |
31 | Then, you can use VSCode `code .` to open the notebooks.
32 |
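33 | The notebooks and scripts read these keys from the `.env` file with `python-dotenv`. A minimal sketch of that pattern (the key name is just an example):
34 |
35 | ```python
36 | import os
37 | from dotenv import load_dotenv
38 |
39 | load_dotenv()  # read variables from the .env file into the environment
40 | api_key = os.environ.get("OPENAI_API_KEY")
41 | ```
42 |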
--------------------------------------------------------------------------------
/ai/autogen/README.md:
--------------------------------------------------------------------------------
1 | Start autogenstudio
2 |
3 | ```
4 | autogenstudio ui
5 | ```
--------------------------------------------------------------------------------
/ai/autogen/docker-example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 10,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "\u001b[31m\n",
13 | ">>>>>>>> EXECUTING CODE BLOCK (inferred language is shell)...\u001b[0m\n",
14 | "exitcode: 0 (execution succeeded)\n",
15 | "Code output: \n",
16 | "\u001b[31m\n",
17 | ">>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...\u001b[0m\n",
18 | "exitcode: 0 (execution succeeded)\n",
19 | "Code output: Scatter plot saved to line.png\n",
20 | "\n"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "import os\n",
26 | "from dotenv import load_dotenv\n",
27 | "from autogen import ConversableAgent\n",
28 | "from autogen.coding import DockerCommandLineCodeExecutor\n",
29 | "\n",
30 | "load_dotenv() # take environment variables from .env.\n",
31 | "\n",
32 | "llm_config={\"config_list\": [{\n",
33 | " \"model\": \"gpt-4-turbo\",\n",
34 | " \"cache\": None,\n",
35 | " \"temperature\": 0.9, \n",
36 | " \"api_key\": os.environ.get(\"OPENAI_API_KEY\")}]}\n",
37 | "\n",
38 | "\n",
39 | "# Create a temporary directory to store the code files.\n",
40 | "temp_dir = './tmp'\n",
41 | "\n",
42 | "docker_container_name = 'autogen'\n",
43 | "\n",
44 | "docker_executor = DockerCommandLineCodeExecutor(\n",
45 | " image=\"python:3.12-slim\", # Execute code using the given docker image name.\n",
46 | " container_name=docker_container_name, # Name of the Docker container.\n",
47 | " timeout=180, # Timeout for each code execution in seconds - 3 minutes\n",
48 | " work_dir=temp_dir, # Use the temporary directory to store the code files.\n",
49 | ")\n",
50 | "\n",
51 | "# Create an agent with code executor configuration that uses docker.\n",
52 | "code_executor_agent_using_docker = ConversableAgent(\n",
53 | " \"code_executor_agent_docker\",\n",
54 | " llm_config=False, # Turn off LLM for this agent.\n",
55 | " code_execution_config={\"executor\": docker_executor}, # Use the docker command line code executor.\n",
56 | " human_input_mode=\"NEVER\", # Change to ALWAYS to take human input for this agent for safety.\n",
57 | ")\n",
58 | "\n",
59 | "message_with_code_block = \"\"\"This is a message with code block.\n",
60 | "The code block is below:\n",
61 | "```shell\n",
62 | "pip install matplotlib numpy\n",
63 | "```\n",
64 | "This is the end of the message.\n",
65 | "\"\"\"\n",
66 | "\n",
67 | "reply = code_executor_agent_using_docker.generate_reply(messages=[{\"role\": \"user\", \"content\": message_with_code_block}])\n",
68 | "print(reply)\n",
69 | "\n",
70 | "\n",
71 | "message_with_code_block = \"\"\"This is a message with code block.\n",
72 | "The code block is below:\n",
73 | "```python\n",
74 | "import numpy as np\n",
75 | "import matplotlib.pyplot as plt\n",
76 | "x = range(100)\n",
77 | "y = np.random.randint(0, 100, 100)\n",
78 | "plt.plot(x, y)\n",
79 | "plt.savefig('line.png')\n",
80 | "print('Scatter plot saved to line.png')\n",
81 | "```\n",
82 | "This is the end of the message.\n",
83 | "\"\"\"\n",
84 | "\n",
85 | "reply = code_executor_agent_using_docker.generate_reply(messages=[{\"role\": \"user\", \"content\": message_with_code_block}])\n",
86 | "print(reply)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": []
95 | }
96 | ],
97 | "metadata": {
98 | "kernelspec": {
99 | "display_name": "venv",
100 | "language": "python",
101 | "name": "python3"
102 | },
103 | "language_info": {
104 | "codemirror_mode": {
105 | "name": "ipython",
106 | "version": 3
107 | },
108 | "file_extension": ".py",
109 | "mimetype": "text/x-python",
110 | "name": "python",
111 | "nbconvert_exporter": "python",
112 | "pygments_lexer": "ipython3",
113 | "version": "3.10.6"
114 | }
115 | },
116 | "nbformat": 4,
117 | "nbformat_minor": 2
118 | }
119 |
--------------------------------------------------------------------------------
/ai/autogen/requirements.txt:
--------------------------------------------------------------------------------
1 | pyautogen
2 | python-dotenv
3 | matplotlib
4 | numpy
5 | yfinance
6 | autogenstudio
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/chroma/39b238f5-b82f-42ff-a683-cc8d5aea4747/header.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/chroma/39b238f5-b82f-42ff-a683-cc8d5aea4747/header.bin
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/chroma/39b238f5-b82f-42ff-a683-cc8d5aea4747/length.bin:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/chroma/39b238f5-b82f-42ff-a683-cc8d5aea4747/link_lists.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/chroma/39b238f5-b82f-42ff-a683-cc8d5aea4747/link_lists.bin
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/chroma/chroma.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/chroma/chroma.sqlite3
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-01-introduction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-01-introduction.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-02-starting-line.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-02-starting-line.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-03-stay-lean.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-03-stay-lean.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-04-priorities.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-04-priorities.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-05-feature-selection.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-05-feature-selection.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-06-process.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-06-process.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-07-organization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-07-organization.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-08-staffing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-08-staffing.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-09-interface-design.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-09-interface-design.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-10-code.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-10-code.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-11-words.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-11-words.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-12-pricing-signup.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-12-pricing-signup.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-13-promotion.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-13-promotion.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-14-support.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-14-support.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-15-post-launch.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-15-post-launch.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-full.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/getting-real/getting-real-full.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/data/nba-rules-2023.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ai/langchain/langchain-rag-basics/data/nba-rules-2023.pdf
--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/requirements.txt:
--------------------------------------------------------------------------------
1 | langchain
2 | openai
3 | python-dotenv
4 | pypdf
5 | yt_dlp
6 | pydub
7 | bs4
8 | tiktoken
9 | langchain_openai
10 | chromadb
11 | PyPDF2
12 | lark
13 | scikit-learn
14 | panel
15 | docarray
--------------------------------------------------------------------------------
/ai/litellm/README.md:
--------------------------------------------------------------------------------
1 | ## Setup
2 |
3 | 1. Create and activate a virtual environment:
4 | ```bash
5 | python -m venv venv
6 | source venv/bin/activate # On Windows: venv\Scripts\activate
7 | ```
8 |
9 | 2. Install the required packages:
10 | ```bash
11 | pip install -r requirements.txt
12 | ```
13 |
14 | 3. Create a .env file with your API key:
15 | ```bash
16 | echo 'API_KEY="sk-xxxx"' > .env
17 | ```
18 |
19 | 4. Run the Python script:
20 | ```bash
21 | python deepseek_example.py
22 | ```
23 |
--------------------------------------------------------------------------------
/ai/litellm/deepseek_example.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from dotenv import load_dotenv
4 | from litellm import completion
5 |
6 | # Load environment variables from .env file
7 | load_dotenv()
8 |
9 | # Configure the API endpoint and key
10 | api_key = os.environ.get("API_KEY")
11 | api_base = "https://litellmud.takin.ai" # Hardcoded API base URL
12 | model = "litellm_proxy/ollama/deepseek-r1:7b" # Hardcoded model name
13 |
14 | # Verify API key is set
15 | if not api_key:
16 | print("Error: API_KEY must be set in .env file")
17 | sys.exit(1)
18 |
19 | def generate_response(prompt, stream=False):
20 | try:
21 | # Set up the request parameters
22 | params = {
23 | "model": model,
24 | "messages": [{"role": "user", "content": prompt}],
25 | "api_key": api_key,
26 | "api_base": api_base,
27 | "stream": stream,
28 | # Optional parameters
29 | "temperature": 0.6,
30 | "max_tokens": 500,
31 | }
32 |
33 | # Send the request
34 | if stream:
35 | # Return the streaming response generator
36 | return completion(**params)
37 | else:
38 | # Get the complete response
39 | response = completion(**params)
40 | return response.choices[0].message.content
41 |
42 | except Exception as e:
43 | print(f"Error generating response: {e}")
44 | return f"An error occurred: {str(e)}"
45 |
46 | def stream_response(prompt):
47 | """
48 | Stream a response from the model and print it to the console.
49 |
50 | Args:
51 | prompt (str): The user prompt to send to the model
52 | """
53 | try:
54 | print(f"\nPrompt: {prompt}")
55 | print("\nResponse: ", end="")
56 |
57 | # Get the streaming response
58 | response_stream = generate_response(prompt, stream=True)
59 |
60 | # Process and print each chunk
61 | for chunk in response_stream:
62 | content = chunk.choices[0].delta.content
63 | if content:
64 | sys.stdout.write(content)
65 | sys.stdout.flush()
66 |
67 | print("\n")
68 | except Exception as e:
69 | print(f"\nError streaming response: {e}")
70 |
71 | def main():
72 | prompt1 = "write a short poem about artificial intelligence"
73 | print(f"Prompt: {prompt1}")
74 | response1 = generate_response(prompt1)
75 | print(f"Response: {response1}")
76 |
77 | if __name__ == "__main__":
78 | main()
79 |
--------------------------------------------------------------------------------
/ai/litellm/requirements.txt:
--------------------------------------------------------------------------------
1 | litellm>=1.62.1
2 | python-dotenv>=1.0.0
3 |
--------------------------------------------------------------------------------
/ds/ab-testing/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | This is my revised code of the tutorial at https://towardsdatascience.com/the-math-behind-a-b-testing-with-example-code-part-1-of-2-7be752e1d06f.
4 |
5 | ## Setup
6 |
7 | do the following at the root of this project:
8 |
9 | ```
10 | cd ds/ab-testing
11 | python3 -m venv venv
12 | source venv/bin/activate
13 | pip install -r requirements.txt
14 | ```
15 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
16 |
17 |
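18 | The `utils` modules can also be used on their own. A minimal sketch, assuming you run it from inside the `ab-testing` folder (the parameter values are just examples):
19 |
20 | ```python
21 | from utils.data import generate_data
22 | from utils.stats import min_sample_size
23 |
24 | # minimum sample size to detect a 2-point lift over a 10% baseline conversion rate
25 | print(min_sample_size(bcr=0.10, mde=0.02))
26 |
27 | # simulate fake conversion data for a control (A) and a test (B) group
28 | df = generate_data(N_A=1000, N_B=1000, p_A=0.10, p_B=0.12)
29 | print(df.groupby('group')['converted'].mean())
30 | ```
31 |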
--------------------------------------------------------------------------------
/ds/ab-testing/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | matplotlib==3.1.2
3 | pandas
4 | scipy
5 |
--------------------------------------------------------------------------------
/ds/ab-testing/utils/data.py:
--------------------------------------------------------------------------------
1 | import scipy.stats as scs
2 | import pandas as pd
3 | # import numpy as np
4 |
5 |
6 | def generate_data(N_A, N_B, p_A, p_B, days=None, control_label='A',
7 | test_label='B'):
8 | """Returns a pandas dataframe with fake CTR data
9 |
10 | Example:
11 |
12 | Parameters:
13 | N_A (int): sample size for control group
14 | N_B (int): sample size for test group
15 | Note: final sample size may not match N_A provided because the
16 | group at each row is chosen at random (50/50).
17 | p_A (float): conversion rate; conversion rate of control group
18 | p_B (float): conversion rate; conversion rate of test group
19 | days (int): optional; if provided, a column for 'ts' will be included
20 | to divide the data in chunks of time
21 | Note: overflow data will be included in an extra day
22 | control_label (str)
23 | test_label (str)
24 |
25 | Returns:
26 | df (df)
27 | """
28 |
29 | # initiate empty container
30 | data = []
31 |
32 | # total amount of rows in the data
33 | N = N_A + N_B
34 |
35 | # distribute events based on proportion of group size
36 | group_bern = scs.bernoulli(N_A / (N_A + N_B))
37 |
38 | # initiate bernoulli distributions from which to randomly sample
39 | A_bern = scs.bernoulli(p_A)
40 | B_bern = scs.bernoulli(p_B)
41 |
42 | for idx in range(N):
43 | # initiate empty row
44 | row = {}
45 | # for 'ts' column
46 | if days is not None:
47 | if type(days) == int:
48 | row['ts'] = idx // (N // days)
49 | else:
50 | raise ValueError("Provide an integer for the days parameter.")
51 | # assign group based on 50/50 probability
52 | row['group'] = group_bern.rvs()
53 |
54 | if row['group'] == 0:
55 | # assign conversion based on provided parameters
56 | row['converted'] = A_bern.rvs()
57 | else:
58 | row['converted'] = B_bern.rvs()
59 | # collect row into data container
60 | data.append(row)
61 |
62 | # convert data into pandas dataframe
63 | df = pd.DataFrame(data)
64 |
65 | # transform group labels of 0s and 1s to user-defined group labels
66 | df['group'] = df['group'].apply(
67 | lambda x: control_label if x == 0 else test_label)
68 |
69 | return df
70 |
--------------------------------------------------------------------------------
/ds/ab-testing/utils/stats.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.stats as scs
3 |
4 |
5 | def pooled_prob(N_A, N_B, X_A, X_B):
6 | """Returns pooled probability for two samples"""
7 | return (X_A + X_B) / (N_A + N_B)
8 |
9 |
10 | def pooled_SE(N_A, N_B, X_A, X_B):
11 | """Returns the pooled standard error for two samples"""
12 | p_hat = pooled_prob(N_A, N_B, X_A, X_B)
13 | SE = np.sqrt(p_hat * (1 - p_hat) * (1 / N_A + 1 / N_B))
14 | return SE
15 |
16 |
17 | def confidence_interval(sample_mean=0, sample_std=1, sample_size=1,
18 | sig_level=0.05):
19 | """Returns the confidence interval as a tuple"""
20 | z = z_val(sig_level)
21 |
22 | left = sample_mean - z * sample_std / np.sqrt(sample_size)
23 | right = sample_mean + z * sample_std / np.sqrt(sample_size)
24 |
25 | return (left, right)
26 |
27 |
28 | def z_val(sig_level=0.05, two_tailed=True):
29 | """Returns the z value for a given significance level"""
30 | z_dist = scs.norm()
31 | if two_tailed:
32 | sig_level = sig_level/2
33 | area = 1 - sig_level
34 | else:
35 | area = 1 - sig_level
36 |
37 | z = z_dist.ppf(area)
38 |
39 | return z
40 |
41 |
42 | def ab_dist(stderr, d_hat=0, group_type='control'):
43 | """Returns a distribution object depending on group type
44 |
45 | Examples:
46 |
47 | Parameters:
48 | stderr (float): pooled standard error of two independent samples
49 | d_hat (float): the mean difference between two independent samples
50 | group_type (string): 'control' and 'test' are supported
51 |
52 | Returns:
53 | dist (scipy.stats distribution object)
54 | """
55 | if group_type == 'control':
56 | sample_mean = 0
57 |
58 | elif group_type == 'test':
59 | sample_mean = d_hat
60 |
61 | # create a normal distribution which is dependent on mean and std dev
62 | dist = scs.norm(sample_mean, stderr)
63 | return dist
64 |
65 |
66 | def min_sample_size(bcr, mde, power=0.8, sig_level=0.05):
67 | """Returns the minimum sample size to set up a split test
68 |
69 | Arguments:
70 | bcr (float): probability of success for control, sometimes
71 | referred to as baseline conversion rate
72 |
73 | mde (float): minimum change in measurement between control
74 | group and test group if alternative hypothesis is true, sometimes
75 | referred to as minimum detectable effect
76 |
77 | power (float): probability of rejecting the null hypothesis when the
78 | null hypothesis is false, typically 0.8
79 |
80 | sig_level (float): significance level often denoted as alpha,
81 | typically 0.05
82 |
83 | Returns:
84 | min_N: minimum sample size (float)
85 |
86 | References:
87 | Stanford lecture on sample sizes
88 | http://statweb.stanford.edu/~susan/courses/s141/hopower.pdf
89 | """
90 | # standard normal distribution to determine z-values
91 | standard_norm = scs.norm(0, 1)
92 |
93 | # find Z_beta from desired power
94 | Z_beta = standard_norm.ppf(power)
95 |
96 | # find Z_alpha
97 | Z_alpha = standard_norm.ppf(1-sig_level/2)
98 |
99 | # average of probabilities from both groups
100 | pooled_prob = (bcr + bcr+mde) / 2
101 |
102 | min_N = (2 * pooled_prob * (1 - pooled_prob) * (Z_beta + Z_alpha)**2
103 | / mde**2)
104 |
105 | return min_N
106 |
107 |
108 | def p_val(N_A, N_B, p_A, p_B):
109 | """Returns the p-value for an A/B test"""
110 | return scs.binom(N_A, p_A).pmf(p_B * N_B)
111 |
--------------------------------------------------------------------------------
/ds/airflow/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | This is my revised code of the tutorial at: https://medium.com/abn-amro-developer/data-pipeline-orchestration-on-steroids-apache-airflow-tutorial-part-1-87361905db6d
4 |
5 |
6 | ## Setup
7 | I assume you have Airflow (2.1.0) running locally on your Mac after running the commands at https://airflow.apache.org/docs/apache-airflow/stable/start/local.html:
8 |
9 | ```
10 | export AIRFLOW_HOME=~/airflow
11 | pip install apache-airflow
12 | airflow db init
13 | airflow users create \
14 | --username admin \
15 | --firstname Harry \
16 | --lastname Wang \
17 | --role Admin \
18 | --email harryjwang@gmail.com
19 |
20 | # start the web server, default port is 8080
21 | airflow webserver --port 8080
22 |
23 | # start the scheduler
24 | # open a new terminal or else run webserver with ``-D`` option to run it as a daemon
25 | airflow scheduler
26 |
27 | ```
28 |
29 | Now, you can access Airflow at http://localhost:8080.
30 |
31 | Open `/Users/harrywang/airflow/airflow.cfg` to find the folder that holds your DAG Python files: `dags_folder = /Users/harrywang/airflow/dags` - you may need to create this folder.
32 |
33 | `simple_bash_dag.py` is a simple DAG that creates an empty txt file and then renames it - two tasks in sequence.
34 |
35 | Copy the DAG file to the DAGs folder, then run `airflow scheduler` again (Airflow does not auto-refresh; you have to rerun this command to pick up newly added DAG files). You should then see the DAG in the UI.
36 |
37 | You can trigger the DAG from the Airflow web UI or from the command line with `airflow dags trigger simple_bash_dag`.
38 |
--------------------------------------------------------------------------------
/ds/airflow/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 |
--------------------------------------------------------------------------------
/ds/airflow/simple_bash_dag.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from airflow import DAG
3 | from airflow.operators.bash_operator import BashOperator
4 |
5 |
6 | default_args = {
7 | 'owner': 'Harry Wang',
8 | 'depends_on_past': False,
9 | 'start_date': datetime(2021, 6, 1),
10 | 'email': ['harryjwang@gmail.com'],
11 | 'email_on_failure': False,
12 | 'email_on_retry': False,
13 | # In case of errors, do one retry
14 | 'retries': 1,
15 | # Do the retry with 30 seconds delay after the error
16 | 'retry_delay': timedelta(seconds=30),
17 | # Run once every 15 minutes
18 | 'schedule_interval': '*/15 * * * *'
19 | }
20 |
21 | with DAG(
22 | dag_id='simple_bash_dag',
23 | default_args=default_args,
24 | schedule_interval=None,
25 | tags=['my_dags'],
26 | ) as dag:
27 | t1 = BashOperator(
28 | bash_command="touch ~/my_bash_file.txt",
29 | task_id="create_file"
30 | )
31 | t2 = BashOperator(
32 | bash_command="mv ~/my_bash_file.txt ~/my_bash_file_changed.txt",
33 | task_id="change_file_name"
34 | )
35 | t1 >> t2  # t2 depends on t1
36 |
--------------------------------------------------------------------------------
/ds/aws-pyspark/README.md:
--------------------------------------------------------------------------------
1 | Code for tutorial at https://towardsdatascience.com/getting-started-with-pyspark-on-amazon-emr-c85154b6b921
2 |
3 |
--------------------------------------------------------------------------------
/ds/aws-pyspark/emr_bootstrap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | sudo pip install -U matplotlib pandas
--------------------------------------------------------------------------------
/ds/cohort-analysis/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | revised version of https://www.kaggle.com/mahmoudelfahl/cohort-analysis-customer-segmentation-with-rfm
4 |
5 | ## Setup
6 |
7 | go to the tutorial folder and do the following:
8 |
9 | ```
10 | python3 -m venv venv
11 | source venv/bin/activate
12 | pip install -r requirements.txt
13 | ```
14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
15 |
16 |
--------------------------------------------------------------------------------
/ds/cohort-analysis/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | matplotlib
3 | numpy
4 | seaborn
--------------------------------------------------------------------------------
/ds/dask/.gitignore:
--------------------------------------------------------------------------------
1 | data/
--------------------------------------------------------------------------------
/ds/dask/README.md:
--------------------------------------------------------------------------------
1 | ## Dask tutorial
2 |
3 | This is my code of the tutorial at https://docs.dask.org/en/stable/10-minutes-to-dask.html
4 |
5 |
6 |
7 | ## Setup
8 |
9 | ```
10 | brew install graphviz
11 | ```
12 |
13 | within the tutorial folder:
14 |
15 | ```
16 | python3 -m venv venv
17 | source venv/bin/activate
18 | pip install -r requirements.txt
19 | ```
20 | Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks.
21 |
22 |
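23 | As a quick smoke test of the environment, a tiny Dask array example (a minimal sketch of the lazy-evaluation idea the notebooks cover):
24 |
25 | ```python
26 | import dask.array as da
27 |
28 | # a lazy 10,000 x 10,000 random array split into 1,000 x 1,000 chunks
29 | x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))
30 | print(x.mean().compute())  # nothing runs until .compute() is called
31 | ```
32 |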
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/global.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/global.lock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/purge.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/purge.lock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-3y9yh5wc.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-3y9yh5wc.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-5u5lbrxx.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-5u5lbrxx.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-82zb8rgu.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-82zb8rgu.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-9wl7s6m3.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-9wl7s6m3.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-_n7kuuyd.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-_n7kuuyd.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-bbjm31ih.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-bbjm31ih.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-fwxxmool.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-fwxxmool.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-l28a891y.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-l28a891y.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-l8y7v2oj.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-l8y7v2oj.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-lckuq0ub.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-lckuq0ub.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-ofkwc26n.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-ofkwc26n.dirlock
--------------------------------------------------------------------------------
/ds/dask/dask-worker-space/worker-wuu54xyo.dirlock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/dask-worker-space/worker-wuu54xyo.dirlock
--------------------------------------------------------------------------------
/ds/dask/mydask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/dask/mydask.png
--------------------------------------------------------------------------------
/ds/dask/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | dask[distributed]
3 | pandas
4 | bokeh>=2.1.1
5 | graphviz
6 | s3fs
7 |
--------------------------------------------------------------------------------
/ds/data-driven-growth/.gitignore:
--------------------------------------------------------------------------------
1 | /data/*.csv
--------------------------------------------------------------------------------
/ds/data-driven-growth/README.md:
--------------------------------------------------------------------------------
1 | ## DATA DRIVEN GROWTH WITH PYTHON
2 |
3 | This is my revised code of the tutorial at https://towardsdatascience.com/data-driven-growth-with-python-part-1-know-your-metrics-812781e66a5b.
4 |
5 | ## Setup
6 |
7 | do the following within the tutorial folder (`ds/data-driven-growth`):
8 |
9 | ```
10 | python3 -m venv venv
11 | source venv/bin/activate
12 | pip install -r requirements.txt
13 | ```
14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
15 |
16 |
--------------------------------------------------------------------------------
/ds/data-driven-growth/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | matplotlib
3 | pandas
4 | scipy
5 | seaborn
6 |
--------------------------------------------------------------------------------
/ds/data-driven-growth/utils/data.py:
--------------------------------------------------------------------------------
1 | import scipy.stats as scs
2 | import pandas as pd
3 | # import numpy as np
4 |
5 |
6 | def generate_data(N_A, N_B, p_A, p_B, days=None, control_label='A',
7 | test_label='B'):
8 | """Returns a pandas dataframe with fake CTR data
9 |
10 | Example:
11 |
12 | Parameters:
13 | N_A (int): sample size for control group
14 | N_B (int): sample size for test group
15 | Note: final sample size may not match N_A provided because the
16 | group at each row is chosen at random (50/50).
17 | p_A (float): conversion rate; conversion rate of control group
18 | p_B (float): conversion rate; conversion rate of test group
19 | days (int): optional; if provided, a column for 'ts' will be included
20 | to divide the data in chunks of time
21 | Note: overflow data will be included in an extra day
22 | control_label (str)
23 | test_label (str)
24 |
25 | Returns:
26 | df (df)
27 | """
28 |
29 | # initiate empty container
30 | data = []
31 |
32 | # total amount of rows in the data
33 | N = N_A + N_B
34 |
35 | # distribute events based on proportion of group size
36 | group_bern = scs.bernoulli(N_A / (N_A + N_B))
37 |
38 | # initiate bernoulli distributions from which to randomly sample
39 | A_bern = scs.bernoulli(p_A)
40 | B_bern = scs.bernoulli(p_B)
41 |
42 | for idx in range(N):
43 | # initiate empty row
44 | row = {}
45 | # for 'ts' column
46 | if days is not None:
47 | if type(days) == int:
48 | row['ts'] = idx // (N // days)
49 | else:
50 | raise ValueError("Provide an integer for the days parameter.")
51 | # assign group based on 50/50 probability
52 | row['group'] = group_bern.rvs()
53 |
54 | if row['group'] == 0:
55 | # assign conversion based on provided parameters
56 | row['converted'] = A_bern.rvs()
57 | else:
58 | row['converted'] = B_bern.rvs()
59 | # collect row into data container
60 | data.append(row)
61 |
62 | # convert data into pandas dataframe
63 | df = pd.DataFrame(data)
64 |
65 | # transform group labels of 0s and 1s to user-defined group labels
66 | df['group'] = df['group'].apply(
67 | lambda x: control_label if x == 0 else test_label)
68 |
69 | return df
70 |
--------------------------------------------------------------------------------
/ds/data-driven-growth/utils/stats.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.stats as scs
3 |
4 |
5 | def pooled_prob(N_A, N_B, X_A, X_B):
6 | """Returns pooled probability for two samples"""
7 | return (X_A + X_B) / (N_A + N_B)
8 |
9 |
10 | def pooled_SE(N_A, N_B, X_A, X_B):
11 | """Returns the pooled standard error for two samples"""
12 | p_hat = pooled_prob(N_A, N_B, X_A, X_B)
13 | SE = np.sqrt(p_hat * (1 - p_hat) * (1 / N_A + 1 / N_B))
14 | return SE
15 |
16 |
17 | def confidence_interval(sample_mean=0, sample_std=1, sample_size=1,
18 | sig_level=0.05):
19 | """Returns the confidence interval as a tuple"""
20 | z = z_val(sig_level)
21 |
22 | left = sample_mean - z * sample_std / np.sqrt(sample_size)
23 | right = sample_mean + z * sample_std / np.sqrt(sample_size)
24 |
25 | return (left, right)
26 |
27 |
28 | def z_val(sig_level=0.05, two_tailed=True):
29 | """Returns the z value for a given significance level"""
30 | z_dist = scs.norm()
31 | if two_tailed:
32 | sig_level = sig_level/2
33 | area = 1 - sig_level
34 | else:
35 | area = 1 - sig_level
36 |
37 | z = z_dist.ppf(area)
38 |
39 | return z
40 |
41 |
42 | def ab_dist(stderr, d_hat=0, group_type='control'):
43 | """Returns a distribution object depending on group type
44 |
45 | Examples:
46 |
47 | Parameters:
48 | stderr (float): pooled standard error of two independent samples
49 | d_hat (float): the mean difference between two independent samples
50 | group_type (string): 'control' and 'test' are supported
51 |
52 | Returns:
53 | dist (scipy.stats distribution object)
54 | """
55 | if group_type == 'control':
56 | sample_mean = 0
57 |
58 | elif group_type == 'test':
59 | sample_mean = d_hat
60 |
61 | # create a normal distribution which is dependent on mean and std dev
62 | dist = scs.norm(sample_mean, stderr)
63 | return dist
64 |
65 |
66 | def min_sample_size(bcr, mde, power=0.8, sig_level=0.05):
67 | """Returns the minimum sample size to set up a split test
68 |
69 | Arguments:
70 | bcr (float): probability of success for control, sometimes
71 | referred to as baseline conversion rate
72 |
73 | mde (float): minimum change in measurement between control
74 | group and test group if alternative hypothesis is true, sometimes
75 | referred to as minimum detectable effect
76 |
77 | power (float): probability of rejecting the null hypothesis when the
78 | null hypothesis is false, typically 0.8
79 |
80 | sig_level (float): significance level often denoted as alpha,
81 | typically 0.05
82 |
83 | Returns:
84 | min_N: minimum sample size (float)
85 |
86 | References:
87 | Stanford lecture on sample sizes
88 | http://statweb.stanford.edu/~susan/courses/s141/hopower.pdf
89 | """
90 | # standard normal distribution to determine z-values
91 | standard_norm = scs.norm(0, 1)
92 |
93 | # find Z_beta from desired power
94 | Z_beta = standard_norm.ppf(power)
95 |
96 | # find Z_alpha
97 | Z_alpha = standard_norm.ppf(1-sig_level/2)
98 |
99 | # average of probabilities from both groups
100 | pooled_prob = (bcr + bcr+mde) / 2
101 |
102 | min_N = (2 * pooled_prob * (1 - pooled_prob) * (Z_beta + Z_alpha)**2
103 | / mde**2)
104 |
105 | return min_N
106 |
107 |
108 | def p_val(N_A, N_B, p_A, p_B):
109 | """Returns the p-value for an A/B test"""
110 | return scs.binom(N_A, p_A).pmf(p_B * N_B)
111 |
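A quick usage sketch for the helpers above (my own illustration, not part of the original file; it assumes you run it from the project folder so `utils.stats` is importable, and the numbers are invented):

```python
# illustrative usage of the helpers defined in utils/stats.py (invented numbers)
from utils.stats import confidence_interval, min_sample_size, pooled_SE

bcr, mde = 0.10, 0.02          # baseline conversion rate and minimum detectable effect
n = min_sample_size(bcr, mde)  # per-group sample size for 80% power at alpha = 0.05
print(f"minimum sample size per group: {n:.0f}")

# 95% confidence interval around an observed lift, using the pooled standard error
N_A, N_B, X_A, X_B = 5000, 5000, 510, 575
se = pooled_SE(N_A, N_B, X_A, X_B)
d_hat = X_B / N_B - X_A / N_A
print(confidence_interval(sample_mean=d_hat, sample_std=se, sample_size=1))
```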
--------------------------------------------------------------------------------
/ds/diff-in-diff/Panel101.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/diff-in-diff/Panel101.dta
--------------------------------------------------------------------------------
/ds/diff-in-diff/README.md:
--------------------------------------------------------------------------------
1 | ## Python statsmodels tutorial
2 |
3 | This is my code for the tutorial at https://medium.com/@sadhaverajasekar/diff-in-diff-testing-python-f24835330bc8
4 |
5 | ## Setup
6 |
7 | within the tutorial folder:
8 |
9 | ```
10 | python3 -m venv venv
11 | source venv/bin/activate
12 | pip install -r requirements.txt
13 | ```
14 | Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks.
15 |
16 |
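For a quick reminder of what the notebooks do, here is a minimal difference-in-differences sketch with `statsmodels` (toy data and column names are my own, not from the tutorial):

```python
import pandas as pd
import statsmodels.formula.api as smf

# toy panel: outcome y, treatment-group flag, and post-period flag
df = pd.DataFrame({
    "y":       [10, 11, 12, 13, 20, 21, 30, 34],
    "treated": [0, 0, 1, 1, 0, 0, 1, 1],
    "post":    [0, 0, 0, 0, 1, 1, 1, 1],
})

# the coefficient on treated:post is the diff-in-diff estimate
model = smf.ols("y ~ treated * post", data=df).fit()
print(model.summary())
```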
--------------------------------------------------------------------------------
/ds/diff-in-diff/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | scikit-learn
3 | statsmodels
4 |
--------------------------------------------------------------------------------
/ds/dvc/.gitignore:
--------------------------------------------------------------------------------
1 | /data
--------------------------------------------------------------------------------
/ds/dvc/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | This tutorial needs a separate repo, so visit https://github.com/harrywang/dvc for the code for tutorial at
4 | https://realpython.com/python-data-version-control/
--------------------------------------------------------------------------------
/ds/hypo-testing/README.md:
--------------------------------------------------------------------------------
1 | ## Python hypothesis testing tutorial
2 |
3 | This is my code for the tutorial at https://towardsdatascience.com/hypothesis-testing-in-machine-learning-using-python-a0dc89e169ce
4 |
5 | ## Setup
6 |
7 | within the tutorial folder:
8 |
9 | ```
10 | python3 -m venv venv
11 | source venv/bin/activate
12 | pip install -r requirements.txt
13 | ```
14 | Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks.
15 |
16 |
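As a taste of what the notebook covers, a minimal paired t-test on `blood-pressure.csv` (illustrative only; see `hypo-testing.ipynb` for the full walkthrough):

```python
import pandas as pd
from scipy import stats

df = pd.read_csv("blood-pressure.csv")

# paired t-test: did blood pressure change after the treatment?
t_stat, p_value = stats.ttest_rel(df["bp_before"], df["bp_after"])
print(f"t = {t_stat:.3f}, p = {p_value:.4f}")
```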
--------------------------------------------------------------------------------
/ds/hypo-testing/blood-pressure.csv:
--------------------------------------------------------------------------------
1 | patient,sex,agegrp,bp_before,bp_after
2 | 1,Male,30-45,143,153
3 | 2,Male,30-45,163,170
4 | 3,Male,30-45,153,168
5 | 4,Male,30-45,153,142
6 | 5,Male,30-45,146,141
7 | 6,Male,30-45,150,147
8 | 7,Male,30-45,148,133
9 | 8,Male,30-45,153,141
10 | 9,Male,30-45,153,131
11 | 10,Male,30-45,158,125
12 | 11,Male,30-45,149,164
13 | 12,Male,30-45,173,159
14 | 13,Male,30-45,165,135
15 | 14,Male,30-45,145,159
16 | 15,Male,30-45,143,153
17 | 16,Male,30-45,152,126
18 | 17,Male,30-45,141,162
19 | 18,Male,30-45,176,134
20 | 19,Male,30-45,143,136
21 | 20,Male,30-45,162,150
22 | 21,Male,46-59,149,168
23 | 22,Male,46-59,156,155
24 | 23,Male,46-59,151,136
25 | 24,Male,46-59,159,132
26 | 25,Male,46-59,164,160
27 | 26,Male,46-59,154,160
28 | 27,Male,46-59,152,136
29 | 28,Male,46-59,142,183
30 | 29,Male,46-59,162,152
31 | 30,Male,46-59,155,162
32 | 31,Male,46-59,175,151
33 | 32,Male,46-59,184,139
34 | 33,Male,46-59,167,175
35 | 34,Male,46-59,148,184
36 | 35,Male,46-59,170,151
37 | 36,Male,46-59,159,171
38 | 37,Male,46-59,149,157
39 | 38,Male,46-59,140,159
40 | 39,Male,46-59,185,140
41 | 40,Male,46-59,160,174
42 | 41,Male,60+,157,167
43 | 42,Male,60+,158,158
44 | 43,Male,60+,162,168
45 | 44,Male,60+,160,159
46 | 45,Male,60+,180,153
47 | 46,Male,60+,155,164
48 | 47,Male,60+,172,169
49 | 48,Male,60+,157,148
50 | 49,Male,60+,171,185
51 | 50,Male,60+,170,163
52 | 51,Male,60+,175,146
53 | 52,Male,60+,175,160
54 | 53,Male,60+,172,175
55 | 54,Male,60+,173,163
56 | 55,Male,60+,170,185
57 | 56,Male,60+,164,146
58 | 57,Male,60+,147,176
59 | 58,Male,60+,154,147
60 | 59,Male,60+,172,161
61 | 60,Male,60+,162,164
62 | 61,Female,30-45,152,149
63 | 62,Female,30-45,147,142
64 | 63,Female,30-45,144,146
65 | 64,Female,30-45,144,138
66 | 65,Female,30-45,158,131
67 | 66,Female,30-45,147,145
68 | 67,Female,30-45,154,134
69 | 68,Female,30-45,151,135
70 | 69,Female,30-45,149,131
71 | 70,Female,30-45,138,135
72 | 71,Female,30-45,162,133
73 | 72,Female,30-45,157,135
74 | 73,Female,30-45,141,168
75 | 74,Female,30-45,167,144
76 | 75,Female,30-45,147,147
77 | 76,Female,30-45,143,151
78 | 77,Female,30-45,142,149
79 | 78,Female,30-45,166,147
80 | 79,Female,30-45,147,149
81 | 80,Female,30-45,142,135
82 | 81,Female,46-59,157,127
83 | 82,Female,46-59,170,150
84 | 83,Female,46-59,150,138
85 | 84,Female,46-59,150,147
86 | 85,Female,46-59,167,157
87 | 86,Female,46-59,154,146
88 | 87,Female,46-59,143,148
89 | 88,Female,46-59,157,136
90 | 89,Female,46-59,149,146
91 | 90,Female,46-59,161,132
92 | 91,Female,46-59,142,145
93 | 92,Female,46-59,162,132
94 | 93,Female,46-59,144,157
95 | 94,Female,46-59,142,140
96 | 95,Female,46-59,159,137
97 | 96,Female,46-59,140,154
98 | 97,Female,46-59,144,169
99 | 98,Female,46-59,142,145
100 | 99,Female,46-59,145,137
101 | 100,Female,46-59,145,143
102 | 101,Female,60+,168,178
103 | 102,Female,60+,142,141
104 | 103,Female,60+,147,149
105 | 104,Female,60+,148,148
106 | 105,Female,60+,162,138
107 | 106,Female,60+,170,143
108 | 107,Female,60+,173,167
109 | 108,Female,60+,151,158
110 | 109,Female,60+,155,152
111 | 110,Female,60+,163,154
112 | 111,Female,60+,183,161
113 | 112,Female,60+,159,143
114 | 113,Female,60+,148,159
115 | 114,Female,60+,151,177
116 | 115,Female,60+,165,142
117 | 116,Female,60+,152,152
118 | 117,Female,60+,161,152
119 | 118,Female,60+,165,174
120 | 119,Female,60+,149,151
121 | 120,Female,60+,185,163
122 |
--------------------------------------------------------------------------------
/ds/hypo-testing/chi-test.csv:
--------------------------------------------------------------------------------
1 | Gender,Shopping
2 | Male,No
3 | Female,Yes
4 | Male,Yes
5 | Female,Yes
6 | Female,Yes
7 | Male,Yes
8 | Male,No
9 | Female,No
10 | Female,No
11 |
--------------------------------------------------------------------------------
/ds/hypo-testing/crop-yield.csv:
--------------------------------------------------------------------------------
1 | Fert,Water,Yield
2 | A,High,27.4
3 | A,High,33.6
4 | A,High,29.8
5 | A,High,35.2
6 | A,High,33
7 | B,High,34.8
8 | B,High,27
9 | B,High,30.2
10 | B,High,30.8
11 | B,High,26.4
12 | A,Low,32
13 | A,Low,32.2
14 | A,Low,26
15 | A,Low,33.4
16 | A,Low,26.4
17 | B,Low,26.8
18 | B,Low,23.2
19 | B,Low,29.4
20 | B,Low,19.4
21 | B,Low,23.8
--------------------------------------------------------------------------------
/ds/hypo-testing/plant-growth.csv:
--------------------------------------------------------------------------------
1 | "","weight","group"
2 | "1",4.17,"ctrl"
3 | "2",5.58,"ctrl"
4 | "3",5.18,"ctrl"
5 | "4",6.11,"ctrl"
6 | "5",4.5,"ctrl"
7 | "6",4.61,"ctrl"
8 | "7",5.17,"ctrl"
9 | "8",4.53,"ctrl"
10 | "9",5.33,"ctrl"
11 | "10",5.14,"ctrl"
12 | "11",4.81,"trt1"
13 | "12",4.17,"trt1"
14 | "13",4.41,"trt1"
15 | "14",3.59,"trt1"
16 | "15",5.87,"trt1"
17 | "16",3.83,"trt1"
18 | "17",6.03,"trt1"
19 | "18",4.89,"trt1"
20 | "19",4.32,"trt1"
21 | "20",4.69,"trt1"
22 | "21",6.31,"trt2"
23 | "22",5.12,"trt2"
24 | "23",5.54,"trt2"
25 | "24",5.5,"trt2"
26 | "25",5.37,"trt2"
27 | "26",5.29,"trt2"
28 | "27",4.92,"trt2"
29 | "28",6.15,"trt2"
30 | "29",5.8,"trt2"
31 | "30",5.26,"trt2"
32 |
--------------------------------------------------------------------------------
/ds/hypo-testing/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | statsmodels
3 |
--------------------------------------------------------------------------------
/ds/inside-airbnb/.idea/inside-airbnb.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/ds/inside-airbnb/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/ds/inside-airbnb/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/ds/inside-airbnb/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | Code to download the images by using the image URLs provided in the NYC data from
4 | http://insideairbnb.com/get-the-data.html and save those images into local folders.
5 |
6 | # Setup and Run
7 |
8 | Python 3
9 |
10 | - create virtual environment: `$virtualenv -p python3 venv`
11 | - activate virtual env: `$source venv/bin/activate`
12 | - install required packages: `pip3 install -r requirements.txt`
13 |
14 | To run (NOTE: for the NYC dataset, it took about 4.5 hours to download the ~ 45,000 images):
15 |
16 | 1. copy the real listings csv file to /data/, then in the script comment out the testing-data line and uncomment `listings = pd.read_csv('data/listings.csv')`
17 | 2. `python3 get-photos.py` will download and save the images to the /data/images/ folder
18 |
19 |
20 |
21 | Use Katalon Recorder (Selenium IDE for Chrome) to help test the css selector:
22 |
23 | Install at: https://chrome.google.com/webstore/detail/katalon-recorder-selenium/
24 |
25 | Then enter the css selector as below and search to see whether you can find the element you need:
26 |
27 |
28 |
29 | Example: open https://www.airbnb.com/rooms/18461891 in chrome and search css=button[data-veloute='hero-view-photos-button'], the View Photo button should be highlighted
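If you prefer to check the selector from Python instead of Katalon Recorder, a rough Selenium sketch (Selenium 4 API; needs a matching chromedriver and is not part of the download scripts above):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # assumes chromedriver is on your PATH
driver.get("https://www.airbnb.com/rooms/18461891")

# same selector as above; raises NoSuchElementException if the button is missing
button = driver.find_element(By.CSS_SELECTOR, "button[data-veloute='hero-view-photos-button']")
print(button.text)
driver.quit()
```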
--------------------------------------------------------------------------------
/ds/inside-airbnb/add-columns.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | listings = pd.read_csv('data/nyc-listings.csv') # testing data
4 | # listings = pd.read_csv('data/listings.csv')
5 | listings.info()
6 |
7 | listings["total_photos"] = 0
8 |
9 | # the original list used string not boolean
10 | listings["photo_downloaded"] = "f"
11 | listings["host_photo_downloaded"] = "f"
12 |
13 | listings.to_csv('data/nyc-listings_new.csv', encoding='utf-8', index=False)
14 |
15 | print("csv file processed")
--------------------------------------------------------------------------------
/ds/inside-airbnb/get-one-photo.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from urllib.error import HTTPError
3 | from urllib.request import urlretrieve
4 |
5 | listings = pd.read_csv('data/nyc-listings.csv') # testing data
6 | # listings = pd.read_csv('data/listings.csv')
7 | listings.info()
8 | count = 0 # count for successful downloads
9 | err_count = 0 # error counts
10 | for index, row in listings.iterrows():
11 | # print(row["id"], row["xl_picture_url"])
12 | # check whether the URL exists
13 | if pd.isnull(row["xl_picture_url"]):
14 | url = row["picture_url"]
15 | else:
16 | url = row["xl_picture_url"]
17 |
18 | try:
19 | urlretrieve(url, "./data/images/" + str(row["id"]) + ".jpg")
20 | count += 1
21 | print("downloading " + str(row["id"]))
22 | except FileNotFoundError as err:
23 | print(err) # something wrong with local path
24 | except HTTPError as err:
25 | print(err) # something wrong with url
26 | except:
27 | print("something is wrong, skipped one line")
28 |
29 |
30 | print("downloading complete with " + str(count) + " images.")
--------------------------------------------------------------------------------
/ds/inside-airbnb/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | selenium
--------------------------------------------------------------------------------
/ds/matplotlib/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | Grouped bar chart with percentage change using matplotlib. Excel version is at https://www.excelcampus.com/charts/column-chart-percentage-change/
4 |
5 | ## Setup
6 |
7 | go to the tutorial folder and do the following:
8 |
9 | ```
10 | python3 -m venv venv
11 | source venv/bin/activate
12 | pip install -r requirements.txt
13 | ```
14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
15 |
16 |
--------------------------------------------------------------------------------
/ds/matplotlib/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter>=1.0.0
2 | matplotlib
3 | numpy
4 | seaborn
--------------------------------------------------------------------------------
/ds/multi-armed-bandit/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | My revised code based on:
4 |
5 | https://towardsdatascience.com/solving-multiarmed-bandits-a-comparison-of-epsilon-greedy-and-thompson-sampling-d97167ca9a50
6 |
7 | https://github.com/conormm/bandit_algorithms/blob/master/bandits_post_code.py
8 |
9 |
10 | # Setup
11 |
12 | Setup virtual environment and install packages:
13 | ```
14 | python3 -m venv venv
15 | source venv/bin/activate
16 | pip install -r requirements.txt
17 | ```
18 |
19 | then, open the notebook
20 |
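For intuition, a bare-bones epsilon-greedy loop (my own toy sketch; the notebook compares epsilon-greedy and Thompson sampling properly):

```python
import numpy as np

rng = np.random.default_rng(42)
true_rates = [0.05, 0.10, 0.02]  # unknown conversion rate of each arm
counts = np.zeros(3)             # pulls per arm
values = np.zeros(3)             # estimated reward per arm
eps = 0.1

for _ in range(10_000):
    # explore with probability eps, otherwise exploit the current best estimate
    arm = rng.integers(3) if rng.random() < eps else int(np.argmax(values))
    reward = float(rng.random() < true_rates[arm])
    counts[arm] += 1
    values[arm] += (reward - values[arm]) / counts[arm]  # incremental mean update

print("estimated rates:", values.round(3), "pulls per arm:", counts)
```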
--------------------------------------------------------------------------------
/ds/multi-armed-bandit/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | matplotlib
3 | numpy
4 | pandas
5 | scipy
6 | tqdm
--------------------------------------------------------------------------------
/ds/pymongo/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | https://realpython.com/introduction-to-mongodb-and-python/
4 |
5 | ## Setup
6 |
7 | Install MongoDB:
8 |
9 | ```
10 | brew tap mongodb/brew
11 | brew install mongodb-community@5.0
12 | ```
13 |
14 | run/stop as a service
15 |
16 | ```
17 | brew services start mongodb-community@5.0
18 | brew services stop mongodb-community@5.0
19 | ```
20 |
21 | connect:
22 |
23 | ```
24 | mongosh
25 | ```
26 |
27 | create a new db
28 |
29 | ```
30 | use rptutorials
31 | show dbs
32 | db
33 | ```
34 |
35 | create a collection (table) using dot notation
36 |
37 | ```
38 | db.tutorial
39 | ```
40 |
41 | document (table row)
42 |
43 | When you’re building a MongoDB application, your most important decision is probably the structure of your documents. In other words, you’ll have to decide which fields and values your documents will have.
44 |
45 | insert a document:
46 |
47 | ```
48 | db.tutorial.insertOne(
49 | {
50 | "title": "Reading and Writing CSV Files in Python",
51 | "author": "Jon",
52 | "contributors": [
53 | "Aldren",
54 | "Geir Arne",
55 | "Joanna",
56 | "Jason"
57 | ],
58 | "url": "https://realpython.com/python-csv/"
59 | }
60 | )
61 |
62 | db.tutorial.insertOne(
63 | {
64 | "title": "Python 3's f-Strings: An Improved String Formatting Syntax",
65 | "author": "Joanna",
66 | "contributors": [
67 | "Adriana",
68 | "David",
69 | "Dan",
70 | "Jim",
71 | "Pavel"
72 | ],
73 | "url": "https://realpython.com/python-f-strings/"
74 | }
75 | )
76 | ```
77 |
78 | find
79 | ```
80 | db.tutorial.find()
81 | db.tutorial.find({author: "Joanna"})
82 | ```
83 |
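The same insert/find flow from Python with `pymongo` (a minimal sketch assuming MongoDB is running locally on the default port):

```python
from pymongo import MongoClient

client = MongoClient(host="localhost", port=27017)
db = client.rptutorials  # same database as above

tutorial = {
    "title": "Reading and Writing CSV Files in Python",
    "author": "Jon",
    "url": "https://realpython.com/python-csv/",
}
result = db.tutorial.insert_one(tutorial)
print("inserted id:", result.inserted_id)

for doc in db.tutorial.find({"author": "Jon"}):
    print(doc["title"])
```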
--------------------------------------------------------------------------------
/ds/pymongo/requirements.txt:
--------------------------------------------------------------------------------
1 | pymongo
2 |
--------------------------------------------------------------------------------
/ds/seaborn/README.md:
--------------------------------------------------------------------------------
1 | ## Seaborn Basics with Python 3
2 |
3 | This is my revision of the tutorials at
4 |
5 | - The Ultimate Python Seaborn Tutorial: https://elitedatascience.com/python-seaborn-tutorial
6 | - Styling plots with Seaborn: http://jose-coto.com/styling-with-seaborn
7 |
8 | ## Setup
9 |
10 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages:
11 |
12 | ```
13 | $ cd path_to_this folder
14 | $ virtualenv -p python3 venv
15 | $ source venv/bin/activate
16 | $ pip3 install -r requirements.txt
17 | ```
18 |
19 | Run `$ jupyter notebook` to go over the tutorial step-by-step.
20 |
--------------------------------------------------------------------------------
/ds/seaborn/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | matplotlib
3 | pandas
4 | seaborn
5 | scikit-learn
6 |
7 |
--------------------------------------------------------------------------------
/ds/spark-basics/datacamp-notes.md:
--------------------------------------------------------------------------------
1 | https://www.datacamp.com/community/tutorials/apache-spark-tutorial-machine-learning
2 |
3 | ## installation on Mac
4 | https://medium.com/beeranddiapers/installing-apache-spark-on-mac-os-ce416007d79f
5 | ```
6 | brew upgrade && brew update
7 |
8 | brew install --cask java
9 |
10 | java -version
11 | openjdk version "11.0.1" 2018-10-16
12 | OpenJDK Runtime Environment 18.9 (build 11.0.1+13)
13 | OpenJDK 64-Bit Server VM 18.9 (build 11.0.1+13, mixed mode)
14 |
15 | xcode-select --install
16 |
17 | brew install scala
18 |
19 | scala -version
20 | Scala code runner version 2.13.5 -- Copyright 2002-2020, LAMP/EPFL and Lightbend, Inc.
21 |
22 | brew install apache-spark
23 |
24 | spark-shell
25 |
26 | pyspark
27 | ```
28 |
29 |
30 | ```
31 | pip install pyspark
32 | pip install findspark
33 | ```
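A minimal smoke test after the pip install (illustrative; with a pip-installed `pyspark`, `findspark` is optional but harmless):

```python
import findspark
findspark.init()  # locates the Spark installation and adds it to sys.path

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("smoke-test").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
df.show()
spark.stop()
```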
--------------------------------------------------------------------------------
/ds/spark-basics/datacamp-spark.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/spark-basics/datacamp-spark.ipynb
--------------------------------------------------------------------------------
/ds/statsmodels-tutorial/README.md:
--------------------------------------------------------------------------------
1 | ## Python statsmodels tutorial
2 |
3 | This is my code for the tutorial at https://www.statsmodels.org/stable/gettingstarted.html
4 |
5 | ## Setup
6 |
7 | within the tutorial folder:
8 |
9 | ```
10 | python3 -m venv venv
11 | source venv/bin/activate
12 | pip install -r requirements.txt
13 | ```
14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
15 |
16 |
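A tiny OLS example of the kind the getting-started guide walks through (toy data, not the guide's dataset):

```python
import numpy as np
import statsmodels.api as sm

# toy data: y depends linearly on x plus noise
rng = np.random.default_rng(0)
x = rng.normal(size=100)
y = 2.0 + 3.0 * x + rng.normal(scale=0.5, size=100)

X = sm.add_constant(x)  # add the intercept column
results = sm.OLS(y, X).fit()
print(results.summary())
```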
--------------------------------------------------------------------------------
/ds/statsmodels-tutorial/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | linearmodels
3 | matplotlib
4 | statsmodels
5 |
--------------------------------------------------------------------------------
/ds/streamlit/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | This is my revised code for the tutorial at https://towardsdatascience.com/streamlit-101-an-in-depth-introduction-fc8aad9492f2
4 |
5 | Changes:
6 |
7 | - changed data file and make it self-contained in the repo
8 | - added requirements.txt and virtual environment setup
9 |
10 | ## Local Setup
11 |
12 | Python 3 required, see my tutorial to setup Python 3: https://bit.ly/2uX6wAX
13 |
14 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages:
15 |
16 |
17 | ```shell
18 | $ python3 -m venv venv
19 | $ source venv/bin/activate
20 | $ pip install -r requirements.txt
21 | ```
22 |
23 | Run the app locally (Local URL: http://localhost:8501) using terminal: `streamlit run airbnb.py`
24 |
25 | Stop the app by using ctrl + C or closing the terminal
26 |
27 | Deploy the app to the cloud for public access via services such as Streamlit sharing, Heroku, or AWS by following my tutorial at https://github.com/harrywang/streamlit-basics. You can see an example at: https://st-demo-harrywang.herokuapp.com/
28 |
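If you just want to sanity-check the setup before running `airbnb.py`, a minimal Streamlit app looks like this (my own snippet, not part of the tutorial; save it as e.g. `hello.py` and run `streamlit run hello.py`):

```python
import pandas as pd
import streamlit as st

st.title("Hello Streamlit")

# a small widget plus a dataframe to confirm the install works
n = st.slider("Number of rows", 1, 10, 5)
st.dataframe(pd.DataFrame({"x": range(n), "y": [i * i for i in range(n)]}))
```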
--------------------------------------------------------------------------------
/ds/streamlit/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib>=3.4.2
2 | pandas>=1.2.4
3 | plotly>=5.0.0
4 | streamlit>=0.82.0
--------------------------------------------------------------------------------
/ds/superset/README.md:
--------------------------------------------------------------------------------
1 | Install a local copy
2 |
3 | https://superset.apache.org/docs/installation/installing-superset-using-docker-compose
4 |
5 | I made some changes to the instructions found in the doc above:
6 |
7 | - make sure local PostgreSQL is stopped, otherwise Superset will run into a port conflict
8 | - ran into this problem https://github.com/apache/superset/issues/12723 with Docker Desktop set to 2G of memory - increased it to 7.5G
9 |
10 | I did the following:
11 |
12 | - get code `git clone https://github.com/apache/superset.git`
13 | - make sure to use master branch `git checkout master`
14 | - change redis from 3.2 to latest in `docker-compose-non-dev.yml`
15 |
16 | - use `docker-compose -f docker-compose-non-dev.yml up` to start the server
17 | - wait some time; `superset_init` exiting with code 0 is expected - it does not affect the server:
18 |
19 | - login at http://localhost:8088/ using admin/admin
--------------------------------------------------------------------------------
/ds/time-series-additive-model/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | This tutorial is originally published by William Koehrsen at https://towardsdatascience.com/time-series-analysis-in-python-an-introduction-70d5a5b1d52a
3 |
4 | # Setup (Mac)
5 |
6 | - Install python 3 using Homebrew if not done yet: `$ brew install python3`
7 |
8 | - Create python3 virtualenv: `virtualenv -p python3 venv`
9 |
10 | - Activate it: `source venv/bin/activate`
11 |
12 | - Install packages: `$ pip3 install -r requirements.txt`, which includes quandl, seaborn, matplotlib, numpy, pandas, scipy, scikit-learn, and fbprophet
13 |
14 | - Change API key for Quandl: We will access financial data using the Quandl library. Please go to https://www.quandl.com/ and register to get your api_key. You will need to use your own api_key to pull data from the quandl financial library. **You should never put your real API key in the code and push to Github.** We use a local environment variable for the API key: `quandl.ApiConfig.api_key = os.environ.get('QUANDL_KEY')`. You need to add one line `export QUANDL_KEY='your_real_api_key'` to the `~/.bash_profile` file (use `vim` to edit, `source` to execute it, then use `env` to double check):
15 | ```
16 | $ vim ~/.bash_profile
17 | $ source ~/.bash_profile
18 | $ env
19 | ```
20 | **NOTE: You may need to close the Terminal window and restart it for Jupyter Notebook to read the new QUANDL_KEY you just added.**
21 |
22 | # Run
23 |
24 | - Start Virtual Env:
25 | ```
26 | $ virtualenv -p python3 venv
27 | $ source venv/bin/activate
28 | ```
29 | - Run Jupyter: `jupyter notebook`
30 | - Run additive_models.ipynb
31 |
32 | ### TODO: Get rid of the Deprecation Warnings
33 |
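For orientation, the additive-model part of the notebook boils down to a few Prophet calls like these (a sketch with toy data; the notebook uses the GM sales and Quandl stock data instead):

```python
import pandas as pd
from fbprophet import Prophet

# Prophet expects a dataframe with a 'ds' (date) and a 'y' (value) column
dates = pd.date_range("2015-01-01", periods=48, freq="M")
df = pd.DataFrame({"ds": dates, "y": range(48)})

m = Prophet()
m.fit(df)
future = m.make_future_dataframe(periods=12, freq="M")
forecast = m.predict(future)
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail())
```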
--------------------------------------------------------------------------------
/ds/time-series-additive-model/data/Workbook1.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/Workbook1.xlsx
--------------------------------------------------------------------------------
/ds/time-series-additive-model/data/gm_sales.csv:
--------------------------------------------------------------------------------
1 | Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total
2 | 2017,195909,237388,256224,244406,237364,243151,226107,275552,279397,252813,245387,308539,3002237
3 | 2016,203745,227825,252128,259557,240449,255209,267258,256429,249795,258626,252644,319108,3042773
4 | 2015,202786,231378,249875,269055,293097,259346,272512,270480,251310,262993,229296,290230,3082358
5 | 2014,171486,222104,256047,254076,284694,267461,256160,272422,223437,226819,225818,274483,2935007
6 | 2013,194699,224314,245950,237646,252894,264843,234071,275847,187195,226402,212060,230157,2786078
7 | 2012,167962,209306,231052,213387,245256,248750,201237,240520,210245,195764,186505,245733,2595717
8 | 2011,178896,207028,206621,232538,221192,215358,214915,218479,207145,186895,180402,234351,2503820
9 | 2010,145098,138849,185406,183091,222305,194828,199432,184921,172969,183392,168704,223932,2202927
--------------------------------------------------------------------------------
/ds/time-series-additive-model/data/gm_sales.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/gm_sales.xlsx
--------------------------------------------------------------------------------
/ds/time-series-additive-model/data/recessions.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/recessions.csv
--------------------------------------------------------------------------------
/ds/time-series-additive-model/data/recessions.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/recessions.xlsx
--------------------------------------------------------------------------------
/ds/time-series-additive-model/data/tesla_search_terms.csv:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:699ee134222a9b87c1434bd28d7e66ab0a163788d22c84af35386459961ac202
3 | size 880
4 |
--------------------------------------------------------------------------------
/ds/time-series-additive-model/requirements.txt:
--------------------------------------------------------------------------------
1 | quandl
2 | seaborn
3 | matplotlib
4 | numpy
5 | pandas
6 | scipy
7 | scikit-learn
8 | pystan
9 | fbprophet
10 | jupyter
11 |
--------------------------------------------------------------------------------
/ds/time-series-basics/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | Time Series Analysis with Pandas: https://www.dataquest.io/blog/tutorial-time-series-analysis-with-pandas/
4 |
5 | ## Setup
6 |
7 | Tested with Python 3.6 via virtual environment. Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages:
8 |
9 |
10 | ```shell
11 | $ python3.6 -m venv venv
12 | $ source venv/bin/activate
13 | $ pip install -r requirements.txt
14 | ```
15 |
16 | Run `$ jupyter notebook` to go over the tutorial step-by-step.
17 |
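A tiny taste of what the tutorial covers, i.e. datetime indexing, resampling, and rolling windows in pandas (toy data, not the tutorial's dataset):

```python
import numpy as np
import pandas as pd

# a daily series indexed by a DatetimeIndex
idx = pd.date_range("2020-01-01", periods=90, freq="D")
s = pd.Series(np.random.default_rng(0).normal(size=90), index=idx)

print(s.loc["2020-02"].head())            # partial-string datetime indexing
print(s.resample("W").mean().head())      # weekly means
print(s.rolling(window=7).mean().tail())  # 7-day rolling average
```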
--------------------------------------------------------------------------------
/ds/time-series-basics/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter==1.0.0
2 | pandas==1.0.0
3 | matplotlib==3.1.2
4 | seaborn==0.10.0
5 |
--------------------------------------------------------------------------------
/ml/attention/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | [](https://colab.research.google.com/drive/1v_S1r-iPuuVAkqVo8hAL6-OsQ-hZhGx8)
4 |
5 |
6 | I combined and revised the following tutorials:
7 | - https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/
8 | - https://towardsdatascience.com/light-on-math-ml-attention-with-keras-dc8dbc1fad39
9 |
10 | ## Setup
11 |
12 | within the tutorial folder:
13 |
14 | ```
15 | python3 -m venv venv
16 | source venv/bin/activate
17 | pip install -r requirements.txt
18 | ```
19 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
20 |
21 |
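The step both posts build up to is computing attention weights over the encoder states; here is dot-product attention in plain NumPy as one common formulation (my own illustration, not code from either post):

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

rng = np.random.default_rng(0)
Q = rng.normal(size=(4, 8))  # 4 query vectors (decoder states), dimension 8
K = rng.normal(size=(6, 8))  # 6 key vectors (encoder states)
V = rng.normal(size=(6, 8))  # 6 value vectors

scores = Q @ K.T / np.sqrt(K.shape[-1])  # similarity of each query to each key
weights = softmax(scores, axis=-1)       # attention weights sum to 1 over the keys
context = weights @ V                    # weighted sum of the values
print(context.shape)  # (4, 8)
```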
--------------------------------------------------------------------------------
/ml/attention/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | matplotlib==3.1.0 # latest version breaks the seaborn heatmap
3 | seaborn
4 |
--------------------------------------------------------------------------------
/ml/autogluon/README.md:
--------------------------------------------------------------------------------
1 | ## Kaggle Kernel
2 |
3 | You can run this kernel directly at Kaggle.com: https://www.kaggle.com/harrywang/housing-price-prediction
4 |
5 | ## Run Locally
6 |
7 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages:
8 |
9 | ```
10 | $ cd path_to_this folder
11 | $ virtualenv -p python3 venv
12 | $ source venv/bin/activate
13 | $ pip3 install -r requirements.txt
14 | ```
15 |
16 | Run `$ jupyter notebook` to go over the tutorial step-by-step.
17 |
18 | ## Source
19 |
20 | This is the dataset used in this book: https://github.com/ageron/handson-ml/tree/master/datasets/housing to illustrate a sample end-to-end ML project workflow (pipeline). This is a great book - I highly recommend it!
21 |
22 | The data is based on California Census in 1990.
23 |
24 | ### About the Data (from the book):
25 |
26 | "This dataset is a modified version of the California Housing dataset available from Luís Torgo's page (University of Porto). Luís Torgo obtained it from the StatLib repository (which is closed now). The dataset may also be downloaded from StatLib mirrors.
27 |
28 | The following is the description from the book author:
29 |
30 | This dataset appeared in a 1997 paper titled Sparse Spatial Autoregressions by Pace, R. Kelley and Ronald Barry, published in the Statistics and Probability Letters journal. They built it using the 1990 California census data. It contains one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people).
31 |
32 | The dataset in this directory is almost identical to the original, with two differences:
33 | 207 values were randomly removed from the total_bedrooms column, so we can discuss what to do with missing data.
34 | An additional categorical attribute called ocean_proximity was added, indicating (very roughly) whether each block group is near the ocean, near the Bay area, inland or on an island. This allows discussing what to do with categorical data.
35 | Note that the block groups are called "districts" in the Jupyter notebooks, simply because in some contexts the name "block group" was confusing."
36 |
37 | ### About the Data (From Luís Torgo page):
38 | http://www.dcc.fc.up.pt/%7Eltorgo/Regression/cal_housing.html
39 |
40 | This is a dataset obtained from the StatLib repository. Here is the included description:
41 |
42 | "We collected information on the variables using all the block groups in California from the 1990 Cens us. In this sample a block group on average includes 1425.5 individuals living in a geographically co mpact area. Naturally, the geographical area included varies inversely with the population density. W e computed distances among the centroids of each block group as measured in latitude and longitude. W e excluded all the block groups reporting zero entries for the independent and dependent variables. T he final data contained 20,640 observations on 9 variables. The dependent variable is ln(median house value)."
43 |
44 |
45 | ### End-to-End ML Project Steps (Chapter 2 of the book)
46 |
47 | 1. Look at the big picture
48 | 2. Get the data
49 | 3. Discover and visualize the data to gain insights
50 | 4. Prepare the data for Machine Learning algorithms
51 | 5. Select a model and train it
52 | 6. Fine-tune your model
53 | 7. Present your solution
54 | 8. Launch, monitor, and maintain your system
55 |
56 | ## The 10-Step Machine Learning Project Workflow (My Version)
57 |
58 | 1. Define the business objective
59 | 2. Make sense of the data from a high level
60 | - data types (number, text, object, etc.)
61 | - continuous/discrete
62 | - basic stats (min, max, std, median, etc.) using boxplot
63 | - frequency via histogram
64 | - scales and distributions of different features
65 | 3. Create the training and test sets using proper sampling methods, e.g., random vs. stratified
66 | 4. Correlation analysis (pair-wise and attribute combinations)
67 | 5. Data cleaning (missing data, outliers, data errors)
68 | 6. Data transformation via pipelines (categorical text to number using one hot encoding, feature scaling via normalization/standardization, feature combinations)
69 | 7. Train and cross validate different models and select the most promising one (Linear Regression, Decision Tree, and Random Forest were tried in this tutorial)
70 | 8. Fine-tune the model by trying different combinations of hyperparameters
71 | 9. Evaluate the model with best estimators in the test set
72 | 10. Launch, monitor, and refresh the model and system
73 |
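For reference, the AutoGluon part of the notebook boils down to a few lines like these (a sketch using a recent `autogluon.tabular` API; the file names and label column are assumptions based on the housing data described above):

```python
from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset("housing_train.csv")  # assumed file name
test_data = TabularDataset("housing_test.csv")    # assumed file name

# fit a stack of models and pick the target column to predict
predictor = TabularPredictor(label="median_house_value", path="agModels-housing").fit(train_data)
print(predictor.leaderboard(test_data))
```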
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/learner.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/learner.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/CatBoost/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/CatBoost/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/ExtraTreesEntr/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/ExtraTreesEntr/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/ExtraTreesGini/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/ExtraTreesGini/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/KNeighborsDist/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/KNeighborsDist/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/KNeighborsUnif/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/KNeighborsUnif/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/LightGBM/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBM/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/LightGBMLarge/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBMLarge/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/LightGBMXT/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBMXT/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model-internals.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model-internals.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/RandomForestEntr/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/RandomForestEntr/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/RandomForestGini/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/RandomForestGini/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/model_template.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/model_template.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/oof.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/oof.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/XGBoost/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/XGBoost/model.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/models/trainer.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/trainer.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/predictor.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/predictor.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/utils/data/X.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/X.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/utils/data/X_val.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/X_val.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/utils/data/y.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/y.pkl
--------------------------------------------------------------------------------
/ml/autogluon/agModels-predictClass/utils/data/y_val.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/y_val.pkl
--------------------------------------------------------------------------------
/ml/autogluon/input/anscombe.csv:
--------------------------------------------------------------------------------
1 | dataset,x,y
2 | I,10.0,8.04
3 | I,8.0,6.95
4 | I,13.0,7.58
5 | I,9.0,8.81
6 | I,11.0,8.33
7 | I,14.0,9.96
8 | I,6.0,7.24
9 | I,4.0,4.26
10 | I,12.0,10.84
11 | I,7.0,4.82
12 | I,5.0,5.68
13 | II,10.0,9.14
14 | II,8.0,8.14
15 | II,13.0,8.74
16 | II,9.0,8.77
17 | II,11.0,9.26
18 | II,14.0,8.1
19 | II,6.0,6.13
20 | II,4.0,3.1
21 | II,12.0,9.13
22 | II,7.0,7.26
23 | II,5.0,4.74
24 | III,10.0,7.46
25 | III,8.0,6.77
26 | III,13.0,12.74
27 | III,9.0,7.11
28 | III,11.0,7.81
29 | III,14.0,8.84
30 | III,6.0,6.08
31 | III,4.0,5.39
32 | III,12.0,8.15
33 | III,7.0,6.42
34 | III,5.0,5.73
35 | IV,8.0,6.58
36 | IV,8.0,5.76
37 | IV,8.0,7.71
38 | IV,8.0,8.84
39 | IV,8.0,8.47
40 | IV,8.0,7.04
41 | IV,8.0,5.25
42 | IV,19.0,12.5
43 | IV,8.0,5.56
44 | IV,8.0,7.91
45 | IV,8.0,6.89
46 |
--------------------------------------------------------------------------------
/ml/autogluon/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | autogluon
3 |
--------------------------------------------------------------------------------
/ml/clearml-server/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | Setup ClearML server locally on Mac: https://allegro.ai/docs/deploying_trains/trains_server_linux_mac/
4 |
5 | For how to use ClearML, check out https://github.com/harrywang/tutorial-buffet/tree/master/clearml
6 |
7 | Note: I set 6G for Docker
8 |
9 | - Make sure docker is running correctly: `docker run hello-world`
10 | - Create the mounting folder: `sudo mkdir /opt/trains`, then open the Docker app. On the File Sharing tab, add `/opt/trains`.
11 |
12 | **NOTE: you have to restart docker app after this step!!**
13 |
14 |
15 | - By default, ElasticSearch is mounted at `/opt/trains/data/elastic_7`; create the folder and give it write permission as follows:
16 |
17 | ```
18 | $ sudo mkdir -p /opt/trains/data/elastic_7
19 | $ chmod 777 /opt/trains/data/elastic_7
20 | ```
21 |
22 | - Grant Docker access to the folder (the command depends on your operating system):
23 | ```
24 | sudo chown -R $(whoami):staff /opt/trains
25 | ```
26 |
27 | - download `docker-compose.yml` to the `/opt/trains` folder:
28 | ```
29 | sudo curl https://raw.githubusercontent.com/allegroai/trains-server/master/docker-compose.yml -o /opt/trains/docker-compose.yml
30 | ```
31 | - Start the server: `docker-compose -f /opt/trains/docker-compose.yml up -d`
32 |
33 | Then go to http://localhost:8080/ to login
34 |
35 |
36 |
37 | - Restart:
38 |
39 | ```
40 | docker-compose -f /opt/trains/docker-compose.yml down
41 | docker-compose -f /opt/trains/docker-compose.yml up -d
42 | ```
43 |
44 |
--------------------------------------------------------------------------------
/ml/clearml/matplotlib/matplotlib_example.py:
--------------------------------------------------------------------------------
1 | # TRAINS - Example of Matplotlib and Seaborn integration and reporting
2 | #
3 | import matplotlib
4 | matplotlib.use('agg') # use agg instead of tkinter
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | import seaborn as sns
8 | from clearml import Task
9 |
10 |
11 | task = Task.init(project_name='examples', task_name='Matplotlib example by Harry')
12 |
13 | # Create a plot
14 | N = 50
15 | x = np.random.rand(N)
16 | y = np.random.rand(N)
17 | colors = np.random.rand(N)
18 | area = (30 * np.random.rand(N))**2 # 0 to 15 point radii
19 | plt.scatter(x, y, s=area, c=colors, alpha=0.5)
20 | # Plot will be reported automatically
21 | plt.show()
22 |
23 | # Alternatively, in order to report the plot with a more meaningful title/series and iteration number
24 | area = (40 * np.random.rand(N))**2
25 | plt.scatter(x, y, s=area, c=colors, alpha=0.5)
26 | task.logger.report_matplotlib_figure(title="My Plot Title", series="My Plot Series", iteration=10, figure=plt)
27 | plt.show()
28 |
29 | # Create another plot - with a name
30 | x = np.linspace(0, 10, 30)
31 | y = np.sin(x)
32 | plt.plot(x, y, 'o', color='black')
33 | # Plot will be reported automatically
34 | plt.show()
35 |
36 | # Create image plot
37 | m = np.eye(256, 256, dtype=np.uint8)
38 | plt.imshow(m)
39 | # Plot will be reported automatically
40 | plt.show()
41 |
42 | # Create image plot - with a name
43 | m = np.eye(256, 256, dtype=np.uint8)
44 | plt.imshow(m)
45 | plt.title('Image Title')
46 | # Plot will be reported automatically
47 | plt.show()
48 |
49 | sns.set(style="darkgrid")
50 | # Load an example dataset with long-form data
51 | fmri = sns.load_dataset("fmri")
52 | # Plot the responses for different events and regions
53 | sns.lineplot(x="timepoint", y="signal",
54 | hue="region", style="event",
55 | data=fmri)
56 | # Plot will be reported automatically
57 | plt.show()
58 |
59 | print('This is a Matplotlib & Seaborn example')
60 |
--------------------------------------------------------------------------------
/ml/clearml/matplotlib/mlp_grouped_errorbar.py:
--------------------------------------------------------------------------------
1 | # Grouped bar chart with percentage change bars and labels
2 | import matplotlib
3 | matplotlib.use('agg') # use agg instead of tkinter
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | plt.style.use('seaborn')
7 |
8 | from clearml import Task
9 | task = Task.init(project_name='examples', task_name='Matplotlib GroupedBar example by Harry')
10 |
11 | men_means = np.array([20, 35, 30, 35, 27])
12 | women_means = np.array([25, 32, 34, 20, 25])
13 |
14 | ind = np.arange(len(men_means)) # the x locations for the groups
15 | width = 0.35 # the width of the bars
16 |
17 | fig, ax = plt.subplots()
18 |
19 | rects1 = ax.bar(ind - width/2, men_means, width,
20 | label='Men')
21 | rects2 = ax.bar(ind + width/2, women_means, width,
22 | label='Women')
23 |
24 | # Add some text for labels, title and custom x-axis tick labels, etc.
25 | ax.set_ylabel('Scores')
26 | ax.set_title('Scores by group and gender')
27 | ax.set_xticks(ind)
28 | ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5'))
29 | ax.legend()
30 |
31 |
32 | def autolabel(rects, xpos='center'):
33 | """
34 | Attach a text label above each bar in *rects*, displaying its height.
35 |
36 | *xpos* indicates which side to place the text w.r.t. the center of
37 | the bar. It can be one of the following {'center', 'right', 'left'}.
38 |
39 | ha: horizontal alignment
40 | """
41 |
42 | ha = {'center': 'center', 'right': 'left', 'left': 'right'}
43 | offset = {'center': 0, 'right': 1, 'left': -1}
44 |
45 | for rect in rects:
46 | height = rect.get_height()
47 | ax.annotate('{}'.format(height),
48 | xy=(rect.get_x() + rect.get_width() / 2, height),
49 | xytext=(offset[xpos]*3, 3), # use 3 points offset
50 | textcoords="offset points", # in both directions
51 | ha=ha[xpos], va='bottom')
52 |
53 |
54 | autolabel(rects1)
55 | autolabel(rects2)
56 |
57 | # custom error bar
58 | diff = (men_means - women_means)/2
59 | change_percentage = np.abs((men_means - women_means)/men_means)
60 | errorbar_y = men_means - diff # the y of the error bar
61 | errorbar_x_offset = 0.1
62 |
63 |
64 | # show the small caps on error bar ends:
65 | # capsize=3 (bar width) AND markeredgewidth=1 (bar width - default is 0)
66 | # elinewidth=1 is the error bar line width
67 | ax.errorbar(ind + width + errorbar_x_offset, errorbar_y,
68 | yerr=diff, fmt='none', elinewidth=1,
69 | capsize=3, markeredgewidth=1)
70 |
71 | # show the change percentage labels
72 |
73 | errorbar_text_offset = 0.625 # the offset from the men's bar x location
74 |
75 | for i in range(len(rects1)):
76 | # find the higher bar to determine label height
77 | height1 = rects1[i].get_height()
78 | height2 = rects2[i].get_height()
79 | height = height1 if height1 > height2 else height2
80 |
81 | # add the percentage change labels
82 | ax.annotate(f'{change_percentage[i]:.1%}', # the text
83 | xy=(ind[i] - width/2 + 0.625, height), # x y for the text
84 | xytext=(0, 3), # 0 point horizotal and 3 points vertical offsets
85 | textcoords="offset points", # in both directions
86 | ha='center', # horizontal alignment
87 | va='bottom') # vertical alignment
88 |
89 | fig.tight_layout()
90 |
91 | plt.show()
--------------------------------------------------------------------------------
/ml/clearml/matplotlib/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib >= 3.1.1 ; python_version >= '3.6'
2 | matplotlib >= 2.2.4 ; python_version < '3.6'
3 | seaborn
4 | clearml
--------------------------------------------------------------------------------
/ml/clearml/pytorch/manual_model_upload.py:
--------------------------------------------------------------------------------
1 | # TRAINS - Example of manual model configuration and uploading
2 | #
3 | import os
4 | from tempfile import gettempdir
5 |
6 | import torch
7 | from trains import Task
8 |
9 |
10 | task = Task.init(project_name='examples', task_name='Model configuration and upload')
11 |
12 | # create a model
13 | model = torch.nn.Module()  # an empty placeholder model for this example
14 |
15 | # Connect a local configuration file
16 | config_file = os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json')
17 | config_file = task.connect_configuration(config_file)
18 | # then read configuration as usual, the backend will contain a copy of it.
19 | # later when executing remotely, the returned `config_file` will be a temporary file
20 | # containing a new copy of the configuration retrieved from the backend
21 | # # model_config_dict = json.load(open(config_file, 'rt'))
22 |
23 | # Or Store dictionary of definition for a specific network design
24 | model_config_dict = {
25 | 'value': 13.37,
26 | 'dict': {'sub_value': 'string', 'sub_integer': 11},
27 | 'list_of_ints': [1, 2, 3, 4],
28 | }
29 | model_config_dict = task.connect_configuration(model_config_dict)
30 |
31 | # We now update the dictionary after connecting it, and the changes will be tracked as well.
32 | model_config_dict['new value'] = 10
33 | model_config_dict['value'] *= model_config_dict['new value']
34 |
35 | # store the label enumeration of the training model
36 | labels = {'background': 0, 'cat': 1, 'dog': 2}
37 | task.connect_label_enumeration(labels)
38 |
39 | # storing the model, it will have the task network configuration and label enumeration
40 | print('Any model stored from this point onwards, will contain both model_config and label_enumeration')
41 |
42 | torch.save(model, os.path.join(gettempdir(), "model.pt"))
43 | print('Model saved')
44 |
--------------------------------------------------------------------------------
/ml/clearml/pytorch/notebooks/audio/README.md:
--------------------------------------------------------------------------------
1 | The `audio_classifier_UrbanSound8K.ipynb` example uses a small dataset based on [UrbanSound8K dataset](https://urbansounddataset.weebly.com/urbansound8k.html).
--------------------------------------------------------------------------------
/ml/clearml/pytorch/notebooks/audio/audio_preprocessing_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "! pip install -U pip\n",
12 | "! pip install -U torch==1.5.1\n",
13 | "! pip install -U torchaudio==0.5.1\n",
14 | "! pip install -U matplotlib==3.2.1\n",
15 | "! pip install -U trains>=0.16.1\n",
16 | "! pip install -U tensorboard==2.2.1"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import os\n",
26 | "import torch\n",
27 | "import torchaudio\n",
28 | "from torch.utils.tensorboard import SummaryWriter\n",
29 | "import matplotlib.pyplot as plt\n",
30 | "\n",
31 | "from trains import Task\n",
32 | "\n",
33 | "%matplotlib inline"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "task = Task.init(project_name='Audio Example', task_name='data pre-processing')\n",
43 | "configuration_dict = {'number_of_samples': 3}\n",
44 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
45 | "print(configuration_dict) # printing actual configuration (after override in remote mode)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "tensorboard_writer = SummaryWriter('./tensorboard_logs')"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "scrolled": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "if not os.path.isdir('./data'):\n",
66 | " os.mkdir('./data')\n",
67 | "yesno_data = torchaudio.datasets.YESNO('./data', download=True)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "def plot_signal(signal, title, cmap=None):\n",
77 | " plt.figure()\n",
78 | " if signal.ndim == 1:\n",
79 | " plt.plot(signal)\n",
80 | " else:\n",
81 | " plt.imshow(signal, cmap=cmap) \n",
82 | " plt.title(title)\n",
83 | " plt.show()"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "pycharm": {
91 | "name": "#%%\n"
92 | },
93 | "scrolled": true
94 | },
95 | "outputs": [],
96 | "source": [
97 | "fixed_sample_rate = 22050\n",
98 | "for n in range(configuration_dict.get('number_of_samples', 3)):\n",
99 | " audio, sample_rate, labels = yesno_data[n]\n",
100 | " tensorboard_writer.add_audio('Audio samples/{}'.format(n), audio, n, sample_rate)\n",
101 | " \n",
102 | " resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=fixed_sample_rate)\n",
103 | " melspectogram_transform = torchaudio.transforms.MelSpectrogram(sample_rate=fixed_sample_rate, n_mels=128)\n",
104 | " \n",
105 | " audio_mono = torch.mean(resample_transform(audio), dim=0, keepdim=True)\n",
106 | " plot_signal(audio_mono[0,:], 'Original waveform')\n",
107 | " \n",
108 | " melspectogram = melspectogram_transform(audio_mono)\n",
109 | " plot_signal(melspectogram.squeeze().numpy(), 'Mel spectogram', 'hot')\n",
110 | " plot_signal(torchaudio.transforms.AmplitudeToDB()(melspectogram).squeeze().numpy(), 'Mel spectogram DB', 'hot')"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.7.4"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 4
135 | }
136 |
--------------------------------------------------------------------------------
/ml/clearml/pytorch/notebooks/table/download_and_split.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "! pip install -U pip\n",
10 | "! pip install -U trains==0.16.2rc0\n",
11 | "! pip install -U pandas==1.0.4\n",
12 | "! pip install -U scikit-learn==0.23.1\n",
13 | "! pip install -U pathlib2==2.3.5"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import pandas as pd\n",
23 | "from pathlib2 import Path\n",
24 | "from sklearn.model_selection import train_test_split\n",
25 | "\n",
26 | "from trains import Task"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "task = Task.init(project_name='Tabular Example', task_name='Download and split tabular dataset')\n",
36 | "logger = task.get_logger()\n",
37 | "configuration_dict = {'test_size': 0.1, 'split_random_state': 0}\n",
38 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
39 | "print(configuration_dict) # printing actual configuration (after override in remote mode)"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "# **Downloading**"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes)\n",
56 | "# and save it to your cloud storage or your mounted local storage\n",
57 | "# If the data is on your cloud storage, you can use trains' storage manager to get a local copy of it:\n",
58 | "# from trains.storage import StorageManager\n",
59 | "# path_to_ShelterAnimal = StorageManager.get_local_copy(\"https://allegro-datasets.s3.amazonaws.com/trains/UrbanSound8K.zip\", \n",
60 | "# extract_archive=True)\n",
61 | "path_to_ShelterAnimal = '/home/sam/Datasets/shelter-animal-outcomes'"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "train_set = pd.read_csv(Path(path_to_ShelterAnimal) / 'train.csv')\n",
71 | "logger.report_table(title='Trainset - raw',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "# **Splitting to train and val**"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "X = train_set.drop(columns= ['OutcomeType'])\n",
88 | "Y = train_set['OutcomeType']\n",
89 | "X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=configuration_dict.get('test_size', 0.1), \n",
90 | " random_state=configuration_dict.get('split_random_state', 0))"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "train_df = X_train.join(Y_train)\n",
100 | "val_df = X_val.join(Y_val)"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "task.upload_artifact('train_data', artifact_object=train_df)\n",
110 | "task.upload_artifact('val_data', artifact_object=val_df)"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.7.4"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 4
135 | }
136 |
--------------------------------------------------------------------------------
/ml/clearml/pytorch/notebooks/table/pick_best_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "! pip install -U pip\n",
10 | "! pip install -U trains==0.16.2rc0"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "from trains import Task, OutputModel"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "task = Task.init(project_name='Tabular Example', task_name='pick best model')\n",
29 | "configuration_dict = {'train_tasks_ids': ['c9bff3d15309487a9e5aaa00358ff091', 'c9bff3d15309487a9e5aaa00358ff091']}\n",
30 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
31 | "print(configuration_dict) # printing actual configuration (after override in remote mode)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "results = {}\n",
41 | "for task_id in configuration_dict.get('train_tasks_ids'):\n",
42 | " train_task = Task.get_task(task_id)\n",
43 | " results[task_id] = train_task.get_last_scalar_metrics()['accuracy']['total']['last']"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "print(results)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "best_model_task_id = max(results.items(), key=lambda x: x[1])[0]\n",
62 | "best_model_id = Task.get_task(best_model_task_id).output_model_id"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "OutputModel(base_model_id=best_model_id)"
72 | ]
73 | }
74 | ],
75 | "metadata": {
76 | "kernelspec": {
77 | "display_name": "Python 3",
78 | "language": "python",
79 | "name": "python3"
80 | },
81 | "language_info": {
82 | "codemirror_mode": {
83 | "name": "ipython",
84 | "version": 3
85 | },
86 | "file_extension": ".py",
87 | "mimetype": "text/x-python",
88 | "name": "python",
89 | "nbconvert_exporter": "python",
90 | "pygments_lexer": "ipython3",
91 | "version": "3.7.4"
92 | }
93 | },
94 | "nbformat": 4,
95 | "nbformat_minor": 4
96 | }
97 |
--------------------------------------------------------------------------------
/ml/clearml/pytorch/notebooks/table/tabular_ml_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# pip install with locked versions\n",
10 | "! pip install -U pip\n",
11 | "! pip install -U trains==0.16.2rc0"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from trains import Task\n",
21 | "from trains.automation.controller import PipelineController"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "task = Task.init(project_name='Tabular Example', task_name='tabular training pipeline', task_type=Task.TaskTypes.controller)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "pipe = PipelineController(default_execution_queue='dan_queue', add_pipeline_tags=True)\n",
40 | "pipe.add_step(name='preprocessing_1', base_task_project='Tabular Example', base_task_name='tabular preprocessing',\n",
41 | " parameter_override={'General/data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n",
42 | " 'General/fill_categorical_NA': 'True',\n",
43 | " 'General/fill_numerical_NA': 'True'})\n",
44 | "pipe.add_step(name='preprocessing_2', base_task_project='Tabular Example', base_task_name='tabular preprocessing',\n",
45 | " parameter_override={'General/data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n",
46 | " 'General/fill_categorical_NA': 'False',\n",
47 | " 'General/fill_numerical_NA': 'True'})\n",
48 | " \n",
49 | "pipe.add_step(name='train_1', parents=['preprocessing_1'],\n",
50 | " base_task_project='Tabular Example', base_task_name='tabular prediction',\n",
51 | " parameter_override={'General/data_task_id': '${preprocessing_1.id}'})\n",
52 | "pipe.add_step(name='train_2', parents=['preprocessing_2'],\n",
53 | " base_task_project='Tabular Example', base_task_name='tabular prediction',\n",
54 | " parameter_override={'General/data_task_id': '${preprocessing_2.id}'})\n",
55 | " \n",
56 | "pipe.add_step(name='pick_best', parents=['train_1', 'train_2'],\n",
57 | " base_task_project='Tabular Example', base_task_name='pick best model',\n",
58 | " parameter_override={'General/train_tasks_ids': '[${train_1.id}, ${train_2.id}]'}) "
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "# Starting the pipeline (in the background)\n",
68 | "pipe.start()\n",
69 | "# Wait until pipeline terminates\n",
70 | "pipe.wait()\n",
71 | "# cleanup everything\n",
72 | "pipe.stop()"
73 | ]
74 | }
75 | ],
76 | "metadata": {
77 | "kernelspec": {
78 | "display_name": "Python 3",
79 | "language": "python",
80 | "name": "python3"
81 | },
82 | "language_info": {
83 | "codemirror_mode": {
84 | "name": "ipython",
85 | "version": 3
86 | },
87 | "file_extension": ".py",
88 | "mimetype": "text/x-python",
89 | "name": "python",
90 | "nbconvert_exporter": "python",
91 | "pygments_lexer": "ipython3",
92 | "version": "3.7.4"
93 | }
94 | },
95 | "nbformat": 4,
96 | "nbformat_minor": 4
97 | }
98 |
--------------------------------------------------------------------------------
/ml/clearml/pytorch/pytorch_tensorboardx.py:
--------------------------------------------------------------------------------
1 | ../tensorboardx/pytorch_tensorboardX.py
--------------------------------------------------------------------------------
/ml/clearml/pytorch/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | tensorboardX
3 | tensorboard>=1.14.0
4 | torch>=1.1.0
5 | torchvision>=0.3.0
6 | clearml
--------------------------------------------------------------------------------
/ml/clearml/pytorch/tensorboard_toy_pytorch.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tempfile import gettempdir
3 |
4 | import numpy as np
5 | from PIL import Image
6 | from torch.utils.tensorboard import SummaryWriter
7 |
8 | from trains import Task
9 | task = Task.init(project_name='examples', task_name='pytorch tensorboard toy example')
10 |
11 |
12 | writer = SummaryWriter(log_dir=os.path.join(gettempdir(), 'tensorboard_logs'))
13 |
14 | # convert to 4d [batch, col, row, RGB-channels]
15 | image_open = Image.open(os.path.join("..", "..", "reporting", "data_samples", "picasso.jpg"))
16 | image = np.asarray(image_open)
17 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis]
18 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2)
19 | image_rgba = image_rgba[np.newaxis, :, :, :]
20 | image = image[np.newaxis, :, :, :]
21 |
22 | writer.add_image("test/first", image[0], dataformats='HWC')
23 | writer.add_image("test_gray/second", image_gray[0], dataformats='HWC')
24 | writer.add_image("test_rgba/third", image_rgba[0], dataformats='HWC')
25 | # writer.add_image("image/first_series", image, max_outputs=10)
26 | # writer.add_image("image_gray/second_series", image_gray, max_outputs=10)
27 | # writer.add_image("image_rgba/third_series", image_rgba, max_outputs=10)
28 |
29 | print('Done!')
30 |
--------------------------------------------------------------------------------
/ml/clearml/requirements.txt:
--------------------------------------------------------------------------------
1 | PyJWT==1.7.1
2 |
--------------------------------------------------------------------------------
/ml/clearml/scikit-learn/model-harry.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/clearml/scikit-learn/model-harry.pkl
--------------------------------------------------------------------------------
/ml/clearml/scikit-learn/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/clearml/scikit-learn/model.pkl
--------------------------------------------------------------------------------
/ml/clearml/scikit-learn/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib>=0.13.2
2 | matplotlib >= 3.1.1 ; python_version >= '3.6'
3 | matplotlib >= 2.2.4 ; python_version < '3.6'
4 | scikit-learn
5 | clearml
--------------------------------------------------------------------------------
/ml/clearml/scikit-learn/sklearn_joblib_example.py:
--------------------------------------------------------------------------------
1 | try:
2 | import joblib
3 | except ImportError:
4 | from sklearn.externals import joblib
5 |
6 | from sklearn import datasets
7 | from sklearn.linear_model import LogisticRegression
8 | from sklearn.model_selection import train_test_split
9 | import numpy as np
10 | import matplotlib
11 | matplotlib.use('agg') # use agg instead of tkinter
12 | import matplotlib.pyplot as plt
13 |
14 |
15 |
16 | from clearml import Task
17 |
18 | task = Task.init(project_name="examples", task_name="scikit-learn joblib example")
19 |
20 | iris = datasets.load_iris()
21 | X = iris.data
22 | y = iris.target
23 |
24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
25 |
26 | model = LogisticRegression(solver='liblinear', multi_class='auto') # sklearn LogisticRegression class
27 | model.fit(X_train, y_train)
28 |
29 | joblib.dump(model, 'model-harry.pkl', compress=True)
30 |
31 | loaded_model = joblib.load('model.pkl')
32 | result = loaded_model.score(X_test, y_test)
33 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
34 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
35 | h = .02 # step size in the mesh
36 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
37 | plt.figure(1, figsize=(4, 3))
38 |
39 | plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)
40 | plt.xlabel('Sepal length')
41 | plt.ylabel('Sepal width')
42 |
43 | plt.xlim(xx.min(), xx.max())
44 | plt.ylim(yy.min(), yy.max())
45 | plt.xticks(())
46 | plt.yticks(())
47 |
48 | plt.show()
49 |
--------------------------------------------------------------------------------
/ml/clearml/tensorflow/legacy/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard>=1.14.0
2 | tensorflow>=1.14.0
3 |
--------------------------------------------------------------------------------
/ml/clearml/tensorflow/legacy/tensorboard_toy.py:
--------------------------------------------------------------------------------
1 | # TRAINS - Example of tensorboard with tensorflow (without any actual training)
2 | #
3 | import os
4 | from tempfile import gettempdir
5 |
6 | import tensorflow as tf
7 | import numpy as np
8 | from PIL import Image
9 |
10 | from trains import Task
11 | task = Task.init(project_name='examples', task_name='tensorboard toy example')
12 |
13 |
14 | k = tf.placeholder(tf.float32)
15 |
16 | # Make a normal distribution, with a shifting mean
17 | mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1)
18 | # Record that distribution into a histogram summary
19 | tf.summary.histogram("normal/moving_mean", mean_moving_normal)
20 | tf.summary.scalar("normal/value", mean_moving_normal[-1])
21 |
22 | # Make a normal distribution with shrinking variance
23 | variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k))
24 | # Record that distribution too
25 | tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal)
26 | tf.summary.scalar("normal/variance_shrinking_normal", variance_shrinking_normal[-1])
27 |
28 | # Let's combine both of those distributions into one dataset
29 | normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0)
30 | # We add another histogram summary to record the combined distribution
31 | tf.summary.histogram("normal/bimodal", normal_combined)
32 | tf.summary.scalar("normal/normal_combined", normal_combined[0])
33 |
34 | # Add a gamma distribution
35 | gamma = tf.random_gamma(shape=[1000], alpha=k)
36 | tf.summary.histogram("gamma", gamma)
37 |
38 | # And a poisson distribution
39 | poisson = tf.random_poisson(shape=[1000], lam=k)
40 | tf.summary.histogram("poisson", poisson)
41 |
42 | # And a uniform distribution
43 | uniform = tf.random_uniform(shape=[1000], maxval=k*10)
44 | tf.summary.histogram("uniform", uniform)
45 |
46 | # Finally, combine everything together!
47 | all_distributions = [mean_moving_normal, variance_shrinking_normal, gamma, poisson, uniform]
48 | all_combined = tf.concat(all_distributions, 0)
49 | tf.summary.histogram("all_combined", all_combined)
50 |
51 | # Log text value
52 | tf.summary.text("this is a test", tf.make_tensor_proto("This is the content", dtype=tf.string))
53 |
54 | # convert to 4d [batch, col, row, RGB-channels]
55 | image_open = Image.open(os.path.join("..", "..", "..", "reporting", "data_samples", "picasso.jpg"))
56 | image = np.asarray(image_open)
57 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis]
58 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2)
59 | image_rgba = image_rgba[np.newaxis, :, :, :]
60 | image = image[np.newaxis, :, :, :]
61 |
62 | tf.summary.image("test", image, max_outputs=10)
63 | tf.summary.image("test_gray", image_gray, max_outputs=10)
64 | tf.summary.image("test_rgba", image_rgba, max_outputs=10)
65 |
66 | # Setup a session and summary writer
67 | summaries = tf.summary.merge_all()
68 | sess = tf.Session()
69 |
70 | logger = task.get_logger()
71 |
72 | # Use original FileWriter for comparison, run:
73 | # % tensorboard --logdir=/tmp/histogram_example
74 | writer = tf.summary.FileWriter(os.path.join(gettempdir(), "histogram_example"))
75 |
76 | # Setup a loop and write the summaries to disk
77 | N = 40
78 | for step in range(N):
79 | k_val = step/float(N)
80 | summ = sess.run(summaries, feed_dict={k: k_val})
81 | writer.add_summary(summ, global_step=step)
82 |
83 | print('Done!')
84 |
--------------------------------------------------------------------------------
/ml/clearml/tensorflow/manual_model_upload.py:
--------------------------------------------------------------------------------
1 | # TRAINS - Example of manual model configuration and uploading
2 | #
3 | import os
4 | import tempfile
5 |
6 | import tensorflow as tf
7 | from trains import Task
8 |
9 | task = Task.init(project_name='examples', task_name='Model configuration and upload')
10 |
11 | model = tf.Module()
12 |
13 | # Connect a local configuration file
14 | config_file = os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json')
15 | config_file = task.connect_configuration(config_file)
16 | # then read configuration as usual, the backend will contain a copy of it.
17 | # later when executing remotely, the returned `config_file` will be a temporary file
18 | # containing a new copy of the configuration retrieved from the backend
19 | # # model_config_dict = json.load(open(config_file, 'rt'))
20 |
21 | # Or store a dictionary describing a specific network design
22 | model_config_dict = {
23 | 'value': 13.37,
24 | 'dict': {'sub_value': 'string', 'sub_integer': 11},
25 | 'list_of_ints': [1, 2, 3, 4],
26 | }
27 | model_config_dict = task.connect_configuration(model_config_dict)
28 |
29 | # We now update the dictionary after connecting it, and the changes will be tracked as well.
30 | model_config_dict['new value'] = 10
31 | model_config_dict['value'] *= model_config_dict['new value']
32 |
33 | # store the label enumeration of the training model
34 | labels = {'background': 0, 'cat': 1, 'dog': 2}
35 | task.connect_label_enumeration(labels)
36 |
37 | # store the model; it will include the task network configuration and label enumeration
38 | print('Any model stored from this point onwards, will contain both model_config and label_enumeration')
39 |
40 | tempdir = tempfile.mkdtemp()
41 | tf.saved_model.save(model, os.path.join(tempdir, "model"))
42 | print('Model saved')
43 |
--------------------------------------------------------------------------------
/ml/clearml/tensorflow/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard>=2.0
2 | tensorflow>=2.0
3 | trains
--------------------------------------------------------------------------------
/ml/clearml/tensorflow/tensorboard_toy.py:
--------------------------------------------------------------------------------
1 | # TRAINS - Example of tensorboard with tensorflow (without any actual training)
2 | #
3 | import os
4 | import tensorflow as tf
5 | import numpy as np
6 | from tempfile import gettempdir
7 | from PIL import Image
8 |
9 | from trains import Task
10 |
11 |
12 | def generate_summary(k, step):
13 | # Make a normal distribution, with a shifting mean
14 | mean_moving_normal = tf.random.normal(shape=[1000], mean=(5 * k), stddev=1)
15 | # Record that distribution into a histogram summary
16 | tf.summary.histogram("normal/moving_mean", mean_moving_normal, step=step)
17 | tf.summary.scalar("normal/value", mean_moving_normal[-1], step=step)
18 |
19 | # Make a normal distribution with shrinking variance
20 | variance_shrinking_normal = tf.random.normal(shape=[1000], mean=0, stddev=1-k)
21 | # Record that distribution too
22 | tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal, step=step)
23 | tf.summary.scalar("normal/variance_shrinking_normal", variance_shrinking_normal[-1], step=step)
24 |
25 | # Let's combine both of those distributions into one dataset
26 | normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0)
27 | # We add another histogram summary to record the combined distribution
28 | tf.summary.histogram("normal/bimodal", normal_combined, step=step)
29 | tf.summary.scalar("normal/normal_combined", normal_combined[0], step=step)
30 |
31 | # Add a gamma distribution
32 | gamma = tf.random.gamma(shape=[1000], alpha=k)
33 | tf.summary.histogram("gamma", gamma, step=step)
34 |
35 | # And a poisson distribution
36 | poisson = tf.random.poisson(shape=[1000], lam=k)
37 | tf.summary.histogram("poisson", poisson, step=step)
38 |
39 | # And a uniform distribution
40 | uniform = tf.random.uniform(shape=[1000], maxval=k*10)
41 | tf.summary.histogram("uniform", uniform, step=step)
42 |
43 | # Finally, combine everything together!
44 | all_distributions = [mean_moving_normal, variance_shrinking_normal, gamma, poisson, uniform]
45 | all_combined = tf.concat(all_distributions, 0)
46 | tf.summary.histogram("all_combined", all_combined, step=step)
47 |
48 | # Log text value
49 | tf.summary.text("this is a test", "This is the content", step=step)
50 |
51 | # convert to 4d [batch, col, row, RGB-channels]
52 | image_open = Image.open(os.path.join('..', '..', 'reporting', 'data_samples', 'picasso.jpg'))
53 | image = np.asarray(image_open)
54 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis]
55 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2)
56 | image_rgba = image_rgba[np.newaxis, :, :, :]
57 | image = image[np.newaxis, :, :, :]
58 |
59 | tf.summary.image("test", image, max_outputs=10, step=step)
60 | tf.summary.image("test_gray", image_gray, max_outputs=10, step=step)
61 | tf.summary.image("test_rgba", image_rgba, max_outputs=10, step=step)
62 |
63 |
64 | task = Task.init(project_name='examples', task_name='tensorboard toy example')
65 |
66 | # create the tensorboard file writer in a temp folder
67 | writer = tf.summary.create_file_writer(os.path.join(gettempdir(), "toy_tb_example"))
68 |
69 | # Setup a loop and write the summaries to disk
70 | N = 40
71 | for step in range(N):
72 | k_val = step/float(N)
73 | with writer.as_default():
74 | generate_summary(k_val, tf.cast(step, tf.int64))
75 |
76 | print('Tensorboard toy example done')
77 |
--------------------------------------------------------------------------------
/ml/clearml/wandb/latest-run:
--------------------------------------------------------------------------------
1 | run-20210201_173509-jrmpee7z
--------------------------------------------------------------------------------
/ml/clearml/wandb/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | tensorboardX
3 | tensorboard>=1.14.0
4 | torch>=1.1.0
5 | torchvision>=0.3.0
6 | clearml
7 | wandb
--------------------------------------------------------------------------------
/ml/clearml/xgboost/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib >= 3.1.1 ; python_version >= '3.6'
2 | matplotlib >= 2.2.4 ; python_version < '3.6'
3 | scikit-learn
4 | trains
5 | xgboost>=0.90 ; python_version >= '3'
6 | xgboost>=0.82 ; python_version < '3'
7 | # sudo apt-get install graphviz
8 | graphviz>=0.8
9 |
--------------------------------------------------------------------------------
/ml/clearml/xgboost/xgboost_sample.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import xgboost as xgb
3 | from sklearn import datasets
4 | from sklearn.metrics import accuracy_score
5 | from sklearn.model_selection import train_test_split
6 | from xgboost import plot_tree
7 |
8 | from trains import Task
9 |
10 | task = Task.init(project_name='examples', task_name='XGBoost simple example')
11 | iris = datasets.load_iris()
12 | X = iris.data
13 | y = iris.target
14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
15 | dtrain = xgb.DMatrix(X_train, label=y_train)
16 | dtest = xgb.DMatrix(X_test, label=y_test)
17 | param = {
18 | 'max_depth': 3, # the maximum depth of each tree
19 | 'eta': 0.3, # the training step for each iteration
20 | 'silent': 1, # logging mode - quiet
21 | 'objective': 'multi:softprob', # error evaluation for multiclass training
22 |     'num_class': 3}  # the number of classes that exist in this dataset
23 | num_round = 20 # the number of training iterations
24 |
25 | # noinspection PyBroadException
26 | try:
27 | # try to load a model
28 | bst = xgb.Booster(params=param, model_file='xgb.01.model')
29 | bst.load_model('xgb.01.model')
30 | except Exception:
31 | bst = None
32 |
33 | # if we don't have one, train a model
34 | if bst is None:
35 | bst = xgb.train(param, dtrain, num_round)
36 |
37 | # store trained model v1
38 | bst.save_model('xgb.01.model')
39 | bst.dump_model('xgb.01.raw.txt')
40 |
41 | # build classifier
42 | model = xgb.XGBClassifier()
43 | model.fit(X_train, y_train)
44 |
45 | # store trained classifier model
46 | model.save_model('xgb.02.model')
47 |
48 | # make predictions for test data
49 | y_pred = model.predict(X_test)
50 | predictions = [round(value) for value in y_pred]
51 |
52 | # evaluate predictions
53 | accuracy = accuracy_score(y_test, predictions)
54 | print("Accuracy: %.2f%%" % (accuracy * 100.0))
55 | labels = dtest.get_label()
56 |
57 | # plot results
58 | xgb.plot_importance(model)
59 | plot_tree(model)
60 | plt.show()
61 |
--------------------------------------------------------------------------------
/ml/document-clustering/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | Document Clustering with Python 3
4 |
5 | This is my revision of the great tutorial at http://brandonrose.org/clustering - many thanks to the author.
6 |
7 | ## TL;DR
8 | **Data**: Top 100 movies (http://www.imdb.com/list/ls055592025/) with title, genre, and synopsis (IMDB and Wiki)
9 |
10 | **Goal**: Put 100 movies into 5 clusters by text-mining their synopses and plot the result as follows
11 |
12 |
13 |
14 | ## Setup
15 |
16 | First, clone the repo, go to the repo folder, set up the virtual environment, and install the required packages:
17 |
18 | ```
19 | $ cd path_to_document-clustering
20 | $ virtualenv -p python3 venv
21 | $ source venv/bin/activate
22 | $ pip3 install -r requirements.txt
23 | ```
24 | Second, use `nltk.download()` to download all NLTK packages (a GUI will open and you can choose to install all packages, about 3.5 GB), which are saved to /Users/your_mac_username/nltk_data:
25 |
26 | ```
27 | ipython
28 | import nltk
29 | nltk.download()
30 | ```
31 |
32 | Lastly, run `$ jupyter notebook` to go over the tutorial step-by-step.
33 |
34 | ## Key Steps
35 | 1. **Read data**: read titles, genres, synopses, rankings into four arrays
36 | 2. **Tokenize and stem**: break paragraphs into sentences, then into words, and stem the words (without removing stopwords) - each synopsis essentially becomes a bag of stemmed words.
37 | 3. **Generate tf-idf matrix**: each row is a term (unigram, bigram, trigram...generated from the bag of words in 2.), each column is a synopsis.
38 | 4. **Generate clusters**: based on the tf-idf matrix, 5 (or any number) clusters are generated using k-means. The top key terms are selected for each cluster.
39 | 5. **Calculate similarity**: generate the cosine similarity matrix using the tf-idf matrix (100x100), then generate the distance matrix (1 - similarity matrix), so each pair of synopses has a distance between 0 and 1.
40 | 6. **Plot clusters**: use multidimensional scaling (MDS) to convert the distance matrix into a 2-dimensional array; each synopsis gets an (x, y) pair that represents its relative location based on the distance matrix. Plot the 100 points with their (x, y) using matplotlib (I added an example using plotly.js). A minimal sketch of steps 3-6 follows.
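
A minimal, self-contained sketch of steps 3-6 with scikit-learn (toy stand-in strings instead of the 100 real synopses from step 1, and 3 clusters instead of the notebook's 5; no stemming here, so this illustrates the flow rather than the notebook's exact code):

```
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# stand-in for the synopses list built in step 1 -- replace with the real data
synopses = [
    "a mafia family fights for power in new york",
    "an innocent man escapes from prison after decades",
    "a factory owner saves his workers during the war",
    "a boxer rises and falls in the ring",
    "two lovers part ways in wartime morocco",
    "a reporter investigates a tycoon's last word",
]

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(synopses)   # step 3: tf-idf matrix (documents x terms)

km = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = km.fit_predict(tfidf_matrix)             # step 4: k-means cluster labels

dist = 1 - cosine_similarity(tfidf_matrix)          # step 5: pairwise distance matrix

mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
xs, ys = mds.fit_transform(dist).T                  # step 6: one (x, y) per synopsis

plt.scatter(xs, ys, c=clusters, cmap='tab10')
plt.show()
```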
41 |
--------------------------------------------------------------------------------
/ml/document-clustering/data/genres_list.txt:
--------------------------------------------------------------------------------
1 | [u' Crime', u' Drama']
2 | [u' Crime', u' Drama']
3 | [u' Biography', u' Drama', u' History']
4 | [u' Biography', u' Drama', u' Sport']
5 | [u' Drama', u' Romance', u' War']
6 | [u' Drama']
7 | [u' Drama', u' Romance', u' War']
8 | [u' Drama', u' Mystery']
9 | [u' Adventure', u' Family', u' Fantasy', u' Musical']
10 | [u' Drama', u' Romance']
11 | [u' Adventure', u' Biography', u' Drama', u' History', u' War']
12 | [u' Crime', u' Drama']
13 | [u' Horror', u' Mystery', u' Thriller']
14 | [u' Drama', u' Film-Noir']
15 | [u' Mystery', u' Romance', u' Thriller']
16 | [u' Crime', u' Drama']
17 | [u' Drama', u' Romance']
18 | [u' Biography', u' Drama', u' Family', u' Musical', u' Romance']
19 | [u' Crime', u' Drama', u' Musical', u' Romance', u' Thriller']
20 | [u' Action', u' Adventure', u' Fantasy', u' Sci-Fi']
21 | [u' Adventure', u' Family', u' Sci-Fi']
22 | [u' Mystery', u' Sci-Fi']
23 | [u' Crime', u' Drama', u' Thriller']
24 | [u' Drama', u' Mystery', u' Thriller']
25 | [u' Adventure', u' Drama', u' War']
26 | [u' Comedy', u' Musical', u' Romance']
27 | [u' Drama', u' Family', u' Fantasy']
28 | [u' Comedy']
29 | [u' Drama']
30 | [u' Comedy', u' War']
31 | [u' Biography', u' Drama', u' Music']
32 | [u' Drama', u' War']
33 | [u' Biography', u' Drama', u' History']
34 | [u' Adventure', u' Fantasy']
35 | [u' Action', u' Drama']
36 | [u' Drama', u' Romance', u' War']
37 | [u' Action', u' Drama', u' War']
38 | [u' Western']
39 | [u' Action', u' Adventure']
40 | [u' Drama', u' Sport']
41 | [u' Drama']
42 | [u' Comedy', u' Romance']
43 | [u' Drama']
44 | [u' Musical', u' Romance']
45 | [u' Drama', u' Romance', u' War']
46 | [u' Drama', u' Family', u' Musical', u' Romance']
47 | [u' Adventure', u' Drama']
48 | [u' Drama', u' Romance', u' War']
49 | [u' Biography', u' Drama', u' War']
50 | [u' Drama', u' Thriller']
51 | [u' Action', u' Biography', u' Drama', u' History', u' War']
52 | [u' Western']
53 | [u' Biography', u' Crime', u' Western']
54 | [u' Action', u' Adventure', u' Drama', u' Western']
55 | [u' Comedy', u' Drama', u' Romance']
56 | [u' Drama', u' War']
57 | [u' Western']
58 | [u' Adventure', u' Drama', u' Western']
59 | [u' Biography', u' Drama', u' War']
60 | [u' Biography', u' Crime', u' Drama']
61 | [u' Horror']
62 | [u' Drama', u' War']
63 | [u' Drama', u' War']
64 | [u' Action', u' Crime', u' Thriller']
65 | [u' Comedy', u' Drama', u' Romance']
66 | [u' Biography', u' Drama', u' History']
67 | [u' Comedy', u' Romance']
68 | [u' Drama', u' Romance']
69 | [u' Drama']
70 | [u' Drama']
71 | [u' Drama']
72 | [u' Comedy', u' Drama', u' Romance']
73 | [u' Biography', u' Drama', u' Romance']
74 | [u' Drama']
75 | [u' Comedy', u' Drama']
76 | [u' Comedy', u' Drama', u' Romance']
77 | [u' Crime', u' Drama', u' Thriller']
78 | [u' Drama', u' Romance']
79 | [u' Drama']
80 | [u' Drama', u' Romance', u' Western']
81 | [u' Crime', u' Drama', u' Fantasy', u' Mystery']
82 | [u' Drama', u' Sci-Fi']
83 | [u' Drama']
84 | [u' Drama', u' Music']
85 | [u' Comedy', u' Drama', u' Romance']
86 | [u' Comedy', u' Drama']
87 | [u' Crime', u' Drama', u' Thriller']
88 | [u' Adventure', u' Romance', u' War']
89 | [u' Adventure', u' Western']
90 | [u' Adventure', u' Drama', u' History']
91 | [u' Drama', u' Film-Noir', u' Mystery']
92 | [u' Crime', u' Drama', u' Sci-Fi']
93 | [u' Crime', u' Drama']
94 | [u' Drama', u' Romance']
95 | [u' Crime', u' Drama', u' Film-Noir', u' Thriller']
96 | [u' Drama']
97 | [u' Mystery', u' Thriller']
98 | [u' Film-Noir', u' Mystery', u' Thriller']
99 | [u' Mystery', u' Thriller']
100 | [u' Biography', u' Drama', u' Musical']
101 |
--------------------------------------------------------------------------------
/ml/document-clustering/data/title_list.txt:
--------------------------------------------------------------------------------
1 | The Godfather
2 | The Shawshank Redemption
3 | Schindler's List
4 | Raging Bull
5 | Casablanca
6 | One Flew Over the Cuckoo's Nest
7 | Gone with the Wind
8 | Citizen Kane
9 | The Wizard of Oz
10 | Titanic
11 | Lawrence of Arabia
12 | The Godfather: Part II
13 | Psycho
14 | Sunset Blvd.
15 | Vertigo
16 | On the Waterfront
17 | Forrest Gump
18 | The Sound of Music
19 | West Side Story
20 | Star Wars
21 | E.T. the Extra-Terrestrial
22 | 2001: A Space Odyssey
23 | The Silence of the Lambs
24 | Chinatown
25 | The Bridge on the River Kwai
26 | Singin' in the Rain
27 | It's a Wonderful Life
28 | Some Like It Hot
29 | 12 Angry Men
30 | Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb
31 | Amadeus
32 | Apocalypse Now
33 | Gandhi
34 | The Lord of the Rings: The Return of the King
35 | Gladiator
36 | From Here to Eternity
37 | Saving Private Ryan
38 | Unforgiven
39 | Raiders of the Lost Ark
40 | Rocky
41 | A Streetcar Named Desire
42 | The Philadelphia Story
43 | To Kill a Mockingbird
44 | An American in Paris
45 | The Best Years of Our Lives
46 | My Fair Lady
47 | Ben-Hur
48 | Doctor Zhivago
49 | Patton
50 | Jaws
51 | Braveheart
52 | The Good, the Bad and the Ugly
53 | Butch Cassidy and the Sundance Kid
54 | The Treasure of the Sierra Madre
55 | The Apartment
56 | Platoon
57 | High Noon
58 | Dances with Wolves
59 | The Pianist
60 | Goodfellas
61 | The Exorcist
62 | The Deer Hunter
63 | All Quiet on the Western Front
64 | The French Connection
65 | City Lights
66 | The King's Speech
67 | It Happened One Night
68 | A Place in the Sun
69 | Midnight Cowboy
70 | Mr. Smith Goes to Washington
71 | Rain Man
72 | Annie Hall
73 | Out of Africa
74 | Good Will Hunting
75 | Terms of Endearment
76 | Tootsie
77 | Fargo
78 | Giant
79 | The Grapes of Wrath
80 | Shane
81 | The Green Mile
82 | Close Encounters of the Third Kind
83 | Network
84 | Nashville
85 | The Graduate
86 | American Graffiti
87 | Pulp Fiction
88 | The African Queen
89 | Stagecoach
90 | Mutiny on the Bounty
91 | The Maltese Falcon
92 | A Clockwork Orange
93 | Taxi Driver
94 | Wuthering Heights
95 | Double Indemnity
96 | Rebel Without a Cause
97 | Rear Window
98 | The Third Man
99 | North by Northwest
100 | Yankee Doodle Dandy
101 |
--------------------------------------------------------------------------------
/ml/document-clustering/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | jupyter
3 | matplotlib
4 | nltk
5 | pandas
6 | scikit-learn
7 | scipy
8 |
--------------------------------------------------------------------------------
/ml/feature-importance/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | This is my revised code of the tutorial at:
4 | - https://machinelearningmastery.com/calculate-feature-importance-with-python/
5 | - https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/
6 |
7 | ## Setup
8 |
9 | For xgboost to work on macOS, run `brew install libomp` first.
10 |
11 | ```
12 | python3 -m venv venv
13 | source venv/bin/activate
14 | pip install -r requirements.txt
15 | ```
16 | Then, you can run `jupyter notebook` or use VS Code (`code .`) to open the notebooks.
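
As a quick orientation (a toy dataset and illustrative parameters, not the notebooks' exact code), the two flavors of feature importance covered by the linked tutorials look roughly like this in scikit-learn:

```
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# toy classification data; the notebooks use the datasets from the tutorials
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, random_state=1)

model = RandomForestClassifier(random_state=1).fit(X, y)

# impurity-based importance built into tree ensembles
print(model.feature_importances_)

# model-agnostic permutation importance
result = permutation_importance(model, X, y, n_repeats=10, random_state=1)
print(result.importances_mean)
```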
17 |
18 |
--------------------------------------------------------------------------------
/ml/feature-importance/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | matplotlib
3 | pandas
4 | scikit-learn
5 | xgboost
6 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/.gitignore:
--------------------------------------------------------------------------------
1 | /datasets/omniglot/data
2 | /model
3 | /logs
--------------------------------------------------------------------------------
/ml/few-shot-learning/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | This is my revised code of the tutorial at: https://medium.com/@barnrang/re-implementation-of-the-prototypical-network-for-few-shot-learning-using-tensorflow-2-0-keras-b2adac8e49e0
4 |
5 | The related paper is: Jake Snell, Kevin Swersky, and Richard S. Zemel (2017). Prototypical Networks for Few-shot Learning. CoRR, abs/1703.05175. https://arxiv.org/abs/1703.05175
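
For orientation only (this is the core idea from the paper, not this repo's exact implementation, which lives in `model_omniglot.py` and the loaders): each class prototype is the mean embedding of its support ("shot") examples, and a query is classified by a softmax over negative distances to the prototypes. A minimal NumPy sketch with illustrative shapes:

```
import numpy as np

# pretend embeddings from the conv net (shapes are illustrative)
way, shot, n_query, dim = 20, 1, 5, 64
support = np.random.randn(way, shot, dim)        # support ("shot") embeddings per class
query = np.random.randn(way * n_query, dim)      # query embeddings

prototypes = support.mean(axis=1)                # (way, dim): one prototype per class

# negative squared Euclidean distance from each query to each prototype
dists = ((query[:, None, :] - prototypes[None, :, :]) ** 2).sum(axis=-1)   # (total queries, way)
logits = -dists

# softmax over classes (stabilized), then predicted class per query
probs = np.exp(logits - logits.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)
pred = probs.argmax(axis=1)
```
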
6 | ## Setup
7 |
8 | Set up the virtual environment and install packages:
9 |
10 | ```
11 | python3 -m venv venv
12 | source venv/bin/activate
13 | pip install -r requirements.txt
14 | ```
15 | Then, you can run `jupyter notebook` or use VS Code (`code .`) to open the notebooks.
16 |
17 | ## Prepare Datasets
18 | Download dataset at https://drive.google.com/file/d/1UQEdAv4g_Mh2t15YtorNHkoHfQZJfmoE/view?usp=sharing
19 |
20 | For omniglot dataset:
21 |
22 | ```
23 | cd datasets/omniglot
24 | mkdir data
25 | unzip images_background.zip -d data/
26 | unzip images_evaluation.zip -d data/
27 | mv data/images_evaluation/* data/images_background/
28 | python dataloader_omniglot.py
29 | ```
30 |
31 | Note that we use 1200 * 4 (rotated in 4 directions) classes for training and the rest for the test set. The datasets are saved as numpy `.npy` files.
32 |
33 | ## Train and Test
34 |
35 | To train:
36 |
37 | In the root folder of this repo, run `python train_omniglot.py` to train for 2 epochs by default (about 10 minutes on a MacBook Pro).
38 |
39 | You can use different arguments:
40 |
41 | - `python train_omniglot.py --epoch 100`
42 | - `python train_omniglot.py --train_way 60 --train_query 5 --val_way 20 --shot 1 --gpu 0` (use `--gpu` to specify which GPU to use)
43 |
44 | Temporary checkpoints (named `omniglot_conv_{epoch}_{shot}_{val_way}`) and the final model `omniglot_conv` are saved in the `/model` folder (ignored by git).
45 |
46 | To visualize training with TensorBoard, run `tensorboard --logdir=./logs --port=6006`
47 |
48 | Then, you can access TensorBoard at http://localhost:6006/
49 |
50 |
51 |
52 | To test:
53 |
54 | `python test_omniglot.py --model model/omniglot_conv --shot 1 --test_way 20`
55 |
56 |
57 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/datasets/mini_imagenet/dataloader_mini_imagenet.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from skimage.io import imread
4 | from skimage.transform import resize as imresize
5 | import os
6 |
7 | train_label = pd.read_csv('train.csv')
8 | val_label = pd.read_csv('val.csv')
9 | test_label = pd.read_csv('test.csv')
10 |
11 | train_images = []
12 |
13 | PATH = 'images'
14 |
15 | for name, df in train_label['filename'].groupby(train_label['label']):
16 | images = []
17 | for image_name in df.values:
18 | image = imread(os.path.join(PATH, image_name))
19 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8)
20 | images.append(image)
21 |
22 | train_images.append(images)
23 |
24 | val_images = []
25 |
26 | PATH = 'images'
27 |
28 | for name, df in val_label['filename'].groupby(val_label['label']):
29 | images = []
30 | for image_name in df.values:
31 | image = imread(os.path.join(PATH, image_name))
32 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8)
33 | images.append(image)
34 |
35 | val_images.append(images)
36 |
37 | test_images = []
38 |
39 | PATH = 'images'
40 |
41 | for name, df in test_label['filename'].groupby(test_label['label']):
42 | images = []
43 | for image_name in df.values:
44 | image = imread(os.path.join(PATH, image_name))
45 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8)
46 | images.append(image)
47 |
48 | test_images.append(images)
49 |
50 | train_images = np.array(train_images)
51 |
52 | val_images = np.array(val_images)
53 | test_images = np.array(test_images)
54 |
55 | np.save('mini_train', train_images)
56 | np.save('mini_val', val_images)
57 | np.save('mini_test', test_images)
58 |
59 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/datasets/omniglot/dataloader_omniglot.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import matplotlib.pyplot as plt
4 | from tqdm import tqdm
5 | from skimage.transform import resize as imresize
6 | from skimage.transform import rotate
7 | BASE_PATH = "data/images_background"
8 | TRAIN_CLASS = 1200
9 |
10 |
11 | def loader(path=None):
12 | index = 0
13 | train_images = []
14 | eval_images = []
15 | current_save = train_images
16 | if path is None:
17 | path = BASE_PATH
18 | folders_list = os.listdir(path)
19 | folders_list.sort()
20 | count = 0
21 | loading_eval = False
22 | for folder in tqdm(folders_list):
23 | path1 = os.path.join(path, folder)
24 | try: #In case of invalid folder
25 | for char_type in os.listdir(path1):
26 |                 if not loading_eval and count >= TRAIN_CLASS:
27 | loading_eval = True
28 | current_save = eval_images
29 | print("Start to collect eval")
30 |
31 | path2 = os.path.join(path1, char_type)
32 | try:
33 | for rot in [0,90,180,270]:
34 | class_image = []
35 | for image_name in os.listdir(path2):
36 | image = plt.imread(os.path.join(path2, image_name))
37 | image = imresize(image,(28,28), anti_aliasing=False)
38 | image = rotate(image, rot)
39 | image = np.expand_dims(image, axis=-1)
40 | class_image.append(image)
41 | current_save.append(class_image)
42 | count += 1
43 | except NotADirectoryError:
44 | print(f"Cannot load from {path2}")
45 | except NotADirectoryError:
46 | print(f"cannot load from {path1}")
47 | continue
48 |
49 | np.save(f"./data/train_omniglot.npy", (np.array(train_images) * 255).astype(np.uint8))
50 | np.save(f"./data/test_omniglot.npy", (np.array(eval_images) * 255).astype(np.uint8))
51 |
52 |
53 | if __name__ == "__main__":
54 | images = loader()
55 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/loader_omniglot.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import tensorflow
4 | from tensorflow import keras
5 |
6 |
7 | class DataGenerator(tensorflow.keras.utils.Sequence):
8 | 'Generates data for Keras'
9 | def __init__(self, data_type='train', dim=(28,28), n_channels=1,
10 | way=20, shot=1, query=1, num_batch=500):
11 | 'Initialization'
12 | self.type = data_type
13 | # if self.type == 'train':
14 | # self.is_training = np.array([True for _ in range(batch_size)])
15 | # else:
16 | # self.is_training = np.array([False for _ in range(batch_size)])
17 | self.dim = dim
18 | #self.batch_size = batch_size
19 | self.n_channels = n_channels
20 | self.num_per_class = 20
21 | self.num_batch = num_batch
22 | #self.y_target = np.zeros(self.batch_size)
23 | self.build_data(self.type)
24 | self.on_epoch_end()
25 | self.way = way
26 | self.shot = shot
27 | self.query = query
28 | #TODO!!!!
29 | #self.hard_batch = np.zeros(batch_size, *dim, n_channels)
30 |
31 | def build_data(self, data_type):
32 | if data_type == 'train':
33 | self.class_data = np.load('datasets/omniglot/data/train_omniglot.npy')
34 | else:
35 | self.class_data = np.load('datasets/omniglot/data/test_omniglot.npy')
36 |
37 | self.n_classes = len(self.class_data)
38 |
39 | def __len__(self):
40 | 'Denotes the number of batches per epoch'
41 | return self.num_batch
42 |
43 | def __getitem__(self, index):
44 | 'Generate one batch of data'
45 | # Generate data
46 | X_sample, X_query, label = self.__data_generation()
47 | #way = np.ones((self.way * self.shot, 1)) * self.way
48 |
49 |
50 | return [X_sample, X_query], label
51 |
52 | def on_epoch_end(self):
53 | 'Updates indexes after each epoch'
54 | pass
55 |
56 | def __data_generation(self):
57 | 'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
58 | # Initialization
59 | X_sample = np.empty((self.way, self.shot, *self.dim, self.n_channels))
60 | X_query = np.empty((self.way, self.query, *self.dim, self.n_channels))
61 | chosen_class = random.sample(range(self.n_classes), self.way)
62 | label = np.empty(self.way * self.query)
63 | # print(pos, neg)
64 | # print(self.class_data[pos][0].shape)
65 | # Generate data
66 | for i in range(self.way):
67 | sample_idx = random.sample(range(self.num_per_class), self.shot + self.query)
68 | sample_data = self.class_data[chosen_class[i]][sample_idx]/255.
69 | X_sample[i] = sample_data[:self.shot]
70 | X_query[i] = sample_data[self.shot:self.shot + self.query]
71 | label[i * self.query: (i+1) * self.query] = i
72 | return X_sample, X_query, keras.utils.to_categorical(label)
73 | #return X, keras.utils.to_categorical(y, num_classes=self.n_classes)
74 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/mini_imagenet/mini_proto_model.py:
--------------------------------------------------------------------------------
1 | from tensorflow.keras.layers import Input, Conv2D, Lambda, Dense, Flatten,MaxPooling2D, Activation
2 | from tensorflow.keras.layers import BatchNormalization
3 | from tensorflow.keras.models import Model, Sequential
4 | from tensorflow.keras.regularizers import l2
5 | from tensorflow.keras import backend as K
6 | from tensorflow.keras.optimizers import SGD,Adam
7 | from tensorflow.keras.losses import binary_crossentropy
8 | import tensorflow as tf
9 | import numpy.random as rng
10 | import numpy as np
11 | import os
12 | import matplotlib.pyplot as plt
13 | eps = 1e-12
14 |
15 | def W_init(shape,name=None):
16 | """Initialize weights as in paper"""
17 | values = rng.normal(loc=0,scale=1e-2,size=shape)
18 | return K.variable(values,name=name)
19 | #//TODO: figure out how to initialize layer biases in tensorflow.keras.
20 | def b_init(shape,name=None):
21 | """Initialize bias as in paper"""
22 | values=rng.normal(loc=0.5,scale=1e-2,size=shape)
23 | return K.variable(values,name=name)
24 |
25 | input_shape = (84,84, 3)
26 |
27 | #build convnet to use in each siamese 'leg'
28 | def conv_net():
29 | convnet = Sequential()
30 | for i in range(4):
31 | convnet.add(Conv2D(64,(3,3),padding='same',input_shape=input_shape))
32 | convnet.add(BatchNormalization())
33 | convnet.add(Activation('relu'))
34 | convnet.add(MaxPooling2D())
35 | convnet.add(Flatten())
36 | return convnet
37 |
38 | def l1_distance(x,y):
39 |     return tf.reduce_sum(tf.maximum(tf.abs(x-y),eps), axis=1, keepdims=True)
40 |
41 | def l2_distance(x,y):
42 |     return tf.sqrt(tf.reduce_sum(tf.maximum(tf.square(x-y),eps), axis=1, keepdims=True))
43 |
44 | def hinge_loss(target, pred, h=1.):
45 | loss = tf.reduce_mean(tf.maximum(pred + h, 0.))
46 | return loss
47 |
48 | def acc(target, pred):
49 | result = tf.cast(tf.less(pred, target), dtype=tf.float32)
50 | return tf.reduce_mean(result)
51 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/mini_imagenet/mini_proto_test.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 | import os
4 | def parser():
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument('--test_way', dest='test_way', type=int, default=5)
7 | parser.add_argument('--shot', dest='shot', type=int, default=1)
8 | parser.add_argument('--gpu', dest='gpu', type=int, default=0)
9 | parser.add_argument('--model', dest='model')
10 |
11 | return parser.parse_args()
12 |
13 | args = parser()
14 | os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu)
15 |
16 | from tensorflow.keras import callbacks as cb
17 | from tensorflow.keras.optimizers import Adam
18 | from tensorflow.keras.models import load_model, Model, save_model
19 | from tensorflow.keras.layers import *
20 | from tensorflow.keras.models import Sequential
21 | from tensorflow.keras import regularizers as rg
22 | from tensorflow.keras.preprocessing.image import ImageDataGenerator
23 | from tensorflow.keras.applications.xception import Xception
24 | from tensorflow.keras import backend as K
25 |
26 |
27 | import numpy.random as rng
28 |
29 | import numpy as np
30 | import matplotlib.pyplot as plt
31 | import matplotlib.image as img
32 | import random
33 | from python.dataloader import loader
34 | from mini_protoloader import DataGenerator
35 | from mini_proto_model import conv_net, hinge_loss, l2_distance, acc, l1_distance
36 | #from transform import transform_gate
37 | from util.tensor_op import *
38 | from util.loss import *
39 | input_shape = (None,84,84,3)
40 | batch_size = 20
41 | test_way = args.test_way
42 | shot = args.shot
43 | model_path = args.model
44 | lr = 0.002
45 |
46 | def scheduler(epoch):
47 | global lr
48 | if epoch % 15 == 0:
49 | lr /= 2
50 | return lr
51 |
52 | class SaveConv(tf.keras.callbacks.Callback):
53 | def on_epoch_end(self, epoch, logs=None):
54 | if epoch % 5 == 0:
55 | save_model(conv, f"model/miniimage_conv_{epoch}_{shot}_{val_way}")
56 |
57 |
58 | if __name__ == "__main__":
59 | #conv = conv_net()
60 | conv = load_model(model_path)
61 | sample = Input(input_shape)
62 | conv_5d = TimeDistributed(conv)
63 | out_feature = conv_5d(sample)
64 | out_feature = Lambda(reduce_tensor)(out_feature)
65 | inp = Input(input_shape)
66 | map_feature = conv_5d(inp)
67 | map_feature = Lambda(reshape_query)(map_feature)
68 | pred = Lambda(proto_dist)([out_feature, map_feature]) #negative distance
69 | combine = Model([sample, inp], pred)
70 |
71 | optimizer = Adam(0.001)
72 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer,
73 | metrics=['categorical_accuracy'])
74 | test_loader = DataGenerator(data_type='test',way=test_way, shot=shot, num_batch=10000)
75 |
76 | combine.evaluate(test_loader)
77 |
78 |
79 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/mini_imagenet/mini_protoloader.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from keras.utils import np_utils
3 | from tensorflow.keras.preprocessing.image import ImageDataGenerator
4 | import tensorflow
5 | import keras
6 | import random
7 | from python.dataloader import loader
8 |
9 | class DataGenerator(tensorflow.keras.utils.Sequence):
10 | 'Generates data for Keras'
11 | def __init__(self, data_type='train', dim=(84,84), n_channels=3,
12 | way=5, shot=1, query=5, num_batch=500):
13 | 'Initialization'
14 | self.type = data_type
15 | # if self.type == 'train':
16 | # self.is_training = np.array([True for _ in range(batch_size)])
17 | # else:
18 | # self.is_training = np.array([False for _ in range(batch_size)])
19 | self.dim = dim
20 | #self.batch_size = batch_size
21 | self.n_channels = n_channels
22 | self.num_per_class = 600
23 | self.num_batch = num_batch
24 | #self.y_target = np.zeros(self.batch_size)
25 | self.build_data(self.type)
26 | self.on_epoch_end()
27 | self.way = way
28 | self.shot = shot
29 | self.query = query
30 | self.transformer = ImageDataGenerator(
31 | width_shift_range=0.1,
32 | height_shift_range=0.1,
33 | zoom_range=0.2,
34 | rotation_range=30,
35 | horizontal_flip=True,
36 | shear_range=0.1
37 |
38 | )
39 | #TODO!!!!
40 | #self.hard_batch = np.zeros(batch_size, *dim, n_channels)
41 |
42 | def build_data(self, data_type):
43 | if data_type == 'train':
44 | self.class_data = np.load('python/mini_train.npy')
45 | elif data_type == 'val':
46 | self.class_data = np.load('python/mini_val.npy')
47 | else:
48 | self.class_data = np.load('python/mini_test.npy')
49 |
50 | self.n_classes = len(self.class_data)
51 |
52 | def __len__(self):
53 | 'Denotes the number of batches per epoch'
54 | return self.num_batch
55 |
56 | def __getitem__(self, index):
57 | 'Generate one batch of data'
58 | # Generate data
59 | X_sample, X_query, label = self.__data_generation()
60 | #way = np.ones((self.way * self.shot, 1)) * self.way
61 |
62 |
63 | return [X_sample, X_query], label
64 |
65 | def on_epoch_end(self):
66 | 'Updates indexes after each epoch'
67 | pass
68 |
69 | def __data_generation(self):
70 | 'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
71 | # Initialization
72 | X_sample = np.empty((self.way, self.shot, *self.dim, self.n_channels))
73 | X_query = np.empty((self.way, self.query, *self.dim, self.n_channels))
74 | chosen_class = random.sample(range(self.n_classes), self.way)
75 | label = np.empty(self.way * self.query)
76 | # print(pos, neg)
77 | # print(self.class_data[pos][0].shape)
78 | # Generate data
79 | for i in range(self.way):
80 | sample_idx = random.sample(range(self.num_per_class), self.shot + self.query)
81 | sample_data = self.class_data[chosen_class[i]][sample_idx]/255.
82 | if True:
83 | #if self.type != 'train':
84 | X_sample[i] = sample_data[:self.shot]
85 | X_query[i] = sample_data[self.shot:self.shot + self.query]
86 | else:
87 | for j in range(self.shot):
88 | params = self.transformer.get_random_transform(self.dim + (self.n_channels,))
89 | x = self.transformer.apply_transform(sample_data[j], params)
90 | X_sample[i][j] = x
91 |
92 | for j in range(self.shot, self.shot + self.query):
93 | params = self.transformer.get_random_transform(self.dim + (self.n_channels,))
94 | x = self.transformer.apply_transform(sample_data[j], params)
95 | X_query[i][j-self.shot] = x
96 |
97 | label[i * self.query: (i+1) * self.query] = i
98 | return X_sample, X_query, to_categorical(label)
99 | #return X, keras.utils.to_categorical(y, num_classes=self.n_classes)
100 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/model_omniglot.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow.keras.layers import Input, Conv2D, Lambda, Dense, Flatten, MaxPooling2D, Activation, BatchNormalization
4 | from tensorflow.keras.models import Model, Sequential
5 |
6 |
7 | eps = 1e-12
8 |
9 | def W_init(shape,name=None):
10 | """Initialize weights as in paper"""
11 | values = np.random.normal(loc=0, scale=1e-2, size=shape)
12 | return tf.Variable(values, name=name)
13 |
14 |
15 | #//TODO: figure out how to initialize layer biases in tensorflow.keras.
16 | def b_init(shape, name=None):
17 | """Initialize bias as in paper"""
18 | values=np.random.normal(loc=0.5, scale=1e-2, size=shape)
19 | return tf.Variable(values, name=name)
20 |
21 | input_shape = (28, 28, 1)
22 |
23 |
24 | #build convnet to use in each siamese 'leg'
25 | def conv_net():
26 | convnet = Sequential()
27 | for i in range(4):
28 | convnet.add(Conv2D(64,(3,3),padding='same', input_shape=input_shape))
29 | convnet.add(BatchNormalization())
30 | convnet.add(Activation('relu'))
31 | convnet.add(MaxPooling2D())
32 | convnet.add(Flatten())
33 | return convnet
34 |
35 |
36 | def l1_distance(x,y):
37 | return tf.reduce_sum(tf.maximum(tf.abs(x-y),eps), axis=1, keepdims=True)
38 |
39 |
40 | def l2_distance(x,y):
41 | return tf.sqrt(tf.reduce_sum(tf.maximum(tf.square(x-y),eps), axis=1, keepdims=True))
42 |
43 |
44 | def hinge_loss(target, pred, h=1.):
45 | loss = tf.reduce_mean(tf.maximum(pred + h, 0.))
46 | return loss
47 |
48 |
49 | def acc(target, pred):
50 | result = tf.cast(tf.less(pred, target), dtype=tf.float32)
51 | return tf.reduce_mean(result)
52 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/notebooks/dataloader_notebook/images_background_small2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/few-shot-learning/notebooks/dataloader_notebook/images_background_small2.zip
--------------------------------------------------------------------------------
/ml/few-shot-learning/notebooks/dataloader_notebook/loss_test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "/home/barnrang/.conda/envs/chatbot/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n",
13 | " return f(*args, **kwds)\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import tensorflow as tf\n",
19 | "from util.loss import prior_dist\n",
20 | "%load_ext autoreload\n",
21 | "%autoreload 2"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {
28 | "collapsed": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "x = tf.placeholder(shape=[None, 2], dtype=tf.float32)\n",
33 | "y = tf.placeholder(shape=[None, 2], dtype=tf.float32)"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 6,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "Tensor(\"Sum_4:0\", shape=(?, 1), dtype=float32) Tensor(\"Sum_5:0\", shape=(?, 1), dtype=float32)\n",
46 | "Tensor(\"MatMul_2:0\", shape=(?, ?), dtype=float32)\n"
47 | ]
48 | }
49 | ],
50 | "source": [
51 | "z = prior_dist([x,y])"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 8,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "name": "stdout",
61 | "output_type": "stream",
62 | "text": [
63 | "[[ 9. 17. 29.]\n",
64 | " [ 1. 1. 5.]]\n"
65 | ]
66 | }
67 | ],
68 | "source": [
69 | "with tf.Session() as sess:\n",
70 | " print(sess.run(z, feed_dict={x:[[1,3],[2,4],[3,5]],\n",
71 | " y:[[1,0],[1,4]]}))"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": true
79 | },
80 | "outputs": [],
81 | "source": []
82 | }
83 | ],
84 | "metadata": {
85 | "kernelspec": {
86 | "display_name": "Python 3",
87 | "language": "python",
88 | "name": "python3"
89 | },
90 | "language_info": {
91 | "codemirror_mode": {
92 | "name": "ipython",
93 | "version": 3
94 | },
95 | "file_extension": ".py",
96 | "mimetype": "text/x-python",
97 | "name": "python",
98 | "nbconvert_exporter": "python",
99 | "pygments_lexer": "ipython3",
100 | "version": "3.6.3"
101 | }
102 | },
103 | "nbformat": 4,
104 | "nbformat_minor": 2
105 | }
106 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | numpy
3 | scikit-image
4 | tensorflow >= 2.0
5 | tqdm
--------------------------------------------------------------------------------
/ml/few-shot-learning/test_omniglot.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | from tensorflow.keras import callbacks as cb
5 | from tensorflow.keras.optimizers import Adam
6 | from tensorflow.keras.models import load_model, Model, save_model
7 | from tensorflow.keras.layers import *
8 |
9 | from loader_omniglot import DataGenerator
10 | from model_omniglot import conv_net
11 | from util.tensor_op import *
12 | from util.loss import *
13 |
14 |
15 | def parser():
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument('--test_way', dest='test_way', type=int, default=5)
18 | parser.add_argument('--shot', dest='shot', type=int, default=1)
19 | parser.add_argument('--gpu', dest='gpu', type=int, default=0)
20 | parser.add_argument('--model', dest='model')
21 |
22 | return parser.parse_args()
23 |
24 | args = parser()
25 | os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu)
26 | test_way = args.test_way
27 | shot = args.shot
28 | model_path = args.model
29 |
30 | input_shape = (None, 28, 28, 1)
31 | batch_size = 20
32 | lr = 0.002
33 |
34 |
35 | def scheduler(epoch):
36 | global lr
37 | if epoch % 15 == 0:
38 | lr /= 2
39 | return lr
40 |
41 |
42 | class SaveConv(tf.keras.callbacks.Callback):
43 | def on_epoch_end(self, epoch, logs=None):
44 | if epoch % 5 == 0:
45 | save_model(conv, f"model/omniglot_conv_{epoch}_{shot}_{test_way}")  # use test_way; val_way is not defined in this test script
46 |
47 |
48 | if __name__ == "__main__":
49 | conv = load_model(model_path)
50 | sample = Input(input_shape)
51 | conv_5d = TimeDistributed(conv)
52 | out_feature = conv_5d(sample)
53 | out_feature = Lambda(reduce_tensor)(out_feature)
54 | inp = Input(input_shape)
55 | map_feature = conv_5d(inp)
56 | map_feature = Lambda(reshape_query)(map_feature)
57 | pred = Lambda(proto_dist)([out_feature, map_feature]) #negative distance
58 | combine = Model([sample, inp], pred)
59 |
60 | optimizer = Adam(0.001)
61 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer,
62 | metrics=['categorical_accuracy'])
63 | test_loader = DataGenerator(data_type='test', way=test_way, shot=shot, num_batch=10000)
64 |
65 | combine.evaluate(test_loader)
--------------------------------------------------------------------------------
/ml/few-shot-learning/train_omniglot.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | from tensorflow.keras import callbacks as cb
5 | from tensorflow.keras.optimizers import Adam
6 | from tensorflow.keras.models import load_model, Model, save_model
7 | from tensorflow.keras.layers import *
8 |
9 | # import from custom modules
10 | from loader_omniglot import DataGenerator
11 | from model_omniglot import conv_net
12 | from util.tensor_op import *
13 | from util.loss import *
14 |
15 | # command line argument parser
16 | def parser():
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument('--train_way', dest='train_way', type=int, default=60)
19 | parser.add_argument('--train_query', dest='train_query', type=int, default=5)
20 | parser.add_argument('--val_way', dest='val_way', type=int, default=20)
21 | parser.add_argument('--shot', dest='shot', type=int, default=1)
22 | parser.add_argument('--gpu', dest='gpu', type=int, default=0)
23 | parser.add_argument('--epochs', dest='epochs', type=int, default=2)
24 |
25 | return parser.parse_args()
26 |
27 | args = parser()
28 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
29 |
30 | # get values from the command line arguments
31 | train_way = args.train_way
32 | train_query = args.train_query
33 | val_way = args.val_way
34 | shot = args.shot
35 | epochs = args.epochs
36 |
37 | # specify model parameters
38 | input_shape = (None, 28, 28, 1)
39 | batch_size = 20
40 | lr = 0.002
41 |
42 | def scheduler(epoch):
43 | global lr
44 | if epoch % 100 == 0:
45 | lr /= 2
46 | return lr
47 |
48 | class SaveConv(tf.keras.callbacks.Callback):
49 | def on_epoch_end(self, epoch, logs=None):
50 | if epoch % 50 == 0:
51 | save_model(conv, f"model/omniglot_conv_{epoch}_{shot}_{val_way}")
52 |
53 | if __name__ == "__main__":
54 | conv = conv_net()
55 | sample = Input(input_shape)
56 | conv_5d = TimeDistributed(conv)
57 | out_feature = conv_5d(sample)
58 | out_feature = Lambda(reduce_tensor)(out_feature)
59 | inp = Input(input_shape)
60 | map_feature = conv_5d(inp)
61 | map_feature = Lambda(reshape_query)(map_feature)
62 | # proto_dist is from util/loss.py
63 | pred = Lambda(proto_dist)([out_feature, map_feature]) #negative distance
64 | combine = Model([sample, inp], pred)
65 |
66 | optimizer = Adam(0.001)
67 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])
68 |
69 | train_loader = DataGenerator(way=train_way, query=train_query, shot=shot, num_batch=1000)
70 | val_loader = DataGenerator(data_type='val',way=val_way, shot=shot)
71 |
72 | (x,y), z = train_loader[0]
73 | print(x.shape, y.shape, z.shape)
74 | print(combine.summary())
75 |
76 | save_conv = SaveConv()
77 | reduce_lr = cb.ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=2, min_lr=1e-8)
78 | lr_sched = cb.LearningRateScheduler(scheduler)
79 | tensorboard = cb.TensorBoard()
80 |
81 | combine.fit(  # fit_generator is deprecated/removed in newer TF versions; fit accepts a keras Sequence
82 | train_loader,
83 | epochs=epochs,
84 | validation_data=val_loader,
85 | use_multiprocessing=False,
86 | workers=4,
87 | shuffle=False,
88 | callbacks=[save_conv, lr_sched, tensorboard]
89 | )
90 |
91 | save_model(conv, "model/omniglot_conv")
92 |
--------------------------------------------------------------------------------
/ml/few-shot-learning/util/loss.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | def proto_dist(x):
4 | feature, pred = x
5 | pred_dist = tf.reduce_sum(pred ** 2, axis=1, keepdims=True)
6 | feature_dist = tf.reduce_sum(feature ** 2, axis=1, keepdims=True)
7 | dot = tf.matmul(pred, tf.transpose(feature))
8 | return tf.nn.softmax(-(tf.sqrt(pred_dist + tf.transpose(feature_dist) - 2 * dot)))
9 |
10 |
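# Note: proto_dist computes the pairwise Euclidean distance between every query
# embedding (pred) and every class prototype (feature) via the expansion
# ||q - p||^2 = ||q||^2 + ||p||^2 - 2 q.p, then applies a softmax to the negative
# distances so that closer prototypes receive higher class probability.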
--------------------------------------------------------------------------------
/ml/few-shot-learning/util/tensor_op.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | def slice_tensor_and_sum(x, way=20):
4 | sliced = tf.split(x, num_or_size_splits=way,axis=0)
5 | return tf.reduce_mean(sliced, axis=1)
6 |
7 | def reduce_tensor(x):
8 | return tf.reduce_mean(x, axis=1)
9 |
10 | def reshape_query(x):
11 | return tf.reshape(x, [-1, tf.shape(x)[-1]])
12 |
--------------------------------------------------------------------------------
/ml/fine-tune-pegasus/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 | Fine-tune pegasus-large using the XSUM dataset.
3 |
4 | Adapted from https://towardsdatascience.com/how-to-perform-abstractive-summarization-with-pegasus-3dd74e48bafb
5 |
6 | Colab version (slightly different from this notebook: it includes pip installation and uses a batch size of 2): https://colab.research.google.com/drive/1RyUsYDAo6bA1RZICMb-FxYLszBcDY81X?usp=sharing
7 |
8 | ## Setup
9 |
10 | ```
11 | $ python3 -m venv venv
12 | $ source venv/bin/activate
13 | $ pip install -r requirements.txt
14 | ```
--------------------------------------------------------------------------------
/ml/fine-tune-pegasus/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | SentencePiece
3 | transformers>=4.3.3
4 | datasets>=1.4.1
5 | torch>=1.8.0
6 |
--------------------------------------------------------------------------------
/ml/graph/.gitignore:
--------------------------------------------------------------------------------
1 | /logs
--------------------------------------------------------------------------------
/ml/graph/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | https://www.analyticsvidhya.com/blog/2020/01/link-prediction-how-to-predict-your-future-connections-on-facebook/
4 | https://www.analyticsvidhya.com/blog/2019/11/graph-feature-extraction-deepwalk/
5 | https://www.tensorflow.org/tutorials/text/word2vec
6 |
7 |
8 |
9 | ## Setup
10 |
11 | ```
12 | $ python3 -m venv venv
13 | $ source venv/bin/activate
14 | $ pip install -r requirements.txt
15 | ```
--------------------------------------------------------------------------------
/ml/graph/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | pandas
3 | matplotlib
4 | tensorflow
5 | tqdm
6 | scikit-learn
7 | networkx
8 | node2vec
9 | lightgbm
--------------------------------------------------------------------------------
/ml/greedy-layer-wise-pretraning/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | This is my revised code for the tutorial at https://machinelearningmastery.com/greedy-layer-wise-pretraining-tutorial/
4 |
5 | ## Setup
6 |
7 | within the tutorial folder:
8 |
9 | ```
10 | python3 -m venv venv
11 | source venv/bin/activate
12 | pip install -r requirements.txt
13 | ```
14 | Then, you can use `jupyter notebook` or VS Code (`code .`) to open the notebooks.
15 |
16 |
--------------------------------------------------------------------------------
/ml/greedy-layer-wise-pretraning/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | matplotlib
3 | pandas
4 | scikit-learn
5 | tensorflow
6 |
--------------------------------------------------------------------------------
/ml/house-price-prediction/README.md:
--------------------------------------------------------------------------------
1 | ## Kaggle Kernel
2 |
3 | You can run this kernel directly at Kaggle.com: https://www.kaggle.com/harrywang/housing-price-prediction
4 |
5 | ## Run Locally
6 |
7 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages:
8 |
9 | ```
10 | $ cd path_to_this_folder
11 | $ virtualenv -p python3 venv
12 | $ source venv/bin/activate
13 | $ pip3 install -r requirements.txt
14 | ```
15 |
16 | Run `$ jupyter notebook` to go over the tutorial step-by-step.
17 |
18 | ## Source
19 |
20 | This is the dataset used in this book: https://github.com/ageron/handson-ml/tree/master/datasets/housing to illustrate a sample end-to-end ML project workflow (pipeline). This is a great book - I highly recommend it!
21 |
22 | The data is based on the 1990 California census.
23 |
24 | ### About the Data (from the book):
25 |
26 | "This dataset is a modified version of the California Housing dataset available from Luís Torgo's page (University of Porto). Luís Torgo obtained it from the StatLib repository (which is closed now). The dataset may also be downloaded from StatLib mirrors.
27 |
28 | The following is the description from the book author:
29 |
30 | This dataset appeared in a 1997 paper titled Sparse Spatial Autoregressions by Pace, R. Kelley and Ronald Barry, published in the Statistics and Probability Letters journal. They built it using the 1990 California census data. It contains one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people).
31 |
32 | The dataset in this directory is almost identical to the original, with two differences:
33 | 207 values were randomly removed from the total_bedrooms column, so we can discuss what to do with missing data.
34 | An additional categorical attribute called ocean_proximity was added, indicating (very roughly) whether each block group is near the ocean, near the Bay area, inland or on an island. This allows discussing what to do with categorical data.
35 | Note that the block groups are called "districts" in the Jupyter notebooks, simply because in some contexts the name "block group" was confusing."
36 |
37 | ### About the Data (From Luís Torgo page):
38 | http://www.dcc.fc.up.pt/%7Eltorgo/Regression/cal_housing.html
39 |
40 | This is a dataset obtained from the StatLib repository. Here is the included description:
41 |
42 | "We collected information on the variables using all the block groups in California from the 1990 Cens us. In this sample a block group on average includes 1425.5 individuals living in a geographically co mpact area. Naturally, the geographical area included varies inversely with the population density. W e computed distances among the centroids of each block group as measured in latitude and longitude. W e excluded all the block groups reporting zero entries for the independent and dependent variables. T he final data contained 20,640 observations on 9 variables. The dependent variable is ln(median house value)."
43 |
44 |
45 | ### End-to-End ML Project Steps (Chapter 2 of the book)
46 |
47 | 1. Look at the big picture
48 | 2. Get the data
49 | 3. Discover and visualize the data to gain insights
50 | 4. Prepare the data for Machine Learning algorithms
51 | 5. Select a model and train it
52 | 6. Fine-tune your model
53 | 7. Present your solution
54 | 8. Launch, monitor, and maintain your system
55 |
56 | ## The 10-Step Machine Learning Project Workflow (My Version)
57 |
58 | 1. Define the business objective
59 | 2. Make sense of the data from a high level
60 | - data types (number, text, object, etc.)
61 | - continuous/discrete
62 | - basic stats (min, max, std, median, etc.) using boxplot
63 | - frequency via histogram
64 | - scales and distributions of different features
65 | 3. Create the training and test sets using proper sampling methods, e.g., random vs. stratified
66 | 4. Correlation analysis (pair-wise and attribute combinations)
67 | 5. Data cleaning (missing data, outliers, data errors)
68 | 6. Data transformation via pipelines (categorical text to numbers using one-hot encoding, feature scaling via normalization/standardization, feature combinations) - see the sketch after this list
69 | 7. Train and cross validate different models and select the most promising one (Linear Regression, Decision Tree, and Random Forest were tried in this tutorial)
70 | 8. Fine-tune the model by trying different combinations of hyperparameters
71 | 9. Evaluate the model with the best estimator on the test set
72 | 10. Launch, monitor, and refresh the model and system
73 |
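As an illustration of step 6 (a minimal sketch, not code from the notebook), a typical scikit-learn preprocessing pipeline for this dataset might look like the following; the column names match the housing data, but the csv path is a placeholder:

```
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

housing = pd.read_csv("housing.csv")  # placeholder path

numeric_features = ["total_bedrooms", "median_income"]  # subset for illustration
categorical_features = ["ocean_proximity"]

preprocess = ColumnTransformer([
    # impute the missing total_bedrooms values, then standardize
    ("num", Pipeline([("impute", SimpleImputer(strategy="median")),
                      ("scale", StandardScaler())]), numeric_features),
    # one-hot encode the categorical ocean_proximity column
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

X_prepared = preprocess.fit_transform(housing)
```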
--------------------------------------------------------------------------------
/ml/house-price-prediction/input/anscombe.csv:
--------------------------------------------------------------------------------
1 | dataset,x,y
2 | I,10.0,8.04
3 | I,8.0,6.95
4 | I,13.0,7.58
5 | I,9.0,8.81
6 | I,11.0,8.33
7 | I,14.0,9.96
8 | I,6.0,7.24
9 | I,4.0,4.26
10 | I,12.0,10.84
11 | I,7.0,4.82
12 | I,5.0,5.68
13 | II,10.0,9.14
14 | II,8.0,8.14
15 | II,13.0,8.74
16 | II,9.0,8.77
17 | II,11.0,9.26
18 | II,14.0,8.1
19 | II,6.0,6.13
20 | II,4.0,3.1
21 | II,12.0,9.13
22 | II,7.0,7.26
23 | II,5.0,4.74
24 | III,10.0,7.46
25 | III,8.0,6.77
26 | III,13.0,12.74
27 | III,9.0,7.11
28 | III,11.0,7.81
29 | III,14.0,8.84
30 | III,6.0,6.08
31 | III,4.0,5.39
32 | III,12.0,8.15
33 | III,7.0,6.42
34 | III,5.0,5.73
35 | IV,8.0,6.58
36 | IV,8.0,5.76
37 | IV,8.0,7.71
38 | IV,8.0,8.84
39 | IV,8.0,8.47
40 | IV,8.0,7.04
41 | IV,8.0,5.25
42 | IV,19.0,12.5
43 | IV,8.0,5.56
44 | IV,8.0,7.91
45 | IV,8.0,6.89
46 |
--------------------------------------------------------------------------------
/ml/house-price-prediction/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | pandas
3 | scikit-learn
4 | matplotlib
5 | seaborn
6 |
--------------------------------------------------------------------------------
/ml/imbalanced-multi-classification/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | This is my revised code for the tutorial at: https://machinelearningmastery.com/multi-class-imbalanced-classification/
4 |
5 | ## Setup
6 |
7 |
8 | ```
9 | python3 -m venv venv
10 | source venv/bin/activate
11 | pip install -r requirements.txt
12 | ```
13 | Then, you can use `jupyter notebook` or VS Code (`code .`) to open the notebooks.
14 |
15 |
--------------------------------------------------------------------------------
/ml/imbalanced-multi-classification/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | imbalanced-learn
3 | matplotlib
4 | pandas
5 | scikit-learn
--------------------------------------------------------------------------------
/ml/openml-csv-arff/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | This is the pre-processing script to convert a csv to arff for openml data upload at https://www.openml.org/d/42634
4 |
5 | After the csv is created, you need to use Weka to load it, save it as an arff file, and then upload it to openml.org:
6 |
7 |
8 |
9 |
10 | NOTE: https://pypi.org/project/csv2arff/ does not work - lots of errors.
11 |
12 |
13 |
14 | ## Setup
15 |
16 | Tested with Python 3.6 via virtual environment:
17 | ```shell
18 | $ python3.6 -m venv venv
19 | $ source venv/bin/activate
20 | $ jupyter notebook
21 | ```
22 |
23 |
--------------------------------------------------------------------------------
/ml/process-mining/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | A script to convert a txt log into process-mining log format. The key point of this notebook is to show how to use a pre-defined time gap to identify user sessions (see the sketch below).
4 |
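A minimal sketch of the time-gap idea (the column names `user_id` and `timestamp` and the file name are placeholders; the real columns come from the log2csv notebook):

```
import pandas as pd

df = pd.read_csv("search_sample.csv", parse_dates=["timestamp"])  # placeholder file
df = df.sort_values(["user_id", "timestamp"])

gap = pd.Timedelta(minutes=30)  # the pre-defined time gap between sessions
time_since_prev = df.groupby("user_id")["timestamp"].diff()
# a new session starts at a user's first event or after a gap larger than the threshold
df["session_id"] = (time_since_prev.isna() | (time_since_prev > gap)).cumsum()
```
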
5 | ## Data
6 |
7 | The log file is very big; the following commands help get a small sample of the file for EDA and other tasks.
8 |
9 | ### Utility Commands
10 |
11 | Run `$ wc -l search.txt` to find out the total number of lines in the file. If the file is too big for efficient pandas analysis, you can split it into smaller files of 10,000 lines each using the GNU version of `split` (installed as `gsplit` via coreutils on macOS):
12 |
13 | ```
14 | $ brew install coreutils
15 | $ gsplit -a 4 -d -l 10000 file.txt search_
16 | ```
17 |
18 | ## Setup
19 |
20 | Tested with Python 3.6 via virtual environment:
21 | ```shell
22 | $ python3.6 -m venv venv
23 | $ source venv/bin/activate
24 | $ pip install -r requirements.txt
25 | $ jupyter notebook
26 | ```
27 |
28 | - log2csv notebook converts the text into a csv.
29 | - log-eda notebook converts the log into process mining log
30 |
--------------------------------------------------------------------------------
/ml/process-mining/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter==1.0.0
2 | pandas==0.25.2
3 | matplotlib==3.1.2
4 | seaborn==0.9.0
5 |
--------------------------------------------------------------------------------
/ml/tf-serving/.gitignore:
--------------------------------------------------------------------------------
1 | models/
--------------------------------------------------------------------------------
/ml/tf-serving/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | My revised code based on:
4 |
5 | - https://thelongrun.blog/2020/01/12/rest-api-tensorflow-serving-pt1/
6 | - https://thelongrun.blog/2020/01/26/rest-api-tensorflow-serving-pt2/
7 |
8 |
9 | # Setup
10 |
11 | Setup virtual environment and install packages:
12 | ```
13 | python3 -m venv venv
14 | source venv/bin/activate
15 | pip install -r requirements.txt
16 | ```
17 |
18 | Pull the TF Serving docker image (assuming you have installed Docker: https://docs.docker.com/get-docker/):
19 | ```
20 | docker pull tensorflow/serving:latest
21 | ```
22 |
23 | Create TF Serving servables from a tf function and a pre-trained model; two servables will be generated and saved in the `models/` folder:
24 |
25 | ```
26 | python make_servables.py
27 | ```
28 |
29 | # Start TF Serving Servers
30 |
31 | Use TF Serving to serve the two models on different host ports (8501 and 8502) by running the following commands **in the repo root folder**:
32 | ```
33 | docker run -t --rm -p 8501:8501 -v "$(pwd)/models/mobilenet_v2_test:/models/mobilenet_v2_test" -e MODEL_NAME=mobilenet_v2_test tensorflow/serving &
34 |
35 | docker run -t --rm -p 8502:8501 -v "$(pwd)/models/add_two:/models/add_two" -e MODEL_NAME=add_two tensorflow/serving &
36 | ```
37 |
38 | You should see two docker apps running:
39 |
40 |
41 |
42 | # Use REST APIs for Computing/Inference
43 |
44 | Call the `AddTwo()` function using `curl`, which will add 2 to each number in the tensor:
45 | ```
46 | curl -H "Content-Type: application/json" -d '{"instances":[1.0, 5.0, 4.0]}' http://localhost:8502/v1/models/add_two:predict
47 | ```
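
The same call can be made from Python with `requests` (a minimal sketch in the style of `client.py`, pointed at the add_two model on port 8502):

```
import json
import requests

payload = {"instances": [1.0, 5.0, 4.0]}
r = requests.post("http://localhost:8502/v1/models/add_two:predict",
                  data=json.dumps(payload))
print(r.json())  # should return {"predictions": [3.0, 7.0, 6.0]}
```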
48 |
49 | Call MobileNet classifier using `curl`:
50 | ```
51 | chmod +x client_curl.sh
52 | ./client_curl.sh ./images/animal.jpg
53 | ```
54 |
55 | Call the MobileNet classifier via Python (using `requests`):
56 |
57 | ```
58 | python client.py
59 | ```
60 |
61 |
62 |
--------------------------------------------------------------------------------
/ml/tf-serving/client.py:
--------------------------------------------------------------------------------
1 | import json
2 | import requests
3 | import base64
4 |
5 | data = {}
6 | with open('images/animal.jpg', mode='rb') as file:
7 | img = file.read()
8 | data = {"inputs":[{"b64":base64.encodebytes(img).decode("utf-8")}]}
9 |
10 | # Making the request
11 | r = requests.post("http://localhost:8501/v1/models/mobilenet_v2_test:predict", data=json.dumps(data))
12 | print(r.content)
13 | # And returns:
14 | # b'{\n "outputs": [\n "giant panda"\n ]\n}'
--------------------------------------------------------------------------------
/ml/tf-serving/client_curl.sh:
--------------------------------------------------------------------------------
1 | # $1 refers to the path where the image file is located
2 | ENCODED_IMG="$(base64 $1)"
3 | (echo '{"inputs": [{"b64": "'; echo "$ENCODED_IMG"; echo '"}]}') | curl -H "Content-Type: application/json" -d @- http://localhost:8501/v1/models/mobilenet_v2_test:predict
--------------------------------------------------------------------------------
/ml/tf-serving/images/animal.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/animal.jpg
--------------------------------------------------------------------------------
/ml/tf-serving/images/clear.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/clear.jpg
--------------------------------------------------------------------------------
/ml/tf-serving/images/ponds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/ponds.png
--------------------------------------------------------------------------------
/ml/tf-serving/make_servables.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_hub as hub
3 |
4 |
5 | class AddTwo(tf.Module):
6 | @tf.function(input_signature=[tf.TensorSpec(shape=[None, 3], dtype=tf.float32, name='x')])
7 | def add_two(self, x):
8 | return x + 2
9 |
10 |
11 | class CustomMobileNet(tf.keras.Model):
12 | model_handler = "https://tfhub.dev/google/imagenet/mobilenet_v2_035_224/classification/4"
13 |
14 | def __init__(self):
15 | super(CustomMobileNet, self).__init__()
16 | self.model = hub.load(self.__class__.model_handler)
17 | self.labels = None
18 |
19 | @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.string)])
20 | def call(self, input_img):
21 | def _preprocess(img_file):
22 | img_bytes = tf.reshape(img_file, [])
23 | img = tf.io.decode_jpeg(img_bytes, channels=3)
24 | img = tf.image.convert_image_dtype(img, tf.float32)
25 | return tf.image.resize(img, (224, 224))
26 |
27 | labels = tf.io.read_file(self.labels)
28 | labels = tf.strings.split(labels, sep='\n')
29 | img = _preprocess(input_img)[tf.newaxis,:]
30 | logits = self.model(img)
31 | get_class = lambda x: labels[tf.argmax(x)]
32 | class_text = tf.map_fn(get_class, logits, fn_output_signature=tf.string)
33 | return class_text  # predicted class label text
34 |
35 | # create a servable from a tf function
36 | tf_func_servable = AddTwo()
37 | tf.saved_model.save(tf_func_servable, "models/add_two/1")
38 |
39 | # create a servable from a pre-trained model downloaded from tf hub
40 | tf_model_servable = CustomMobileNet()
41 | tf_model_servable.labels = tf.saved_model.Asset("data/ImageNetLabels.txt") # save labels txt as an asset
42 | tf.saved_model.save(tf_model_servable, "models/mobilenet_v2_test/1/")
43 |
--------------------------------------------------------------------------------
/ml/tf-serving/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=2.3.1
2 | tensorflow_hub>=0.10.0
--------------------------------------------------------------------------------
/ml/tfidf-bm25/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | tfidf and bm25 examples for document retrieval using the Cranfield dataset
4 |
5 | "The Cranfield collection. This was the pioneering test collection in allowing precise quantitative measures of information retrieval effectiveness, but is nowadays too small for anything but the most elementary pilot experiments. Collected in the United Kingdom starting in the late 1950s, it contains 1398 abstracts of aerodynamics journal articles, a set of 225 queries, and exhaustive relevance judgments of all (query, document) pairs." -
6 | https://nlp.stanford.edu/IR-book/html/htmledition/standard-test-collections-1.html
7 |
8 | ## Setup
9 |
10 | ```
11 | $ python3 -m venv venv
12 | $ source venv/bin/activate
13 | $ pip install -r requirements.txt
14 | ```
15 | ## Data
16 |
17 | The data is in `data` folder in JSON format:
18 | - `cranfield_docs.json`: information about 1400 documents, which are abstracts from papers related to Aeronautics with information about author, bibliography, body (abstract), title:
19 | ```
20 | {
21 |
22 | "id" : 1,
23 | "author" : "brenckman,m.",
24 | "bibliography" : "j. ae. scs. 25, 1958, 324.",
25 | "body" : "experimental investigation of the aerodynamics of a wing in a slipstream . an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios . the results were intended in part as an evaluation basis for different theoretical treatments of this problem . the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary-layer-control effect . the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory . an empirical evaluation of the destalling effects was made for the specific configuration of the experiment .",
26 | "title" : "experimental investigation of the aerodynamics of a wing in a slipstream ."
27 |
28 | }
29 | ```
30 | - `cranfield_queries.json`: 225 queries representing users' information need.
31 | ```
32 | {
33 | "query_id": 1,
34 | "query": "what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft ."
35 | }
36 | ```
37 |
38 | - `cranfield_relevance.json`: the relevance score (1, 2, 3, or 4, with 1 being the highest relevance) of each query and related documents.
39 | ```
40 | {"query_id": "1", "r_score": 2, "doc_id": "184"},
41 | {"query_id": "2", "r_score": 1, "doc_id": "12"},
42 | ```
43 | - 1 : the document is the complete answer to the query
44 | - 2 : the document has a high degree of relevance to the query
45 | - 3 : the document is useful to the query as general background information
46 | - 4 : the document is of minimum interest to the query
47 |
48 |
49 |
50 | ## Evaluation Metrics
51 |
52 | Precision and Recall are used in the examples. See https://nlp.stanford.edu/IR-book/html/htmledition/information-retrieval-system-evaluation-1.html for more evaluation metrics.
53 |
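Not part of the original notebooks, but a minimal sketch of how `rank-bm25` (listed in requirements.txt) scores documents against a query; the two documents below are made-up stand-ins for tokenized Cranfield abstracts:

```
from rank_bm25 import BM25Okapi

corpus = [
    "experimental investigation of the aerodynamics of a wing in a slipstream",
    "simple shear flow past a flat plate in an incompressible fluid",
]
tokenized_corpus = [doc.split() for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)
query = "wing slipstream aerodynamics".split()

print(bm25.get_scores(query))              # one BM25 score per document
print(bm25.get_top_n(query, corpus, n=1))  # the best-matching document text
```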
--------------------------------------------------------------------------------
/ml/tfidf-bm25/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab>=2.2.9
2 | matplotlib>=3.2.1
3 | nltk>=3.5
4 | pandas>=1.0.3
5 | scikit-learn>=0.22.2
6 | rank-bm25
7 |
--------------------------------------------------------------------------------
/ml/topic-modeling/.gitignore:
--------------------------------------------------------------------------------
1 | /data
2 |
--------------------------------------------------------------------------------
/ml/topic-modeling/README.md:
--------------------------------------------------------------------------------
1 | ## About
2 |
3 | My revised code for
4 |
5 | https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
6 | https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
7 |
8 | and
9 |
10 | LDA from Scratch
11 | My revised tutorial based on https://www.depends-on-the-definition.com/lda-from-scratch/
12 |
13 | I also found another similar tutorial at https://gist.github.com/umbertogriffo/5041b9e4ec6c3478cef99b8653530032
14 |
15 | ## Setup
16 |
17 | within the tutorial folder:
18 |
19 | ```
20 | python3 -m venv venv
21 | source venv/bin/activate
22 | pip install -r requirements.txt
23 | ```
24 | Then, you can use `jupyter notebook` or VS Code (`code .`) to open the notebooks.
25 |
26 |
--------------------------------------------------------------------------------
/ml/topic-modeling/requirements.txt:
--------------------------------------------------------------------------------
1 | gensim
2 | jupyter
3 | matplotlib
4 | nltk
5 | pandas
6 | scikit-learn
--------------------------------------------------------------------------------
/ml/tweet-sentiment-analysis/README.md:
--------------------------------------------------------------------------------
1 | ## Tweet Sentiment Analysis with Python 3
2 |
3 | This is my revision of the tutorial at https://dev.to/rodolfoferro/sentiment-analysis-on-trumpss-tweets-using-python - many thanks to the author. The original repo is at https://github.com/RodolfoFerro/pandas_twitter
4 |
5 | The original author provides a markdown version of his tutorial. I combined all the files into one (tutorial.md) and created an English version of the Jupyter notebook (the author only had a Spanish version).
6 |
7 | ## Summary
8 | **Data**: 200 Tweets from Donald Trump: https://twitter.com/realDonaldTrump
9 |
10 | **Goal**: Conduct a sentiment analysis of the tweets with sample result:
11 |
12 | - Percentage of positive tweets: 53.5%
13 | - Percentage of neutral tweets: 23.0%
14 | - Percentage of negative tweets: 23.5%
15 |
16 | Python packages used: jupyter, pandas, numpy, tweepy, textblob
17 |
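As a rough sketch (the thresholds are illustrative and may differ slightly from the notebook), textblob's polarity score maps to these three labels like this:

```
from textblob import TextBlob

def classify(tweet):
    polarity = TextBlob(tweet).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

print(classify("Their sandwich specials looked great."))  # likely 'positive'
```
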
18 | ## API Keys
19 | Change the API keys for Twitter: in order to extract tweets for later analysis, we need to access our Twitter account and create an app. The website to do this is https://apps.twitter.com/. (If you don't know how to do this, you can follow this tutorial video https://www.youtube.com/watch?v=BOA7SD_09Qk to create an account and an application.)
20 |
21 |
22 | - Consumer Key (API Key)
23 | - Consumer Secret (API Secret)
24 | - Access Token
25 | - Access Token Secret
26 |
27 | **You should never put your real API keys in the code and push them to GitHub.** We use local environment variables for the API keys:
28 |
29 | ```
30 | # Get the API key from local environment variable
31 | CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY')
32 | CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET')
33 | ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN')
34 | ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET')
35 | ```
36 |
37 | You need to add the following lines to the `~/.bash_profile` file
38 | ```
39 | export TWITTER_CONSUMER_KEY='yourealkey'
40 | export TWITTER_CONSUMER_SECRET='yourealkey'
41 | export TWITTER_ACCESS_TOKEN='yourealkey'
42 | export TWITTER_ACCESS_SECRET='yourealkey'
43 | ```
44 |
45 | then use `vim` to edit the file, `source` to execute it, and `env` to double-check:
46 |
47 | ```
48 | $ vim ~/.bash_profile
49 | $ source ~/.bash_profile
50 | $ env
51 | ```
52 | **NOTE: You may need to close the Terminal window and restart it for Jupyter Notebook to read the new variables you just added.**
53 |
54 | ## Setup
55 |
56 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages:
57 |
58 | ```
59 | $ cd path_to_this_folder
60 | $ virtualenv -p python3 venv
61 | $ source venv/bin/activate
62 | $ pip3 install -r requirements.txt
63 | ```
64 |
65 | Run `$ jupyter notebook` to go over the tutorial step-by-step.
66 |
--------------------------------------------------------------------------------
/ml/tweet-sentiment-analysis/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | pandas
3 | numpy
4 | tweepy
5 | textblob
6 |
--------------------------------------------------------------------------------
/other/chinese-to-pinyin/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | .DS_Store
104 |
105 | # PyCharm project settings
106 | .idea
107 |
--------------------------------------------------------------------------------
/other/chinese-to-pinyin/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | A script to convert Chinese folder names to pinyin. Revised according to: http://sunzhen.blogspot.com/2016/05/rename-chinese-filenames-to-pinyin.html
4 |
5 |
6 | ## Setup
7 |
8 | This script converts all file and folder names in the subfolder "data" into pinyin.
9 |
10 | If different Unicode characters map to the same pinyin, e.g., "利" and "立" both map to "li", a "1" is appended to the filename.
11 |
12 | Tested with Python 3.6 via virtual environment:
13 | ```shell
14 | $ python3.6 -m venv venv
15 | $ source venv/bin/activate
16 | $ python ch-to-pinyin.py
17 | ```
18 |
19 | ## An Example
20 |
21 | Before:
22 |
23 |
24 |
25 | After:
26 |
27 |
28 |
--------------------------------------------------------------------------------
/other/chinese-to-pinyin/ch-to-pinyin.py:
--------------------------------------------------------------------------------
1 | # renameCH2Pinyin.py
2 | # Rename filename from Chinese characters to capitalized pinyin using the
3 | # mapping file and taking out the tone numbers
4 |
5 | import os
6 | import re
7 |
8 | # File uni2pinyin is a mapping from hex to Pinyin with a tone number
9 | f = open('uni2pinyin')
10 | wf = f.read() # read the whole mapping file
11 |
12 | os.chdir('data') # to rename all files in sub folder 'data'
13 | filename_list = os.listdir(u'.') # read all file names in unicode mode
14 | print(filename_list)
15 | for filename_unicode in filename_list: # each file name
16 | filename_pinyin = ''
17 | for c in filename_unicode: # each character
18 | if 0x4e00 <= ord(c) <= 0x9fff: # Chinese Character Unicode range
19 | hexCH = (hex(ord(c))[2:]).upper() # strip leading '0x' and change
20 | # to uppercase
21 | p = re.compile(hexCH + r'\t([a-z]+)[\d]*') # define the match pattern
22 | mp = p.search(wf)
23 | filename_pinyin+=mp.group(1).title() # get the pinyin without the tone
24 | # number and capitalize it
25 | else:
26 | filename_pinyin+=c
27 | print(filename_unicode, filename_pinyin)
28 |
29 | latest_filename_list = os.listdir(u'.')
30 | while filename_pinyin in latest_filename_list:
31 | filename_pinyin= filename_pinyin + '1'
32 | print(filename_pinyin)
33 | os.rename(filename_unicode, filename_pinyin)
34 | os.chdir('..') # go back to the parent folder
35 |
--------------------------------------------------------------------------------
/other/chinese-to-pinyin/data/.DS_Store11:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store11
--------------------------------------------------------------------------------
/other/chinese-to-pinyin/data/.DS_Store111:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store111
--------------------------------------------------------------------------------
/other/chinese-to-pinyin/data/.DS_Store111111:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store111111
--------------------------------------------------------------------------------
/other/chinese-to-pinyin/data/.DS_Store1111111:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store1111111
--------------------------------------------------------------------------------
/other/chinese-to-pinyin/data/白/0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/白/0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg
--------------------------------------------------------------------------------
/other/color-palette/test-palette.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/color-palette/test-palette.png
--------------------------------------------------------------------------------
/other/color-palette/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/color-palette/test.png
--------------------------------------------------------------------------------
/other/csv-to-bert-text/README.md:
--------------------------------------------------------------------------------
1 | # About
2 |
3 | A script to convert a csv file into multiple txt files based on a label column.
4 |
5 | ## Data
6 |
7 | ```
8 | 1 The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor. 5 POS
9 | 2 The food was definitely more "upscale" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas. 4 NEG
10 | 3 Another small bonus was the hard-to-find Sol beer , which was great with a lime. 4 POS
11 | 4 Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!. 5 NEU
12 | ```
13 | ## Setup
14 |
15 | Tested with Python 3.6 via virtual environment:
16 | ```shell
17 | $ python3.6 -m venv venv
18 | $ source venv/bin/activate
19 | $ pip install -r requirements.txt
20 | $ jupyter notebook
21 | ```
22 |
--------------------------------------------------------------------------------
/other/csv-to-bert-text/csv-to-txt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[27]:
5 |
6 |
7 | import pandas as pd
8 | import os
9 | import shutil
10 |
11 |
12 | # In[20]:
13 |
14 |
15 | sent = pd.read_csv("real.csv", encoding='utf8')
16 | sent
17 |
18 |
19 | # In[21]:
20 |
21 |
22 | path = os.getcwd()
23 | print ("The current working directory is %s" % path)
24 |
25 |
26 | # In[32]:
27 |
28 |
29 | # clean data folder and create new folders
30 | shutil.rmtree('./data')
31 | os.makedirs("./data/pos")
32 | os.makedirs("./data/neg")
33 | os.makedirs("./data/neu")
34 |
35 |
36 | # In[16]:
37 |
38 |
39 | def write_sent(label, id, sent):
40 | filename = "./data/"+ label +"/" + id +".txt"
41 | file = open(filename,"w")
42 | file.writelines(sent)
43 | file.close()
44 |
45 |
46 | # In[17]:
47 |
48 |
49 | for index, row in sent.iterrows():
50 | write_sent(row['SentiLabel_food'].lower(), str(row['SentenceID']), row['Sentences'])
51 | print("writing sentence")
52 |
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/neg/43.txt:
--------------------------------------------------------------------------------
1 | I was slightly saddened that either they do not offer chalula here or that they ran out.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/neg/44.txt:
--------------------------------------------------------------------------------
1 | Sometimes I feel that breakfast is just not complete without it and hope that they can stock up because that louisiana hot sauce they carried did not cut it.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/neu/29.txt:
--------------------------------------------------------------------------------
1 | Food was ok ,
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/1.txt:
--------------------------------------------------------------------------------
1 | The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/2.txt:
--------------------------------------------------------------------------------
1 | The food was definitely more "upscale" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/20.txt:
--------------------------------------------------------------------------------
1 | We ordered 2 appetizers , 2 entrees , 2 bottles of wine... food was very good not great.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/3.txt:
--------------------------------------------------------------------------------
1 | Another small bonus was the hard-to-find Sol beer , which was great with a lime.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/4.txt:
--------------------------------------------------------------------------------
1 | Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/40.txt:
--------------------------------------------------------------------------------
1 | The food came out in about 10 minutes and my skillet was cooked very well.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/41.txt:
--------------------------------------------------------------------------------
1 | The sunny side up eggs here are some of the best I have ever had- they were slightly chewy and full of flavor.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/42.txt:
--------------------------------------------------------------------------------
1 | My skillet came with cheddar cheese mushroom , broccoli , and tomtaoes and it was a hearty meal that , when combined with ketchup and salt , delivered very satisfying feelings to my tastebuds and stomach.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/46.txt:
--------------------------------------------------------------------------------
1 | They were hot and had a soft , spongy texture.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/47.txt:
--------------------------------------------------------------------------------
1 | Pretty delicious and satisfied my sweet tooth.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/5.txt:
--------------------------------------------------------------------------------
1 | After shopping around , my husband and I both think they have the best pizza around Highland Park.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/50.txt:
--------------------------------------------------------------------------------
1 | Their sandwich specials looked great.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/data/pos/8.txt:
--------------------------------------------------------------------------------
1 | It's a homey place with good food.
--------------------------------------------------------------------------------
/other/csv-to-bert-text/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter==1.0.0
2 | pandas==0.25.2
3 |
--------------------------------------------------------------------------------
/other/csv-to-bert-text/sample.csv:
--------------------------------------------------------------------------------
1 | SentenceID,Sentences,SentiScore_food,SentiLabel_food
2 | 1,"The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor.",5,POS
3 | 2,"The food was definitely more ""upscale"" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas.",4,POS
4 | 3,"Another small bonus was the hard-to-find Sol beer , which was great with a lime.",4,POS
5 | 4,"Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!.",5,POS
6 | 5,"After shopping around , my husband and I both think they have the best pizza around Highland Park.",5,POS
7 | 8,It's a homey place with good food.,4,POS
8 | 20,"We ordered 2 appetizers , 2 entrees , 2 bottles of wine... food was very good not great.",4,POS
9 | 29,"Food was ok , ",3,NEU
10 | 40,The food came out in about 10 minutes and my skillet was cooked very well.,5,POS
11 | 41,The sunny side up eggs here are some of the best I have ever had- they were slightly chewy and full of flavor.,5,POS
12 | 42,"My skillet came with cheddar cheese mushroom , broccoli , and tomtaoes and it was a hearty meal that , when combined with ketchup and salt , delivered very satisfying feelings to my tastebuds and stomach.",5,POS
13 | 43,I was slightly saddened that either they do not offer chalula here or that they ran out.,2,NEG
14 | 44,Sometimes I feel that breakfast is just not complete without it and hope that they can stock up because that louisiana hot sauce they carried did not cut it.,2,NEG
15 | 46,"They were hot and had a soft , spongy texture.",4,POS
16 | 47,Pretty delicious and satisfied my sweet tooth.,4,POS
17 | 50,Their sandwich specials looked great.,5,POS
--------------------------------------------------------------------------------
/other/list-like-to-list/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/other/list-like-to-list/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/other/list-like-to-list/.idea/movie-genre.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/other/list-like-to-list/README.md:
--------------------------------------------------------------------------------
1 | # Intro
2 | This is a small program (`movie.py`) that does some data transformation on movie/genre data.
3 |
4 | The key challenge is converting the following "list-like" string into a real list:
5 |
6 | ```
7 | "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"
8 | ```
9 |
10 | `ast.literal_eval` does the trick: https://docs.python.org/2/library/ast.html
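
For example, a minimal sketch using the string shown above (the variable names are just for illustration):

```
import ast

genre_str = "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"
genres = ast.literal_eval(genre_str)  # safely evaluates the string into a real list of dicts
print(genres[0]['name'])  # Animation
```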
11 |
12 | Another useful trick is converting a list of lists:
13 |
14 | ```
15 | [[862, 16], [862, 35], [862, 10751], [8844, 12]]
16 | ```
17 | into a csv file:
18 |
19 | ```
20 | 862,16
21 | 862,35
22 | 862,10751
23 | 8844,12
24 | ```
25 | pandas makes it easy:
26 | ```
27 | my_df = pd.DataFrame(my_list)
28 | my_df.to_csv('my_csv.csv', index=False, header=False)
29 | ```
30 |
31 | # Run
32 |
33 | Python 2.x
34 |
35 | - create a virtual environment: `$ virtualenv venv`
36 | - activate the virtual environment: `$ source venv/bin/activate`
37 | - install the required packages: `$ pip install -r requirements.txt`
38 |
--------------------------------------------------------------------------------
/other/list-like-to-list/movie.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import ast
3 | import pandas as pd
4 |
5 | # read the input csv into a list of rows;
6 | # each row looks like: [movie_id, "[{'id': 16, 'name': 'Animation'}, ...]"]
7 | with open('input.csv', 'rb') as f:
8 |     reader = csv.reader(f)
9 |     movies = list(reader)
10 |
11 | # flatten each movie into one [movie_id, genre_id] pair per genre
12 | my_list = []
13 | for movie in movies:
14 |     # the genre column is a "list-like" string; literal_eval turns it into a real list of dicts
15 |     for genre in ast.literal_eval(movie[1]):
16 |         my_list.append([int(movie[0]), genre['id']])
17 |
18 | print(my_list)
19 |
20 | # write the pairs to a csv file without index or header
21 | my_df = pd.DataFrame(my_list)
22 | my_df.to_csv('movie_genre.csv', index=False, header=False)
23 | print('See result in movie_genre.csv')
--------------------------------------------------------------------------------
/other/list-like-to-list/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==0.21.0
2 |
--------------------------------------------------------------------------------
/other/list-of-dicts-to-columns/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/list-of-dicts-to-columns/README.md
--------------------------------------------------------------------------------
/other/list-of-dicts-to-columns/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | pandas
--------------------------------------------------------------------------------
/other/screenshot-gif-generation/.gitignore:
--------------------------------------------------------------------------------
1 | /screenshots/*.jpg
2 | /screenshots/*.gif
3 |
--------------------------------------------------------------------------------
/other/screenshot-gif-generation/README.md:
--------------------------------------------------------------------------------
1 | # Generate Screenshots and GIFs via Python
2 |
3 | The code is adapted from:
4 | - https://blog.csdn.net/qq_38161040/article/details/91040640
5 | - https://medium.com/swlh/python-animated-images-6a85b9b68f86
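6 |
7 | Note: `pygifsicle` is a Python wrapper around the `gifsicle` command-line tool, so `gifsicle` itself needs to be installed separately (e.g., `brew install gifsicle` on macOS or `apt-get install gifsicle` on Debian/Ubuntu) before the GIF optimization step will work; the Python packages are listed in `requirements.txt`.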
--------------------------------------------------------------------------------
/other/screenshot-gif-generation/gif-generation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "language_info": {
4 | "codemirror_mode": {
5 | "name": "ipython",
6 | "version": 3
7 | },
8 | "file_extension": ".py",
9 | "mimetype": "text/x-python",
10 | "name": "python",
11 | "nbconvert_exporter": "python",
12 | "pygments_lexer": "ipython3",
13 | "version": "3.7.7-final"
14 | },
15 | "orig_nbformat": 2,
16 | "kernelspec": {
17 | "name": "python_defaultSpec_1600265837621",
18 | "display_name": "Python 3.7.7 64-bit ('venv': venv)"
19 | }
20 | },
21 | "nbformat": 4,
22 | "nbformat_minor": 2,
23 | "cells": [
24 | {
25 | "source": [
26 | "# Generate Screenshots and GIFs via Python\n",
27 | "\n",
28 | "The code is adapted from:\n",
29 | "- https://blog.csdn.net/qq_38161040/article/details/91040640\n",
30 | "- https://medium.com/swlh/python-animated-images-6a85b9b68f86"
31 | ],
32 | "cell_type": "markdown",
33 | "metadata": {}
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 27,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from PIL import ImageGrab\n",
42 | "from PIL import Image\n",
43 | "import time"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 28,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "# take a screenshot every 0.1 second, 10 jpg saved\n",
53 | "total_images = 10 # total screenshots\n",
54 | "interval = 0.1 # the interval to take a screenshot\n",
55 | "resize_ratio = 0.3 # the resize ratio to keep the screenshot smaller\n",
56 | "\n",
57 | "for i in range(total_images):\n",
58 | " time.sleep(interval)\n",
59 | " img = ImageGrab.grab()\n",
60 | " width = img.size[0]\n",
61 | " height = img.size[1]\n",
62 | "\n",
63 | " img = img.resize(\n",
64 | " (int(width*resize_ratio), int(height*resize_ratio)), \n",
65 | " Image.LANCZOS)  # Image.ANTIALIAS was removed in newer Pillow versions\n",
66 | " \n",
67 | " img = img.convert('RGB') # if save to jpg\n",
68 | " img.save(f'./screenshots/screenshot{str(i+1)}.jpg')"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 29,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "# generate the gif\n",
78 | "import imageio\n",
79 | "\n",
80 | "gif_images = []\n",
81 | "for i in range(total_images):\n",
82 | " gif_images.append(imageio.imread(f'./screenshots/screenshot{str(i+1)}.jpg'))\n",
83 | "\n",
84 | "imageio.mimsave(\"./screenshots/screenshot.gif\", gif_images, fps=5)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 30,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "# reduce the gif file size\n",
94 | "from pygifsicle import optimize\n",
95 | "\n",
96 | "gif_original = './screenshots/screenshot.gif'\n",
97 | "\n",
98 | "# create a new optimized gif from the original one\n",
99 | "optimize(gif_original, './screenshots/screenshot_optimized.gif')\n",
100 | "\n",
101 | "# overwrite the original one if needed\n",
102 | "# optimize(gif_original)"
103 | ]
104 | }
105 | ]
106 | }
--------------------------------------------------------------------------------
/other/screenshot-gif-generation/git-gen.py:
--------------------------------------------------------------------------------
1 | # Generate Screenshots and GIFs via Python
2 | #
3 | # The code is adapted from:
4 | # - https://blog.csdn.net/qq_38161040/article/details/91040640
5 | # - https://medium.com/swlh/python-animated-images-6a85b9b68f86
6 | # pip install the required packages first: imageio, pillow, pygifsicle (see requirements.txt)
7 |
8 | # %%
9 | from PIL import ImageGrab
10 | from PIL import Image
11 | import time
12 |
13 |
14 | # %%
15 | # take a screenshot every 0.1 second, 10 jpg saved
16 | total_images = 10 # total screenshots
17 | interval = 0.1 # the interval to take a screenshot
18 | resize_ratio = 0.3 # the resize ratio to keep the screenshot smaller
19 |
20 | for i in range(total_images):
21 | time.sleep(interval)
22 | img = ImageGrab.grab()
23 | width = img.size[0]
24 | height = img.size[1]
25 |
26 | img = img.resize(
27 | (int(width*resize_ratio), int(height*resize_ratio)),
28 |         Image.LANCZOS)  # Image.ANTIALIAS was removed in newer Pillow versions
29 |
30 | img = img.convert('RGB') # if save to jpg
31 | img.save(f'./screenshots/screenshot{str(i+1)}.jpg')
32 |
33 |
34 | # %%
35 | import imageio
36 |
37 | gif_images = []
38 | for i in range(total_images):
39 | gif_images.append(imageio.imread(f'./screenshots/screenshot{str(i+1)}.jpg'))
40 |
41 | imageio.mimsave("./screenshots/screenshot.gif", gif_images, fps=5)
42 |
43 |
44 | # %%
45 | from pygifsicle import optimize
46 |
47 | gif_original = './screenshots/screenshot.gif'
48 |
49 | # create a new optimized gif from the original one
50 | optimize(gif_original, './screenshots/screenshot_optimized.gif')
51 |
52 | # overwrite the original one if needed
53 | # optimize(gif_original)
54 |
55 |
56 |
--------------------------------------------------------------------------------
/other/screenshot-gif-generation/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | imageio
3 | pillow
4 | pygifsicle
--------------------------------------------------------------------------------
/other/screenshot-gif-generation/screenshots/screenshot-folder.md:
--------------------------------------------------------------------------------
1 | temp files are saved in this folder
--------------------------------------------------------------------------------