├── .gitignore
├── LICENSE
├── README.md
├── ai
│   ├── autogen
│   │   ├── README.md
│   │   ├── autogen-code-execution.ipynb
│   │   ├── autogen-tools.ipynb
│   │   ├── autogent-tutorial.ipynb
│   │   ├── docker-example.ipynb
│   │   └── requirements.txt
│   ├── langchain
│   │   └── langchain-rag-basics
│   │       ├── basics.ipynb
│   │       ├── data
│   │       │   ├── chroma
│   │       │   │   ├── 39b238f5-b82f-42ff-a683-cc8d5aea4747
│   │       │   │   │   ├── data_level0.bin
│   │       │   │   │   ├── header.bin
│   │       │   │   │   ├── length.bin
│   │       │   │   │   └── link_lists.bin
│   │       │   │   └── chroma.sqlite3
│   │       │   ├── getting-real
│   │       │   │   ├── getting-real-01-introduction.pdf
│   │       │   │   ├── getting-real-02-starting-line.pdf
│   │       │   │   ├── getting-real-03-stay-lean.pdf
│   │       │   │   ├── getting-real-04-priorities.pdf
│   │       │   │   ├── getting-real-05-feature-selection.pdf
│   │       │   │   ├── getting-real-06-process.pdf
│   │       │   │   ├── getting-real-07-organization.pdf
│   │       │   │   ├── getting-real-08-staffing.pdf
│   │       │   │   ├── getting-real-09-interface-design.pdf
│   │       │   │   ├── getting-real-10-code.pdf
│   │       │   │   ├── getting-real-11-words.pdf
│   │       │   │   ├── getting-real-12-pricing-signup.pdf
│   │       │   │   ├── getting-real-13-promotion.pdf
│   │       │   │   ├── getting-real-14-support.pdf
│   │       │   │   ├── getting-real-15-post-launch.pdf
│   │       │   │   └── getting-real-full.pdf
│   │       │   └── nba-rules-2023.pdf
│   │       ├── qa.ipynb
│   │       ├── rag-retrieval.ipynb
│   │       └── requirements.txt
│   └── litellm
│       ├── README.md
│       ├── deepseek_example.py
│       └── requirements.txt
├── ds
│   ├── ab-testing
│   │   ├── README.md
│   │   ├── Walkthrough.ipynb
│   │   ├── ab-testing-math.ipynb
│   │   ├── requirements.txt
│   │   └── utils
│   │       ├── data.py
│   │       ├── plot.py
│   │       └── stats.py
│   ├── airflow
│   │   ├── README.md
│   │   ├── requirements.txt
│   │   └── simple_bash_dag.py
│   ├── aws-pyspark
│   │   ├── README.md
│   │   └── emr_bootstrap.sh
│   ├── cohort-analysis
│   │   ├── README.md
│   │   ├── cohort-analysis.ipynb
│   │   └── requirements.txt
│   ├── dask
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── dask-array.ipynb
│   │   ├── dask-big-dataset.ipynb
│   │   ├── dask-intro.ipynb
│   │   ├── dask-taxi.ipynb
│   │   ├── dask-worker-space
│   │   │   ├── global.lock
│   │   │   ├── purge.lock
│   │   │   ├── worker-3y9yh5wc.dirlock
│   │   │   ├── worker-5u5lbrxx.dirlock
│   │   │   ├── worker-82zb8rgu.dirlock
│   │   │   ├── worker-9wl7s6m3.dirlock
│   │   │   ├── worker-_n7kuuyd.dirlock
│   │   │   ├── worker-bbjm31ih.dirlock
│   │   │   ├── worker-fwxxmool.dirlock
│   │   │   ├── worker-l28a891y.dirlock
│   │   │   ├── worker-l8y7v2oj.dirlock
│   │   │   ├── worker-lckuq0ub.dirlock
│   │   │   ├── worker-ofkwc26n.dirlock
│   │   │   └── worker-wuu54xyo.dirlock
│   │   ├── mydask.png
│   │   └── requirements.txt
│   ├── data-driven-growth
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── know-your-metrics.ipynb
│   │   ├── requirements.txt
│   │   └── utils
│   │       ├── data.py
│   │       ├── plot.py
│   │       └── stats.py
│   ├── diff-in-diff
│   │   ├── Panel101.dta
│   │   ├── README.md
│   │   ├── did-min-wage.ipynb
│   │   ├── did-panel101.ipynb
│   │   ├── mini-wage.csv
│   │   ├── mini-wage.dat
│   │   ├── panel101.csv
│   │   └── requirements.txt
│   ├── dvc
│   │   ├── .gitignore
│   │   └── README.md
│   ├── hypo-testing
│   │   ├── README.md
│   │   ├── blood-pressure.csv
│   │   ├── chi-test.csv
│   │   ├── crop-yield.csv
│   │   ├── hypo-testing.ipynb
│   │   ├── plant-growth.csv
│   │   └── requirements.txt
│   ├── inside-airbnb
│   │   ├── .idea
│   │   │   ├── inside-airbnb.iml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   └── workspace.xml
│   │   ├── README.md
│   │   ├── add-columns.py
│   │   ├── data
│   │   │   ├── nyc-listings.csv
│   │   │   └── nyc-listings_new.csv
│   │   ├── get-one-photo.py
│   │   ├── get-photos.py
│   │   └── requirements.txt
│   ├── matplotlib
│   │   ├── README.md
│   │   ├── grouped-bar-plot-with-precentage-change-matplotlib.ipynb
│   │   └── requirements.txt
│   ├── multi-armed-bandit
│   │   ├── README.md
│   │   ├── mab.ipynb
│   │   └── requirements.txt
│   ├── pymongo
│   │   ├── README.md
│   │   ├── pymongo.ipynb
│   │   └── requirements.txt
│   ├── seaborn
│   │   ├── README.md
│   │   ├── pokemon.csv
│   │   ├── requirements.txt
│   │   └── seaborn_basics.ipynb
│   ├── spark-basics
│   │   ├── datacamp-notes.md
│   │   └── datacamp-spark.ipynb
│   ├── statsmodels-tutorial
│   │   ├── README.md
│   │   ├── lr-python.ipynb
│   │   ├── requirements.txt
│   │   ├── statsmodels.ipynb
│   │   └── statsmodels_getstarted.ipynb
│   ├── streamlit
│   │   ├── README.md
│   │   ├── airbnb.py
│   │   ├── listings.csv
│   │   └── requirements.txt
│   ├── superset
│   │   └── README.md
│   ├── time-series-additive-model
│   │   ├── README.md
│   │   ├── additive_models.ipynb
│   │   ├── data
│   │   │   ├── Workbook1.xlsx
│   │   │   ├── gm_sales.csv
│   │   │   ├── gm_sales.xlsx
│   │   │   ├── recessions.csv
│   │   │   ├── recessions.xlsx
│   │   │   └── tesla_search_terms.csv
│   │   └── requirements.txt
│   └── time-series-basics
│       ├── README.md
│       ├── data
│       │   └── opsd_germany_daily.csv
│       ├── requirements.txt
│       └── time_series_basics.ipynb
├── ml
│   ├── attention
│   │   ├── README.md
│   │   ├── attention_explained.ipynb
│   │   └── requirements.txt
│   ├── autogluon
│   │   ├── README.md
│   │   ├── agModels-predictClass
│   │   │   ├── learner.pkl
│   │   │   ├── models
│   │   │   │   ├── CatBoost
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── ExtraTreesEntr
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── ExtraTreesGini
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── KNeighborsDist
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── KNeighborsUnif
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── LightGBM
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── LightGBMLarge
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── LightGBMXT
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── NeuralNetFastAI
│   │   │   │   │   ├── model-internals.pkl
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── RandomForestEntr
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── RandomForestGini
│   │   │   │   │   └── model.pkl
│   │   │   │   ├── WeightedEnsemble_L2
│   │   │   │   │   ├── model.pkl
│   │   │   │   │   └── utils
│   │   │   │   │       ├── model_template.pkl
│   │   │   │   │       └── oof.pkl
│   │   │   │   ├── XGBoost
│   │   │   │   │   └── model.pkl
│   │   │   │   └── trainer.pkl
│   │   │   ├── predictor.pkl
│   │   │   └── utils
│   │   │       └── data
│   │   │           ├── X.pkl
│   │   │           ├── X_val.pkl
│   │   │           ├── y.pkl
│   │   │           └── y_val.pkl
│   │   ├── autogluon.ipynb
│   │   ├── housing-prediction.ipynb
│   │   ├── input
│   │   │   ├── anscombe.csv
│   │   │   └── housing.csv
│   │   └── requirements.txt
│   ├── clearml-server
│   │   └── README.md
│   ├── clearml
│   │   ├── README.md
│   │   ├── matplotlib
│   │   │   ├── Allegro_Trains_matplotlib_example.ipynb
│   │   │   ├── matplotlib_example.py
│   │   │   ├── mlp_grouped_errorbar.py
│   │   │   └── requirements.txt
│   │   ├── pytorch
│   │   │   ├── manual_model_upload.py
│   │   │   ├── notebooks
│   │   │   │   ├── audio
│   │   │   │   │   ├── README.md
│   │   │   │   │   ├── audio_classifier_UrbanSound8K.ipynb
│   │   │   │   │   └── audio_preprocessing_example.ipynb
│   │   │   │   ├── image
│   │   │   │   │   ├── hyperparameter_search.ipynb
│   │   │   │   │   └── image_classification_CIFAR10.ipynb
│   │   │   │   ├── table
│   │   │   │   │   ├── download_and_preprocessing.ipynb
│   │   │   │   │   ├── download_and_split.ipynb
│   │   │   │   │   ├── pick_best_model.ipynb
│   │   │   │   │   ├── preprocessing_and_encoding.ipynb
│   │   │   │   │   ├── tabular_ml_pipeline.ipynb
│   │   │   │   │   └── train_tabular_predictor.ipynb
│   │   │   │   └── text
│   │   │   │       └── text_classification_AG_NEWS.ipynb
│   │   │   ├── pytorch_distributed_example.py
│   │   │   ├── pytorch_matplotlib.py
│   │   │   ├── pytorch_mnist.py
│   │   │   ├── pytorch_tensorboard.py
│   │   │   ├── pytorch_tensorboardx.py
│   │   │   ├── requirements.txt
│   │   │   └── tensorboard_toy_pytorch.py
│   │   ├── requirements.txt
│   │   ├── scikit-learn
│   │   │   ├── model-harry.pkl
│   │   │   ├── model.pkl
│   │   │   ├── requirements.txt
│   │   │   ├── sklearn_joblib_example.py
│   │   │   └── sklearn_matplotlib_example.py
│   │   ├── tensorflow
│   │   │   ├── legacy
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── tensorboard_pr_curve.py
│   │   │   │   ├── tensorboard_toy.py
│   │   │   │   ├── tensorflow_eager.py
│   │   │   │   └── tensorflow_mnist_with_summaries.py
│   │   │   ├── manual_model_upload.py
│   │   │   ├── requirements.txt
│   │   │   ├── tensorboard_pr_curve.py
│   │   │   ├── tensorboard_toy.py
│   │   │   └── tensorflow_mnist.py
│   │   ├── wandb
│   │   │   ├── latest-run
│   │   │   ├── pytorch_mnist_clearml.py
│   │   │   ├── pytorch_mnist_wandb.py
│   │   │   └── requirements.txt
│   │   └── xgboost
│   │       ├── requirements.txt
│   │       └── xgboost_sample.py
│   ├── clip-image-classification
│   │   └── clip-img-cls.ipynb
│   ├── document-clustering
│   │   ├── README.md
│   │   ├── data
│   │   │   ├── genres_list.txt
│   │   │   ├── synopses_list_imdb.txt
│   │   │   ├── synopses_list_wiki.txt
│   │   │   └── title_list.txt
│   │   ├── doc_clustering.ipynb
│   │   └── requirements.txt
│   ├── feature-importance
│   │   ├── README.md
│   │   ├── breast-cancer.csv
│   │   ├── feature-importance.ipynb
│   │   ├── feature-selection.ipynb
│   │   └── requirements.txt
│   ├── few-shot-learning
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── datasets
│   │   │   ├── mini_imagenet
│   │   │   │   └── dataloader_mini_imagenet.py
│   │   │   └── omniglot
│   │   │       └── dataloader_omniglot.py
│   │   ├── loader_omniglot.py
│   │   ├── mini_imagenet
│   │   │   ├── mini_proto_model.py
│   │   │   ├── mini_proto_test.py
│   │   │   ├── mini_proto_train.py
│   │   │   └── mini_protoloader.py
│   │   ├── model_omniglot.py
│   │   ├── notebooks
│   │   │   └── dataloader_notebook
│   │   │       ├── Omniglot.ipynb
│   │   │       ├── dataloader.ipynb
│   │   │       ├── images_background_small2.zip
│   │   │       └── loss_test.ipynb
│   │   ├── requirements.txt
│   │   ├── test_omniglot.py
│   │   ├── train_omniglot.py
│   │   └── util
│   │       ├── loss.py
│   │       └── tensor_op.py
│   ├── fine-tune-pegasus
│   │   ├── README.md
│   │   ├── pegasus_finetuning_xsum.ipynb
│   │   └── requirements.txt
│   ├── graph
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── data
│   │   │   ├── fb-pages-food.edges
│   │   │   ├── fb-pages-food.nodes
│   │   │   └── shakespeare.txt
│   │   ├── deepwalk.ipynb
│   │   ├── fb-page-link-prediction.ipynb
│   │   ├── metadata.tsv
│   │   ├── requirements.txt
│   │   ├── vectors.tsv
│   │   └── word2vec.ipynb
│   ├── greedy-layer-wise-pretraning
│   │   ├── README.md
│   │   ├── layer-wise-pretrain.ipynb
│   │   └── requirements.txt
│   ├── house-price-prediction
│   │   ├── README.md
│   │   ├── house_price_prediction.ipynb
│   │   ├── input
│   │   │   ├── anscombe.csv
│   │   │   └── housing.csv
│   │   └── requirements.txt
│   ├── imbalanced-multi-classification
│   │   ├── README.md
│   │   ├── glass.csv
│   │   ├── imbalanced-classification.ipynb
│   │   └── requirements.txt
│   ├── openml-csv-arff
│   │   ├── README.md
│   │   └── news-aggregator.ipynb
│   ├── process-mining
│   │   ├── README.md
│   │   ├── log-eda.ipynb
│   │   ├── log2csv.ipynb
│   │   ├── pm-log.csv
│   │   ├── requirements.txt
│   │   ├── sample.csv
│   │   └── sample.txt
│   ├── tf-serving
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── client.py
│   │   ├── client_curl.sh
│   │   ├── data
│   │   │   └── ImageNetLabels.txt
│   │   ├── images
│   │   │   ├── animal.jpg
│   │   │   ├── clear.jpg
│   │   │   └── ponds.png
│   │   ├── make_servables.py
│   │   └── requirements.txt
│   ├── tfidf-bm25
│   │   ├── README.md
│   │   ├── data
│   │   │   ├── cranfield_docs.json
│   │   │   ├── cranfield_queries.json
│   │   │   └── cranfield_relevance.json
│   │   ├── requirements.txt
│   │   └── tfidf-bm25.ipynb
│   ├── topic-modeling
│   │   ├── .gitignore
│   │   ├── LDA_news_headlines.ipynb
│   │   ├── README.md
│   │   ├── abcnews-small.csv
│   │   ├── lda_from_scratch.ipynb
│   │   └── requirements.txt
│   └── tweet-sentiment-analysis
│       ├── README.md
│       ├── requirements.txt
│       ├── trump-tweets.csv
│       ├── tutorial.md
│       └── tweet_sentiment_analysis.ipynb
└── other
    ├── chinese-to-pinyin
    │   ├── .gitignore
    │   ├── README.md
    │   ├── ch-to-pinyin.py
    │   ├── data
    │   │   ├── .DS_Store11
    │   │   ├── .DS_Store111
    │   │   ├── .DS_Store111111
    │   │   ├── .DS_Store1111111
    │   │   └── 白
    │   │       └── 0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg
    │   └── uni2pinyin
    ├── color-palette
    │   ├── color-palette.ipynb
    │   ├── test-palette.png
    │   └── test.png
    ├── csv-to-bert-text
    │   ├── README.md
    │   ├── csv-to-txt.ipynb
    │   ├── csv-to-txt.py
    │   ├── data
    │   │   ├── neg
    │   │   │   ├── 43.txt
    │   │   │   └── 44.txt
    │   │   ├── neu
    │   │   │   └── 29.txt
    │   │   └── pos
    │   │       ├── 1.txt
    │   │       ├── 2.txt
    │   │       ├── 20.txt
    │   │       ├── 3.txt
    │   │       ├── 4.txt
    │   │       ├── 40.txt
    │   │       ├── 41.txt
    │   │       ├── 42.txt
    │   │       ├── 46.txt
    │   │       ├── 47.txt
    │   │       ├── 5.txt
    │   │       ├── 50.txt
    │   │       └── 8.txt
    │   ├── requirements.txt
    │   └── sample.csv
    ├── list-like-to-list
    │   ├── .idea
    │   │   ├── misc.xml
    │   │   ├── modules.xml
    │   │   ├── movie-genre.iml
    │   │   └── workspace.xml
    │   ├── README.md
    │   ├── input.csv
    │   ├── movie.py
    │   ├── movie_genre.csv
    │   └── requirements.txt
    ├── list-of-dicts-to-columns
    │   ├── README.md
    │   ├── example.csv
    │   ├── list-to-columns.ipynb
    │   └── requirements.txt
    └── screenshot-gif-generation
        ├── .gitignore
        ├── README.md
        ├── gif-generation.ipynb
        ├── git-gen.py
        ├── requirements.txt
        └── screenshots
            └── screenshot-folder.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.DS_Store

*.npy
*.pkl

# vscode
.vscode

tmp/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Harry Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This repo contains a set of AI and Data Science tutorials in Python curated and revised by me. I modified most of the tutorials to add more instructions and to make sure they work well in configured virtual environments. Many thanks to the tutorial authors and other contributors. See the README in each tutorial folder for details.

I organize the tutorials into four folders:

- `ai` for AI tutorials
- `ds` for Data Science tutorials
- `ml` for machine learning/deep learning tutorials
- `other` for code on things like data processing, one-off tricks, etc.

## Setup

Each tutorial may have different version requirements for certain packages, so each tutorial uses a separate virtual environment.

For some tutorials, you may need to set API keys. You need to add a `.env` file and include the API keys as follows (see my blog post on [Manage Environment Variables in Python Projects](https://harrywang.me/env)):

```
OPENAI_API_KEY=sk-proj-xxxx
LANGCHAIN_API_KEY=ls__69650xxxx
REPLICATE_API_TOKEN=r8_W0V3rJxxx
```

To run each tutorial, you need to do the following at the root of this project - I use the `ml/document-clustering` tutorial as an example:

```
cd ml/document-clustering
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```

Then, you can use VSCode `code .` to open the notebooks.
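
The tutorials read these keys with `python-dotenv` rather than hardcoding them - a minimal sketch of the pattern used throughout the notebooks and scripts in this repo:

```
import os
from dotenv import load_dotenv

load_dotenv()  # load key=value pairs from .env into the process environment
openai_api_key = os.environ.get("OPENAI_API_KEY")
```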

--------------------------------------------------------------------------------
/ai/autogen/README.md:
--------------------------------------------------------------------------------
Start autogenstudio:

```
autogenstudio ui
```
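
If the default port is already taken, recent versions of the AutoGen Studio CLI accept a `--port` flag (check `autogenstudio ui --help` to confirm for your version):

```
autogenstudio ui --port 8081
```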

--------------------------------------------------------------------------------
/ai/autogen/docker-example.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[31m\n",
      ">>>>>>>> EXECUTING CODE BLOCK (inferred language is shell)...\u001b[0m\n",
      "exitcode: 0 (execution succeeded)\n",
      "Code output: \n",
      "\u001b[31m\n",
      ">>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...\u001b[0m\n",
      "exitcode: 0 (execution succeeded)\n",
      "Code output: Line plot saved to line.png\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from autogen import ConversableAgent\n",
    "from autogen.coding import DockerCommandLineCodeExecutor\n",
    "\n",
    "load_dotenv()  # take environment variables from .env.\n",
    "\n",
    "llm_config = {\"config_list\": [{\n",
    "    \"model\": \"gpt-4-turbo\",\n",
    "    \"cache\": None,\n",
    "    \"temperature\": 0.9,\n",
    "    \"api_key\": os.environ.get(\"OPENAI_API_KEY\")}]}\n",
    "\n",
    "\n",
    "# Create a temporary directory to store the code files.\n",
    "temp_dir = './tmp'\n",
    "\n",
    "docker_container_name = 'autogen'\n",
    "\n",
    "docker_executor = DockerCommandLineCodeExecutor(\n",
    "    image=\"python:3.12-slim\",  # Execute code using the given docker image name.\n",
    "    container_name=docker_container_name,  # Name of the Docker container.\n",
    "    timeout=180,  # Timeout for each code execution in seconds - 3 minutes\n",
    "    work_dir=temp_dir,  # Use the temporary directory to store the code files.\n",
    ")\n",
    "\n",
    "# Create an agent with code executor configuration that uses docker.\n",
    "code_executor_agent_using_docker = ConversableAgent(\n",
    "    \"code_executor_agent_docker\",\n",
    "    llm_config=False,  # Turn off LLM for this agent.\n",
    "    code_execution_config={\"executor\": docker_executor},  # Use the docker command line code executor.\n",
    "    human_input_mode=\"NEVER\",  # Change to ALWAYS to take human input for this agent for safety.\n",
    ")\n",
    "\n",
    "message_with_code_block = \"\"\"This is a message with code block.\n",
    "The code block is below:\n",
    "```shell\n",
    "pip install matplotlib numpy\n",
    "```\n",
    "This is the end of the message.\n",
    "\"\"\"\n",
    "\n",
    "reply = code_executor_agent_using_docker.generate_reply(messages=[{\"role\": \"user\", \"content\": message_with_code_block}])\n",
    "print(reply)\n",
    "\n",
    "\n",
    "message_with_code_block = \"\"\"This is a message with code block.\n",
    "The code block is below:\n",
    "```python\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "x = range(100)\n",
    "y = np.random.randint(0, 100, 100)\n",
    "plt.plot(x, y)\n",
    "plt.savefig('line.png')\n",
    "print('Line plot saved to line.png')\n",
    "```\n",
    "This is the end of the message.\n",
    "\"\"\"\n",
    "\n",
    "reply = code_executor_agent_using_docker.generate_reply(messages=[{\"role\": \"user\", \"content\": message_with_code_block}])\n",
    "print(reply)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

--------------------------------------------------------------------------------
/ai/autogen/requirements.txt:
--------------------------------------------------------------------------------
pyautogen
python-dotenv
matplotlib
numpy
yfinance
autogenstudio

--------------------------------------------------------------------------------
/ai/langchain/langchain-rag-basics/requirements.txt:
--------------------------------------------------------------------------------
langchain
openai
python-dotenv
pypdf
yt_dlp
pydub
bs4
tiktoken
langchain_openai
chromadb
PyPDF2
lark
scikit-learn
panel
docarray

--------------------------------------------------------------------------------
/ai/litellm/README.md:
--------------------------------------------------------------------------------
## Setup

1. Create and activate a virtual environment:
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
```

2. Install the required packages:
```bash
pip install -r requirements.txt
```

3. Create a `.env` file with your API key:
```bash
echo 'API_KEY="sk-Vi-wwJMyM8vJX"' > .env
```

4. Run the Python script:
```bash
python deepseek_example.py
```
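
The script talks to a LiteLLM proxy, so the same call works for any model the proxy exposes. A minimal sanity check, reusing the model name, proxy URL, and `API_KEY` variable that `deepseek_example.py` hardcodes:

```python
import os
from dotenv import load_dotenv
from litellm import completion

load_dotenv()
response = completion(
    model="litellm_proxy/ollama/deepseek-r1:7b",
    messages=[{"role": "user", "content": "Say hello."}],
    api_key=os.environ["API_KEY"],
    api_base="https://litellmud.takin.ai",
)
print(response.choices[0].message.content)
```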

--------------------------------------------------------------------------------
/ai/litellm/deepseek_example.py:
--------------------------------------------------------------------------------
import os
import sys
from dotenv import load_dotenv
from litellm import completion

# Load environment variables from .env file
load_dotenv()

# Configure the API endpoint and key
api_key = os.environ.get("API_KEY")
api_base = "https://litellmud.takin.ai"  # Hardcoded API base URL
model = "litellm_proxy/ollama/deepseek-r1:7b"  # Hardcoded model name

# Verify API key is set
if not api_key:
    print("Error: API_KEY must be set in .env file")
    sys.exit(1)


def generate_response(prompt, stream=False):
    try:
        # Set up the request parameters
        params = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "api_key": api_key,
            "api_base": api_base,
            "stream": stream,
            # Optional parameters
            "temperature": 0.6,
            "max_tokens": 500,
        }

        # Send the request
        if stream:
            # Return the streaming response generator
            return completion(**params)
        else:
            # Get the complete response
            response = completion(**params)
            return response.choices[0].message.content

    except Exception as e:
        print(f"Error generating response: {e}")
        return f"An error occurred: {str(e)}"


def stream_response(prompt):
    """
    Stream a response from the model and print it to the console.

    Args:
        prompt (str): The user prompt to send to the model
    """
    try:
        print(f"\nPrompt: {prompt}")
        print("\nResponse: ", end="")

        # Get the streaming response
        response_stream = generate_response(prompt, stream=True)

        # Process and print each chunk
        for chunk in response_stream:
            content = chunk.choices[0].delta.content
            if content:
                sys.stdout.write(content)
                sys.stdout.flush()

        print("\n")
    except Exception as e:
        print(f"\nError streaming response: {e}")


def main():
    prompt1 = "write a short poem about artificial intelligence"
    print(f"Prompt: {prompt1}")
    response1 = generate_response(prompt1)
    print(f"Response: {response1}")


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/ai/litellm/requirements.txt:
--------------------------------------------------------------------------------
litellm>=1.62.1
python-dotenv>=1.0.0

--------------------------------------------------------------------------------
/ds/ab-testing/README.md:
--------------------------------------------------------------------------------
## About

This is my revised code of the tutorial at https://towardsdatascience.com/the-math-behind-a-b-testing-with-example-code-part-1-of-2-7be752e1d06f.

## Setup

Do the following at the root of this project:

```
cd ds/ab-testing
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
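
After setup, you can sanity-check the helper modules in `utils/` from a Python session started in this folder - a minimal sketch using the signatures defined in `utils/data.py` and `utils/stats.py`:

```
from utils.data import generate_data
from utils.stats import min_sample_size

# minimum sample size per group to detect a 2-point lift over a 10% baseline
print(min_sample_size(bcr=0.10, mde=0.02))

# simulate fake CTR data and compare conversion rates by group
df = generate_data(N_A=1000, N_B=1000, p_A=0.10, p_B=0.12)
print(df.groupby('group')['converted'].mean())
```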

--------------------------------------------------------------------------------
/ds/ab-testing/requirements.txt:
--------------------------------------------------------------------------------
jupyter
matplotlib==3.1.2
pandas
scipy

--------------------------------------------------------------------------------
/ds/ab-testing/utils/data.py:
--------------------------------------------------------------------------------
import scipy.stats as scs
import pandas as pd
# import numpy as np


def generate_data(N_A, N_B, p_A, p_B, days=None, control_label='A',
                  test_label='B'):
    """Returns a pandas dataframe with fake CTR data

    Example:

    Parameters:
        N_A (int): sample size for control group
        N_B (int): sample size for test group
            Note: final sample size may not match N_A provided because the
            group at each row is chosen at random (50/50).
        p_A (float): conversion rate; conversion rate of control group
        p_B (float): conversion rate; conversion rate of test group
        days (int): optional; if provided, a column for 'ts' will be included
            to divide the data in chunks of time
            Note: overflow data will be included in an extra day
        control_label (str)
        test_label (str)

    Returns:
        df (df)
    """

    # initiate empty container
    data = []

    # total amount of rows in the data
    N = N_A + N_B

    # distribute events based on proportion of group size
    group_bern = scs.bernoulli(N_A / (N_A + N_B))

    # initiate bernoulli distributions from which to randomly sample
    A_bern = scs.bernoulli(p_A)
    B_bern = scs.bernoulli(p_B)

    for idx in range(N):
        # initiate empty row
        row = {}
        # for 'ts' column
        if days is not None:
            if type(days) == int:
                row['ts'] = idx // (N // days)
            else:
                raise ValueError("Provide an integer for the days parameter.")
        # assign group based on 50/50 probability
        row['group'] = group_bern.rvs()

        if row['group'] == 0:
            # assign conversion based on provided parameters
            row['converted'] = A_bern.rvs()
        else:
            row['converted'] = B_bern.rvs()
        # collect row into data container
        data.append(row)

    # convert data into pandas dataframe
    df = pd.DataFrame(data)

    # transform group labels of 0s and 1s to user-defined group labels
    df['group'] = df['group'].apply(
        lambda x: control_label if x == 0 else test_label)

    return df
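
# Usage sketch: build a small fake CTR dataset when the module is run directly
# (e.g. `python utils/data.py` from the ds/ab-testing folder).
if __name__ == '__main__':
    df = generate_data(N_A=500, N_B=500, p_A=0.10, p_B=0.12, days=7)
    print(df.head())
    print(df.groupby('group')['converted'].mean())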

--------------------------------------------------------------------------------
/ds/ab-testing/utils/stats.py:
--------------------------------------------------------------------------------
import numpy as np
import scipy.stats as scs


def pooled_prob(N_A, N_B, X_A, X_B):
    """Returns pooled probability for two samples"""
    return (X_A + X_B) / (N_A + N_B)


def pooled_SE(N_A, N_B, X_A, X_B):
    """Returns the pooled standard error for two samples"""
    p_hat = pooled_prob(N_A, N_B, X_A, X_B)
    SE = np.sqrt(p_hat * (1 - p_hat) * (1 / N_A + 1 / N_B))
    return SE


def confidence_interval(sample_mean=0, sample_std=1, sample_size=1,
                        sig_level=0.05):
    """Returns the confidence interval as a tuple"""
    z = z_val(sig_level)

    left = sample_mean - z * sample_std / np.sqrt(sample_size)
    right = sample_mean + z * sample_std / np.sqrt(sample_size)

    return (left, right)


def z_val(sig_level=0.05, two_tailed=True):
    """Returns the z value for a given significance level"""
    z_dist = scs.norm()
    if two_tailed:
        sig_level = sig_level / 2
        area = 1 - sig_level
    else:
        area = 1 - sig_level

    z = z_dist.ppf(area)

    return z


def ab_dist(stderr, d_hat=0, group_type='control'):
    """Returns a distribution object depending on group type

    Examples:

    Parameters:
        stderr (float): pooled standard error of two independent samples
        d_hat (float): the mean difference between two independent samples
        group_type (string): 'control' and 'test' are supported

    Returns:
        dist (scipy.stats distribution object)
    """
    if group_type == 'control':
        sample_mean = 0

    elif group_type == 'test':
        sample_mean = d_hat

    # create a normal distribution which is dependent on mean and std dev
    dist = scs.norm(sample_mean, stderr)
    return dist


def min_sample_size(bcr, mde, power=0.8, sig_level=0.05):
    """Returns the minimum sample size to set up a split test

    Arguments:
        bcr (float): probability of success for control, sometimes
            referred to as baseline conversion rate

        mde (float): minimum change in measurement between control
            group and test group if alternative hypothesis is true, sometimes
            referred to as minimum detectable effect

        power (float): probability of rejecting the null hypothesis when the
            null hypothesis is false, typically 0.8

        sig_level (float): significance level often denoted as alpha,
            typically 0.05

    Returns:
        min_N: minimum sample size (float)

    References:
        Stanford lecture on sample sizes
        http://statweb.stanford.edu/~susan/courses/s141/hopower.pdf
    """
    # standard normal distribution to determine z-values
    standard_norm = scs.norm(0, 1)

    # find Z_beta from desired power
    Z_beta = standard_norm.ppf(power)

    # find Z_alpha
    Z_alpha = standard_norm.ppf(1 - sig_level / 2)

    # average of probabilities from both groups
    pooled_prob = (bcr + bcr + mde) / 2

    min_N = (2 * pooled_prob * (1 - pooled_prob) * (Z_beta + Z_alpha)**2
             / mde**2)

    return min_N


def p_val(N_A, N_B, p_A, p_B):
    """Returns the p-value for an A/B test"""
    return scs.binom(N_A, p_A).pmf(p_B * N_B)

--------------------------------------------------------------------------------
/ds/airflow/README.md:
--------------------------------------------------------------------------------
## About

This is my revised code of the tutorial at: https://medium.com/abn-amro-developer/data-pipeline-orchestration-on-steroids-apache-airflow-tutorial-part-1-87361905db6d

## Setup

I assume you have Airflow (2.1.0) running locally on your Mac by running the commands at https://airflow.apache.org/docs/apache-airflow/stable/start/local.html

```
export AIRFLOW_HOME=~/airflow
pip install apache-airflow
airflow db init
airflow users create \
    --username admin \
    --firstname Harry \
    --lastname Wang \
    --role Admin \
    --email harryjwang@gmail.com

# start the web server, default port is 8080
airflow webserver --port 8080

# start the scheduler
# open a new terminal or else run webserver with ``-D`` option to run it as a daemon
airflow scheduler
```

Now, you can access airflow at http://localhost:8080

Open `/Users/harrywang/airflow/airflow.cfg` to find the path that holds your DAG Python files: `dags_folder = /Users/harrywang/airflow/dags` - you may need to create this folder.

`simple_bash_dag.py` is a simple DAG that creates an empty txt file and then renames it - two tasks in a sequence.

Copy the DAG files to the dags folder, then run `airflow scheduler` again (Airflow does not auto-refresh - you have to run this command manually to see the newly added DAG file). You should then be able to see the DAG via the UI.

You can trigger the DAG as follows:

Screen Shot 2021-06-01 at 11 37 32 AM

--------------------------------------------------------------------------------
/ds/airflow/requirements.txt:
--------------------------------------------------------------------------------
jupyter

--------------------------------------------------------------------------------
/ds/airflow/simple_bash_dag.py:
--------------------------------------------------------------------------------
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator


default_args = {
    'owner': 'Harry Wang',
    'depends_on_past': False,
    'start_date': datetime(2021, 6, 1),
    'email': ['harryjwang@gmail.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    # In case of errors, do one retry
    'retries': 1,
    # Do the retry with 30 seconds delay after the error
    'retry_delay': timedelta(seconds=30),
    # Run once every 15 minutes (note: Airflow ignores schedule_interval
    # inside default_args - the schedule is set on the DAG itself below)
    'schedule_interval': '*/15 * * * *'
}

with DAG(
    dag_id='simple_bash_dag',
    default_args=default_args,
    schedule_interval=None,
    tags=['my_dags'],
) as dag:
    t1 = BashOperator(
        bash_command="touch ~/my_bash_file.txt",
        task_id="create_file"
    )
    t2 = BashOperator(
        bash_command="mv ~/my_bash_file.txt ~/my_bash_file_changed.txt",
        task_id="change_file_name"
    )
    t1 >> t2  # t2 depends on t1

--------------------------------------------------------------------------------
/ds/aws-pyspark/README.md:
--------------------------------------------------------------------------------
Code for the tutorial at https://towardsdatascience.com/getting-started-with-pyspark-on-amazon-emr-c85154b6b921

--------------------------------------------------------------------------------
/ds/aws-pyspark/emr_bootstrap.sh:
--------------------------------------------------------------------------------
#!/bin/bash
sudo pip install -U matplotlib pandas

--------------------------------------------------------------------------------
/ds/cohort-analysis/README.md:
--------------------------------------------------------------------------------
## About

Revised version of https://www.kaggle.com/mahmoudelfahl/cohort-analysis-customer-segmentation-with-rfm

## Setup

Go to the tutorial folder and do the following:

```
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.

--------------------------------------------------------------------------------
/ds/cohort-analysis/requirements.txt:
--------------------------------------------------------------------------------
jupyter
matplotlib
numpy
seaborn

--------------------------------------------------------------------------------
/ds/dask/.gitignore:
--------------------------------------------------------------------------------
data/

--------------------------------------------------------------------------------
/ds/dask/README.md:
--------------------------------------------------------------------------------
## Dask tutorial

This is my code of the tutorial at https://docs.dask.org/en/stable/10-minutes-to-dask.html

## Setup

```
brew install graphviz
```

within the tutorial folder:

```
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks.
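
To verify the environment works before starting the notebooks, a minimal sketch from the linked "10 minutes to Dask" material - build a lazy chunked array, then call `.compute()`:

```
import dask.array as da

x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))
print(x.mean().compute())  # the mean is only computed here
```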

--------------------------------------------------------------------------------
/ds/dask/requirements.txt:
--------------------------------------------------------------------------------
jupyterlab
dask[distributed]
pandas
bokeh>=2.1.1
graphviz
s3fs

--------------------------------------------------------------------------------
/ds/data-driven-growth/.gitignore:
--------------------------------------------------------------------------------
/data/*.csv

--------------------------------------------------------------------------------
/ds/data-driven-growth/README.md:
--------------------------------------------------------------------------------
## Data-driven growth with Python

This is my revised code of the tutorial at https://towardsdatascience.com/data-driven-growth-with-python-part-1-know-your-metrics-812781e66a5b.

## Setup

Go to this tutorial folder (`ds/data-driven-growth`) and do the following:

```
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.

--------------------------------------------------------------------------------
/ds/data-driven-growth/requirements.txt:
--------------------------------------------------------------------------------
jupyter
matplotlib
pandas
scipy
seaborn

--------------------------------------------------------------------------------
/ds/data-driven-growth/utils/data.py:
--------------------------------------------------------------------------------
import scipy.stats as scs
import pandas as pd
# import numpy as np


def generate_data(N_A, N_B, p_A, p_B, days=None, control_label='A',
                  test_label='B'):
    """Returns a pandas dataframe with fake CTR data

    Example:

    Parameters:
        N_A (int): sample size for control group
        N_B (int): sample size for test group
            Note: final sample size may not match N_A provided because the
            group at each row is chosen at random (50/50).
        p_A (float): conversion rate; conversion rate of control group
        p_B (float): conversion rate; conversion rate of test group
        days (int): optional; if provided, a column for 'ts' will be included
            to divide the data in chunks of time
            Note: overflow data will be included in an extra day
        control_label (str)
        test_label (str)

    Returns:
        df (df)
    """

    # initiate empty container
    data = []

    # total amount of rows in the data
    N = N_A + N_B

    # distribute events based on proportion of group size
    group_bern = scs.bernoulli(N_A / (N_A + N_B))

    # initiate bernoulli distributions from which to randomly sample
    A_bern = scs.bernoulli(p_A)
    B_bern = scs.bernoulli(p_B)

    for idx in range(N):
        # initiate empty row
        row = {}
        # for 'ts' column
        if days is not None:
            if type(days) == int:
                row['ts'] = idx // (N // days)
            else:
                raise ValueError("Provide an integer for the days parameter.")
        # assign group based on 50/50 probability
        row['group'] = group_bern.rvs()

        if row['group'] == 0:
            # assign conversion based on provided parameters
            row['converted'] = A_bern.rvs()
        else:
            row['converted'] = B_bern.rvs()
        # collect row into data container
        data.append(row)

    # convert data into pandas dataframe
    df = pd.DataFrame(data)

    # transform group labels of 0s and 1s to user-defined group labels
    df['group'] = df['group'].apply(
        lambda x: control_label if x == 0 else test_label)

    return df

--------------------------------------------------------------------------------
/ds/data-driven-growth/utils/stats.py:
--------------------------------------------------------------------------------
import numpy as np
import scipy.stats as scs


def pooled_prob(N_A, N_B, X_A, X_B):
    """Returns pooled probability for two samples"""
    return (X_A + X_B) / (N_A + N_B)


def pooled_SE(N_A, N_B, X_A, X_B):
    """Returns the pooled standard error for two samples"""
    p_hat = pooled_prob(N_A, N_B, X_A, X_B)
    SE = np.sqrt(p_hat * (1 - p_hat) * (1 / N_A + 1 / N_B))
    return SE


def confidence_interval(sample_mean=0, sample_std=1, sample_size=1,
                        sig_level=0.05):
    """Returns the confidence interval as a tuple"""
    z = z_val(sig_level)

    left = sample_mean - z * sample_std / np.sqrt(sample_size)
    right = sample_mean + z * sample_std / np.sqrt(sample_size)

    return (left, right)


def z_val(sig_level=0.05, two_tailed=True):
    """Returns the z value for a given significance level"""
    z_dist = scs.norm()
    if two_tailed:
        sig_level = sig_level / 2
        area = 1 - sig_level
    else:
        area = 1 - sig_level

    z = z_dist.ppf(area)

    return z


def ab_dist(stderr, d_hat=0, group_type='control'):
    """Returns a distribution object depending on group type

    Examples:

    Parameters:
        stderr (float): pooled standard error of two independent samples
        d_hat (float): the mean difference between two independent samples
        group_type (string): 'control' and 'test' are supported

    Returns:
        dist (scipy.stats distribution object)
    """
    if group_type == 'control':
        sample_mean = 0

    elif group_type == 'test':
        sample_mean = d_hat

    # create a normal distribution which is dependent on mean and std dev
    dist = scs.norm(sample_mean, stderr)
    return dist


def min_sample_size(bcr, mde, power=0.8, sig_level=0.05):
    """Returns the minimum sample size to set up a split test

    Arguments:
        bcr (float): probability of success for control, sometimes
            referred to as baseline conversion rate

        mde (float): minimum change in measurement between control
            group and test group if alternative hypothesis is true, sometimes
            referred to as minimum detectable effect

        power (float): probability of rejecting the null hypothesis when the
            null hypothesis is false, typically 0.8

        sig_level (float): significance level often denoted as alpha,
            typically 0.05

    Returns:
        min_N: minimum sample size (float)

    References:
        Stanford lecture on sample sizes
        http://statweb.stanford.edu/~susan/courses/s141/hopower.pdf
    """
    # standard normal distribution to determine z-values
    standard_norm = scs.norm(0, 1)

    # find Z_beta from desired power
    Z_beta = standard_norm.ppf(power)

    # find Z_alpha
    Z_alpha = standard_norm.ppf(1 - sig_level / 2)

    # average of probabilities from both groups
    pooled_prob = (bcr + bcr + mde) / 2

    min_N = (2 * pooled_prob * (1 - pooled_prob) * (Z_beta + Z_alpha)**2
             / mde**2)

    return min_N


def p_val(N_A, N_B, p_A, p_B):
    """Returns the p-value for an A/B test"""
    return scs.binom(N_A, p_A).pmf(p_B * N_B)
/ds/diff-in-diff/README.md: -------------------------------------------------------------------------------- 1 | ## Python diff-in-diff tutorial 2 | 3 | This is my code for the tutorial at https://medium.com/@sadhaverajasekar/diff-in-diff-testing-python-f24835330bc8 4 | 5 | ## Setup 6 | 7 | Within the tutorial folder: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks. 15 | 16 | -------------------------------------------------------------------------------- /ds/diff-in-diff/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | scikit-learn 3 | statsmodels 4 | -------------------------------------------------------------------------------- /ds/dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /data -------------------------------------------------------------------------------- /ds/dvc/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This tutorial needs a separate repo, so visit https://github.com/harrywang/dvc for the code for the tutorial at 4 | https://realpython.com/python-data-version-control/ -------------------------------------------------------------------------------- /ds/hypo-testing/README.md: -------------------------------------------------------------------------------- 1 | ## Python hypothesis testing tutorial 2 | 3 | This is my code for the tutorial at https://towardsdatascience.com/hypothesis-testing-in-machine-learning-using-python-a0dc89e169ce 4 | 5 | ## Setup 6 | 7 | Within the tutorial folder: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyterlab` or use VSCode `code .` to open the notebooks.
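As a concrete example, the `blood-pressure.csv` file below records each patient's blood pressure before and after treatment, which is exactly the shape a paired t-test expects. A minimal sketch (SciPy comes in as a statsmodels dependency; the CSV is assumed to sit next to the notebook):

```python
# Minimal paired t-test sketch on blood-pressure.csv (same folder assumed).
import pandas as pd
from scipy import stats

df = pd.read_csv('blood-pressure.csv')
t_stat, p_value = stats.ttest_rel(df['bp_before'], df['bp_after'])
print(f"t = {t_stat:.3f}, p = {p_value:.4f}")  # p < 0.05 suggests a real change
```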
15 | 16 | -------------------------------------------------------------------------------- /ds/hypo-testing/blood-pressure.csv: -------------------------------------------------------------------------------- 1 | patient,sex,agegrp,bp_before,bp_after 2 | 1,Male,30-45,143,153 3 | 2,Male,30-45,163,170 4 | 3,Male,30-45,153,168 5 | 4,Male,30-45,153,142 6 | 5,Male,30-45,146,141 7 | 6,Male,30-45,150,147 8 | 7,Male,30-45,148,133 9 | 8,Male,30-45,153,141 10 | 9,Male,30-45,153,131 11 | 10,Male,30-45,158,125 12 | 11,Male,30-45,149,164 13 | 12,Male,30-45,173,159 14 | 13,Male,30-45,165,135 15 | 14,Male,30-45,145,159 16 | 15,Male,30-45,143,153 17 | 16,Male,30-45,152,126 18 | 17,Male,30-45,141,162 19 | 18,Male,30-45,176,134 20 | 19,Male,30-45,143,136 21 | 20,Male,30-45,162,150 22 | 21,Male,46-59,149,168 23 | 22,Male,46-59,156,155 24 | 23,Male,46-59,151,136 25 | 24,Male,46-59,159,132 26 | 25,Male,46-59,164,160 27 | 26,Male,46-59,154,160 28 | 27,Male,46-59,152,136 29 | 28,Male,46-59,142,183 30 | 29,Male,46-59,162,152 31 | 30,Male,46-59,155,162 32 | 31,Male,46-59,175,151 33 | 32,Male,46-59,184,139 34 | 33,Male,46-59,167,175 35 | 34,Male,46-59,148,184 36 | 35,Male,46-59,170,151 37 | 36,Male,46-59,159,171 38 | 37,Male,46-59,149,157 39 | 38,Male,46-59,140,159 40 | 39,Male,46-59,185,140 41 | 40,Male,46-59,160,174 42 | 41,Male,60+,157,167 43 | 42,Male,60+,158,158 44 | 43,Male,60+,162,168 45 | 44,Male,60+,160,159 46 | 45,Male,60+,180,153 47 | 46,Male,60+,155,164 48 | 47,Male,60+,172,169 49 | 48,Male,60+,157,148 50 | 49,Male,60+,171,185 51 | 50,Male,60+,170,163 52 | 51,Male,60+,175,146 53 | 52,Male,60+,175,160 54 | 53,Male,60+,172,175 55 | 54,Male,60+,173,163 56 | 55,Male,60+,170,185 57 | 56,Male,60+,164,146 58 | 57,Male,60+,147,176 59 | 58,Male,60+,154,147 60 | 59,Male,60+,172,161 61 | 60,Male,60+,162,164 62 | 61,Female,30-45,152,149 63 | 62,Female,30-45,147,142 64 | 63,Female,30-45,144,146 65 | 64,Female,30-45,144,138 66 | 65,Female,30-45,158,131 67 | 66,Female,30-45,147,145 68 | 67,Female,30-45,154,134 69 | 68,Female,30-45,151,135 70 | 69,Female,30-45,149,131 71 | 70,Female,30-45,138,135 72 | 71,Female,30-45,162,133 73 | 72,Female,30-45,157,135 74 | 73,Female,30-45,141,168 75 | 74,Female,30-45,167,144 76 | 75,Female,30-45,147,147 77 | 76,Female,30-45,143,151 78 | 77,Female,30-45,142,149 79 | 78,Female,30-45,166,147 80 | 79,Female,30-45,147,149 81 | 80,Female,30-45,142,135 82 | 81,Female,46-59,157,127 83 | 82,Female,46-59,170,150 84 | 83,Female,46-59,150,138 85 | 84,Female,46-59,150,147 86 | 85,Female,46-59,167,157 87 | 86,Female,46-59,154,146 88 | 87,Female,46-59,143,148 89 | 88,Female,46-59,157,136 90 | 89,Female,46-59,149,146 91 | 90,Female,46-59,161,132 92 | 91,Female,46-59,142,145 93 | 92,Female,46-59,162,132 94 | 93,Female,46-59,144,157 95 | 94,Female,46-59,142,140 96 | 95,Female,46-59,159,137 97 | 96,Female,46-59,140,154 98 | 97,Female,46-59,144,169 99 | 98,Female,46-59,142,145 100 | 99,Female,46-59,145,137 101 | 100,Female,46-59,145,143 102 | 101,Female,60+,168,178 103 | 102,Female,60+,142,141 104 | 103,Female,60+,147,149 105 | 104,Female,60+,148,148 106 | 105,Female,60+,162,138 107 | 106,Female,60+,170,143 108 | 107,Female,60+,173,167 109 | 108,Female,60+,151,158 110 | 109,Female,60+,155,152 111 | 110,Female,60+,163,154 112 | 111,Female,60+,183,161 113 | 112,Female,60+,159,143 114 | 113,Female,60+,148,159 115 | 114,Female,60+,151,177 116 | 115,Female,60+,165,142 117 | 116,Female,60+,152,152 118 | 117,Female,60+,161,152 119 | 118,Female,60+,165,174 120 | 119,Female,60+,149,151 121 | 
120,Female,60+,185,163 122 | -------------------------------------------------------------------------------- /ds/hypo-testing/chi-test.csv: -------------------------------------------------------------------------------- 1 | Gender,Shopping 2 | Male,No 3 | Female,Yes 4 | Male,Yes 5 | Female,Yes 6 | Female,Yes 7 | Male,Yes 8 | Male,No 9 | Female,No 10 | Female,No 11 | -------------------------------------------------------------------------------- /ds/hypo-testing/crop-yield.csv: -------------------------------------------------------------------------------- 1 | Fert,Water,Yield 2 | A,High,27.4 3 | A,High,33.6 4 | A,High,29.8 5 | A,High,35.2 6 | A,High,33 7 | B,High,34.8 8 | B,High,27 9 | B,High,30.2 10 | B,High,30.8 11 | B,High,26.4 12 | A,Low,32 13 | A,Low,32.2 14 | A,Low,26 15 | A,Low,33.4 16 | A,Low,26.4 17 | B,Low,26.8 18 | B,Low,23.2 19 | B,Low,29.4 20 | B,Low,19.4 21 | B,Low,23.8 -------------------------------------------------------------------------------- /ds/hypo-testing/plant-growth.csv: -------------------------------------------------------------------------------- 1 | "","weight","group" 2 | "1",4.17,"ctrl" 3 | "2",5.58,"ctrl" 4 | "3",5.18,"ctrl" 5 | "4",6.11,"ctrl" 6 | "5",4.5,"ctrl" 7 | "6",4.61,"ctrl" 8 | "7",5.17,"ctrl" 9 | "8",4.53,"ctrl" 10 | "9",5.33,"ctrl" 11 | "10",5.14,"ctrl" 12 | "11",4.81,"trt1" 13 | "12",4.17,"trt1" 14 | "13",4.41,"trt1" 15 | "14",3.59,"trt1" 16 | "15",5.87,"trt1" 17 | "16",3.83,"trt1" 18 | "17",6.03,"trt1" 19 | "18",4.89,"trt1" 20 | "19",4.32,"trt1" 21 | "20",4.69,"trt1" 22 | "21",6.31,"trt2" 23 | "22",5.12,"trt2" 24 | "23",5.54,"trt2" 25 | "24",5.5,"trt2" 26 | "25",5.37,"trt2" 27 | "26",5.29,"trt2" 28 | "27",4.92,"trt2" 29 | "28",6.15,"trt2" 30 | "29",5.8,"trt2" 31 | "30",5.26,"trt2" 32 | -------------------------------------------------------------------------------- /ds/hypo-testing/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | statsmodels 3 | -------------------------------------------------------------------------------- /ds/inside-airbnb/.idea/inside-airbnb.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /ds/inside-airbnb/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /ds/inside-airbnb/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /ds/inside-airbnb/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | Code to download the images by using the image URLs provided in the NYC data from 4 | http://insideairbnb.com/get-the-data.html and save those images into local folders. 5 | 6 | # Setup and Run 7 | 8 | Python 3 9 | 10 | - create virtual environment: `$virtualenv -p python3 venv` 11 | - activate virtual env: `$source venv/bin/activate` 12 | - install required packages: `pip3 install -r requirements.txt` 13 | 14 | To run (NOTE: for the NYC dataset, it took about 4.5 hours to download the ~ 45,000 images): 15 | 16 | 1. copy the real listing csv file to /data/ and comment out the `# listings = pd.read_csv('data/listings.csv')` 17 | 2. 
`python3 get-photos.py` will download and save the images to the /data/images/ folder 18 | 19 | 20 | 21 | Use Katalon Recorder (Selenium IDE for Chrome) to help test the css selector: 22 | 23 | Install at: https://chrome.google.com/webstore/detail/katalon-recorder-selenium/ 24 | 25 | Then enter the css selector as below and search to see whether you can find the element you need: 26 | 27 | screen shot 2018-02-14 at 1 32 32 pm 28 | 29 | Example: open https://www.airbnb.com/rooms/18461891 in Chrome and search css=button[data-veloute='hero-view-photos-button'], and the View Photo button should be highlighted -------------------------------------------------------------------------------- /ds/inside-airbnb/add-columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | listings = pd.read_csv('data/nyc-listings.csv') # testing data 4 | # listings = pd.read_csv('data/listings.csv') 5 | listings.info() 6 | 7 | listings["total_photos"] = 0 8 | 9 | # the original list used string not boolean 10 | listings["photo_downloaded"] = "f" 11 | listings["host_photo_downloaded"] = "f" 12 | 13 | listings.to_csv('data/nyc-listings_new.csv', encoding='utf-8', index=False) 14 | 15 | print("csv file processed") -------------------------------------------------------------------------------- /ds/inside-airbnb/get-one-photo.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from urllib.error import HTTPError 3 | from urllib.request import urlretrieve 4 | 5 | listings = pd.read_csv('data/nyc-listings.csv') # testing data 6 | # listings = pd.read_csv('data/listings.csv') 7 | listings.info() 8 | count = 0 # count of successful downloads 9 | err_count = 0 # count of errors 10 | for index, row in listings.iterrows(): 11 | # print(row["id"], row["xl_picture_url"]) 12 | # prefer the XL picture URL when it exists 13 | if pd.isnull(row["xl_picture_url"]): 14 | url = row["picture_url"] 15 | else: 16 | url = row["xl_picture_url"] 17 | 18 | try: 19 | urlretrieve(url, "./data/images/" + str(row["id"]) + ".jpg") 20 | count += 1 21 | print("downloading " + str(row["id"])) 22 | except FileNotFoundError as err: 23 | err_count += 1 24 | print(err) # something wrong with the local path 25 | except HTTPError as err: 26 | err_count += 1 27 | print(err) # something wrong with the url 28 | except Exception as err: 29 | err_count += 1 30 | print(err) # anything else: log it and skip this row 31 | 32 | 33 | print("downloading complete with " + str(count) + " images and " + str(err_count) + " errors.") -------------------------------------------------------------------------------- /ds/inside-airbnb/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | selenium -------------------------------------------------------------------------------- /ds/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | Grouped bar chart with percentage change using matplotlib. Excel version is at https://www.excelcampus.com/charts/column-chart-percentage-change/ 4 | 5 | ## Setup 6 | 7 | Go to the tutorial folder and do the following: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks.
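The core trick in the notebook is plotting two bar series at offset x positions; a minimal sketch with toy numbers (not the notebook's data):

```python
# Toy grouped bar chart: two series side by side via offset x positions.
import numpy as np
import matplotlib.pyplot as plt

labels = ['Q1', 'Q2', 'Q3']
last_year, this_year = [25, 32, 34], [30, 35, 27]
x = np.arange(len(labels))
width = 0.35  # offsets of +/- width/2 keep each pair centered on its tick

fig, ax = plt.subplots()
ax.bar(x - width / 2, last_year, width, label='Last year')
ax.bar(x + width / 2, this_year, width, label='This year')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
plt.show()
```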
15 | 16 | -------------------------------------------------------------------------------- /ds/matplotlib/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter>=1.0.0 2 | matplotlib 3 | numpy 4 | seaborn -------------------------------------------------------------------------------- /ds/multi-armed-bandit/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | My revised code based on: 4 | 5 | https://towardsdatascience.com/solving-multiarmed-bandits-a-comparison-of-epsilon-greedy-and-thompson-sampling-d97167ca9a50 6 | 7 | https://github.com/conormm/bandit_algorithms/blob/master/bandits_post_code.py 8 | 9 | 10 | # Setup 11 | 12 | Setup virtual environment and install packages: 13 | ``` 14 | python3 -m venv venv 15 | source venv/bin/activate 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | then, open the notebook 20 | -------------------------------------------------------------------------------- /ds/multi-armed-bandit/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | matplotlib 3 | numpy 4 | pandas 5 | scipy 6 | tqdm -------------------------------------------------------------------------------- /ds/pymongo/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | https://realpython.com/introduction-to-mongodb-and-python/ 4 | 5 | ## Setup 6 | 7 | Install MongoDB: 8 | 9 | ``` 10 | brew tap mongodb/brew 11 | brew install mongodb-community@5.0 12 | ``` 13 | 14 | run/stop as a service 15 | 16 | ``` 17 | brew services start mongodb-community@5.0 18 | brew services stop mongodb-community@5.0 19 | ``` 20 | 21 | connect: 22 | 23 | ``` 24 | mongosh 25 | ``` 26 | 27 | create a new db 28 | 29 | ``` 30 | use rptutorials 31 | show dbs 32 | db 33 | ``` 34 | 35 | create a collection (table) using dot notation 36 | 37 | ``` 38 | db.tutorial 39 | ``` 40 | 41 | document (table row) 42 | 43 | When you’re building a MongoDB database application, probably your most important decision is about the structure of documents. In other words, you’ll have to decide which fields and values your documents will have. 
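The shell commands below can also be reproduced from Python with PyMongo (the only package in this folder's `requirements.txt`); a minimal sketch, assuming the local server started above is still running on the default port:

```python
# Minimal PyMongo sketch mirroring the mongosh commands in this README.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.rptutorials                     # same database as `use rptutorials`
tutorial = {
    "title": "Reading and Writing CSV Files in Python",
    "author": "Jon",
    "url": "https://realpython.com/python-csv/",
}
result = db.tutorial.insert_one(tutorial)   # shell: db.tutorial.insertOne(...)
print(result.inserted_id)
for doc in db.tutorial.find({"author": "Jon"}):  # shell: db.tutorial.find(...)
    print(doc)
```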
44 | 45 | insert a document: 46 | 47 | ``` 48 | db.tutorial.insertOne( 49 | { 50 | "title": "Reading and Writing CSV Files in Python", 51 | "author": "Jon", 52 | "contributors": [ 53 | "Aldren", 54 | "Geir Arne", 55 | "Joanna", 56 | "Jason" 57 | ], 58 | "url": "https://realpython.com/python-csv/" 59 | } 60 | ) 61 | 62 | db.tutorial.insertOne( 63 | { 64 | "title": "Python 3's f-Strings: An Improved String Formatting Syntax", 65 | "author": "Joanna", 66 | "contributors": [ 67 | "Adriana", 68 | "David", 69 | "Dan", 70 | "Jim", 71 | "Pavel" 72 | ], 73 | "url": "https://realpython.com/python-f-strings/" 74 | } 75 | ) 76 | ``` 77 | 78 | find 79 | ``` 80 | db.tutorial.find() 81 | db.tutorial.find({author: "Joanna"}) 82 | ``` 83 | -------------------------------------------------------------------------------- /ds/pymongo/requirements.txt: -------------------------------------------------------------------------------- 1 | pymongo 2 | -------------------------------------------------------------------------------- /ds/seaborn/README.md: -------------------------------------------------------------------------------- 1 | ## Seaborn Basics with Python 3 2 | 3 | This is my revision of the tutorials at 4 | 5 | - The Ultimate Python Seaborn Tutorial: https://elitedatascience.com/python-seaborn-tutorial 6 | - Styling plots with Seaborn: http://jose-coto.com/styling-with-seaborn 7 | 8 | ## Setup 9 | 10 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages: 11 | 12 | ``` 13 | $ cd path_to_this folder 14 | $ virtualenv -p python3 venv 15 | $ source venv/bin/activate 16 | $ pip3 install -r requirements.txt 17 | ``` 18 | 19 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 20 | -------------------------------------------------------------------------------- /ds/seaborn/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | matplotlib 3 | pandas 4 | seaborn 5 | scikit-learn 6 | 7 | -------------------------------------------------------------------------------- /ds/spark-basics/datacamp-notes.md: -------------------------------------------------------------------------------- 1 | https://www.datacamp.com/community/tutorials/apache-spark-tutorial-machine-learning 2 | 3 | ## installation on Mac 4 | https://medium.com/beeranddiapers/installing-apache-spark-on-mac-os-ce416007d79f 5 | ``` 6 | brew upgrade && brew update 7 | 8 | brew install --cask java 9 | 10 | java -version 11 | openjdk version "11.0.1" 2018-10-16 12 | OpenJDK Runtime Environment 18.9 (build 11.0.1+13) 13 | OpenJDK 64-Bit Server VM 18.9 (build 11.0.1+13, mixed mode) 14 | 15 | xcode-select --install 16 | 17 | brew install scala 18 | 19 | scala -version 20 | Scala code runner version 2.13.5 -- Copyright 2002-2020, LAMP/EPFL and Lightbend, Inc. 
21 | 22 | brew install apache-spark 23 | 24 | spark-shell 25 | 26 | pyspark 27 | ``` 28 | 29 | 30 | ``` 31 | pip install pyspark 32 | pip install findspark 33 | ``` -------------------------------------------------------------------------------- /ds/spark-basics/datacamp-spark.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/spark-basics/datacamp-spark.ipynb -------------------------------------------------------------------------------- /ds/statsmodels-tutorial/README.md: -------------------------------------------------------------------------------- 1 | ## Python statsmodels tutorial 2 | 3 | This is my code for the tutorial at https://www.statsmodels.org/stable/gettingstarted.html 4 | 5 | ## Setup 6 | 7 | Within the tutorial folder: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 15 | 16 | -------------------------------------------------------------------------------- /ds/statsmodels-tutorial/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | linearmodels 3 | matplotlib 4 | statsmodels 5 | -------------------------------------------------------------------------------- /ds/streamlit/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This is my revised code for the tutorial at https://towardsdatascience.com/streamlit-101-an-in-depth-introduction-fc8aad9492f2 4 | 5 | Changes: 6 | 7 | - changed the data file to make the repo self-contained 8 | - added requirements.txt and virtual environment setup 9 | 10 | ## Local Setup 11 | 12 | Python 3 is required; see my tutorial on setting up Python 3: https://bit.ly/2uX6wAX 13 | 14 | Clone the repo, go to the repo folder, set up the virtual environment, and install the required packages: 15 | 16 | 17 | ```shell 18 | $ python3 -m venv venv 19 | $ source venv/bin/activate 20 | $ pip install -r requirements.txt 21 | ``` 22 | 23 | Run the app locally (Local URL: http://localhost:8501) using the terminal: `streamlit run airbnb.py` 24 | 25 | Stop the app by using ctrl + C or closing the terminal. 26 | 27 | Deploy the app to the cloud for public access via services such as Streamlit sharing, Heroku, or AWS by following my tutorial at https://github.com/harrywang/streamlit-basics.
You can see an example at: https://st-demo-harrywang.herokuapp.com/ 28 | -------------------------------------------------------------------------------- /ds/streamlit/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=3.4.2 2 | pandas>=1.2.4 3 | plotly>=5.0.0 4 | streamlit>=0.82.0 -------------------------------------------------------------------------------- /ds/superset/README.md: -------------------------------------------------------------------------------- 1 | Install a local copy 2 | 3 | https://superset.apache.org/docs/installation/installing-superset-using-docker-compose 4 | 5 | I made some changes to the instructions found in the doc above: 6 | 7 | - make sure the local PostgreSQL is stopped, otherwise Superset runs into a port conflict 8 | - ran into this problem https://github.com/apache/superset/issues/12723 with Docker Desktop at 2G memory - increased it to 7.5G 9 | 10 | I did the following: 11 | 12 | - get the code: `git clone https://github.com/apache/superset.git` 13 | - make sure to use the master branch: `git checkout master` 14 | - change redis from 3.2 to latest in `docker-compose-non-dev.yml` 15 | Screen Shot 2021-03-23 at 9 51 25 PM 16 | - use `docker-compose -f docker-compose-non-dev.yml up` to start the server 17 | - wait some time; `superset_init` exiting with 0 is expected - it does not affect the server: 18 | Screen Shot 2021-03-23 at 9 47 56 PM 19 | - log in at http://localhost:8088/ using admin/admin -------------------------------------------------------------------------------- /ds/time-series-additive-model/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This tutorial was originally published by William Koehrsen at https://towardsdatascience.com/time-series-analysis-in-python-an-introduction-70d5a5b1d52a 3 | 4 | # Setup (Mac) 5 | 6 | - Install python 3 using Homebrew if not done yet: `$ brew install python3` 7 | 8 | - Create a python3 virtualenv: `virtualenv -p python3 venv` 9 | 10 | - Activate it: `source venv/bin/activate` 11 | 12 | - Install packages: `$ pip3 install -r requirements.txt`, which includes quandl, seaborn, matplotlib, numpy, pandas, scipy, scikit-learn, and fbprophet 13 | 14 | - Change the API key for Quandl: We will access financial data using the Quandl library. Please go to https://www.quandl.com/ and register to get your api_key. You will need to use your own api_key to pull data from the quandl financial library. **You should never put your real API key in the code and push it to GitHub.** We use a local environment variable for the API key: `quandl.ApiConfig.api_key = os.environ.get('QUANDL_KEY')`.
You need to add one line `export QUANDL_KEY='your_real_api_key'` to the `~/.bash_profile` file (use `vim` to edit, `source` to execute it, then use `env` to double check): 15 | ``` 16 | $ vim ~/.bash_profile 17 | $ source ~/.bash_profile 18 | $ env 19 | ``` 20 | **NOTE: You may need to close the Terminal window and restart it for Jupyter Notebook to read the new QUANDL_KEY you just added.** 21 | 22 | # Run 23 | 24 | - Start Virtual Env: 25 | ``` 26 | $ virtualenv -p python3 venv 27 | $ source venv/bin/activate 28 | ``` 29 | - Run Jupyter: `jupyter notebook` 30 | - Run additive_models.ipynb 31 | 32 | ### TODO: Get rid of the Deprecation Warnings 33 | -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/Workbook1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/Workbook1.xlsx -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/gm_sales.csv: -------------------------------------------------------------------------------- 1 | Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total 2 | 2017,195909,237388,256224,244406,237364,243151,226107,275552,279397,252813,245387,308539,3002237 3 | 2016,203745,227825,252128,259557,240449,255209,267258,256429,249795,258626,252644,319108,3042773 4 | 2015,202786,231378,249875,269055,293097,259346,272512,270480,251310,262993,229296,290230,3082358 5 | 2014,171486,222104,256047,254076,284694,267461,256160,272422,223437,226819,225818,274483,2935007 6 | 2013,194699,224314,245950,237646,252894,264843,234071,275847,187195,226402,212060,230157,2786078 7 | 2012,167962,209306,231052,213387,245256,248750,201237,240520,210245,195764,186505,245733,2595717 8 | 2011,178896,207028,206621,232538,221192,215358,214915,218479,207145,186895,180402,234351,2503820 9 | 2010,145098,138849,185406,183091,222305,194828,199432,184921,172969,183392,168704,223932,2202927 -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/gm_sales.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/gm_sales.xlsx -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/recessions.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/recessions.csv -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/recessions.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ds/time-series-additive-model/data/recessions.xlsx -------------------------------------------------------------------------------- /ds/time-series-additive-model/data/tesla_search_terms.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:699ee134222a9b87c1434bd28d7e66ab0a163788d22c84af35386459961ac202 3 | 
size 880 4 | -------------------------------------------------------------------------------- /ds/time-series-additive-model/requirements.txt: -------------------------------------------------------------------------------- 1 | quandl 2 | seaborn 3 | matplotlib 4 | numpy 5 | pandas 6 | scipy 7 | scikit-learn 8 | pystan 9 | fbprophet 10 | jupyter 11 | -------------------------------------------------------------------------------- /ds/time-series-basics/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | Time Series Analysis with Pandas: https://www.dataquest.io/blog/tutorial-time-series-analysis-with-pandas/ 4 | 5 | ## Setup 6 | 7 | Tested with Python 3.6 via virtual environment. Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages: 8 | 9 | 10 | ```shell 11 | $ python3.6 -m venv venv 12 | $ source venv/bin/activate 13 | $ pip install -r requirements.txt 14 | ``` 15 | 16 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 17 | -------------------------------------------------------------------------------- /ds/time-series-basics/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter==1.0.0 2 | pandas==1.0.0 3 | matplotlib==3.1.2 4 | seaborn==0.10.0 5 | -------------------------------------------------------------------------------- /ml/attention/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1v_S1r-iPuuVAkqVo8hAL6-OsQ-hZhGx8) 4 | 5 | 6 | I combined and revised the following tutorials: 7 | - https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/ 8 | - https://towardsdatascience.com/light-on-math-ml-attention-with-keras-dc8dbc1fad39 9 | 10 | ## Setup 11 | 12 | within the tutorial folder: 13 | 14 | ``` 15 | python3 -m venv venv 16 | source venv/bin/activate 17 | pip install -r requirements.txt 18 | ``` 19 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 20 | 21 | -------------------------------------------------------------------------------- /ml/attention/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | matplotlib==3.1.0 # latest version breaks the seaborn heatmap 3 | seaborn 4 | -------------------------------------------------------------------------------- /ml/autogluon/README.md: -------------------------------------------------------------------------------- 1 | ## Kaggle Kernel 2 | 3 | You can run this kernel directly at Kaggle.com: https://www.kaggle.com/harrywang/housing-price-prediction 4 | 5 | ## Run Locally 6 | 7 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages: 8 | 9 | ``` 10 | $ cd path_to_this folder 11 | $ virtualenv -p python3 venv 12 | $ source venv/bin/activate 13 | $ pip3 install -r requirements.txt 14 | ``` 15 | 16 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 17 | 18 | ## Source 19 | 20 | This is the dataset used in this book: https://github.com/ageron/handson-ml/tree/master/datasets/housing to illustrate a sample end-to-end ML project workflow (pipeline). This is a great book - I highly recommend! 21 | 22 | The data is based on California Census in 1990. 
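For reference, the AutoGluon side of this folder compresses most of the workflow steps below into a few calls; a hedged sketch against the bundled `input/housing.csv`, assuming a recent `autogluon.tabular` API (the target column in this dataset is `median_house_value`):

```python
# Hedged sketch: AutoGluon regression on the bundled housing data.
# Assumes a recent autogluon.tabular API; adjust for the installed version.
from autogluon.tabular import TabularDataset, TabularPredictor

data = TabularDataset('input/housing.csv')
train = data.sample(frac=0.8, random_state=0)  # simple random split
test = data.drop(train.index)

predictor = TabularPredictor(label='median_house_value',
                             path='agModels-housing').fit(train)
print(predictor.leaderboard(test))  # compares every model AutoGluon trained
```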
23 | 24 | ### About the Data (from the book): 25 | 26 | "This dataset is a modified version of the California Housing dataset available from Luís Torgo's page (University of Porto). Luís Torgo obtained it from the StatLib repository (which is closed now). The dataset may also be downloaded from StatLib mirrors. 27 | 28 | The following is the description from the book author: 29 | 30 | This dataset appeared in a 1997 paper titled Sparse Spatial Autoregressions by Pace, R. Kelley and Ronald Barry, published in the Statistics and Probability Letters journal. They built it using the 1990 California census data. It contains one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people). 31 | 32 | The dataset in this directory is almost identical to the original, with two differences: 33 | 207 values were randomly removed from the total_bedrooms column, so we can discuss what to do with missing data. 34 | An additional categorical attribute called ocean_proximity was added, indicating (very roughly) whether each block group is near the ocean, near the Bay area, inland or on an island. This allows discussing what to do with categorical data. 35 | Note that the block groups are called "districts" in the Jupyter notebooks, simply because in some contexts the name "block group" was confusing." 36 | 37 | ### About the Data (From Luís Torgo page): 38 | http://www.dcc.fc.up.pt/%7Eltorgo/Regression/cal_housing.html 39 | 40 | This is a dataset obtained from the StatLib repository. Here is the included description: 41 | 42 | "We collected information on the variables using all the block groups in California from the 1990 Census. In this sample a block group on average includes 1425.5 individuals living in a geographically compact area. Naturally, the geographical area included varies inversely with the population density. We computed distances among the centroids of each block group as measured in latitude and longitude. We excluded all the block groups reporting zero entries for the independent and dependent variables. The final data contained 20,640 observations on 9 variables. The dependent variable is ln(median house value)." 43 | 44 | 45 | ### End-to-End ML Project Steps (Chapter 2 of the book) 46 | 47 | 1. Look at the big picture 48 | 2. Get the data 49 | 3. Discover and visualize the data to gain insights 50 | 4. Prepare the data for Machine Learning algorithms 51 | 5. Select a model and train it 52 | 6. Fine-tune your model 53 | 7. Present your solution 54 | 8. Launch, monitor, and maintain your system 55 | 56 | ## The 10-Step Machine Learning Project Workflow (My Version) 57 | 58 | 1. Define the business objective 59 | 2. Make sense of the data from a high level 60 | - data types (number, text, object, etc.) 61 | - continuous/discrete 62 | - basic stats (min, max, std, median, etc.) using boxplot 63 | - frequency via histogram 64 | - scales and distributions of different features 65 | 3. Create the training and test sets using proper sampling methods, e.g., random vs. stratified 66 | 4. Correlation analysis (pair-wise and attribute combinations) 67 | 5. Data cleaning (missing data, outliers, data errors) 68 | 6. Data transformation via pipelines (categorical text to number using one hot encoding, feature scaling via normalization/standardization, feature combinations) 69 | 7.
Train and cross validate different models and select the most promising one (Linear Regression, Decision Tree, and Random Forest were tried in this tutorial) 70 | 8. Fine tune the model using trying different combinations of hyperparameters 71 | 9. Evaluate the model with best estimators in the test set 72 | 10. Launch, monitor, and refresh the model and system 73 | -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/learner.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/learner.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/CatBoost/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/CatBoost/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/ExtraTreesEntr/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/ExtraTreesEntr/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/ExtraTreesGini/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/ExtraTreesGini/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/KNeighborsDist/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/KNeighborsDist/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/KNeighborsUnif/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/KNeighborsUnif/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/LightGBM/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBM/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/LightGBMLarge/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBMLarge/model.pkl 
-------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/LightGBMXT/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/LightGBMXT/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model-internals.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model-internals.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/NeuralNetFastAI/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/RandomForestEntr/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/RandomForestEntr/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/RandomForestGini/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/RandomForestGini/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/model_template.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/model_template.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/oof.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/WeightedEnsemble_L2/utils/oof.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/XGBoost/model.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/XGBoost/model.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/models/trainer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/models/trainer.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/predictor.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/predictor.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/utils/data/X.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/X.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/utils/data/X_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/X_val.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/utils/data/y.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/y.pkl -------------------------------------------------------------------------------- /ml/autogluon/agModels-predictClass/utils/data/y_val.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/autogluon/agModels-predictClass/utils/data/y_val.pkl -------------------------------------------------------------------------------- /ml/autogluon/input/anscombe.csv: -------------------------------------------------------------------------------- 1 | dataset,x,y 2 | I,10.0,8.04 3 | I,8.0,6.95 4 | I,13.0,7.58 5 | I,9.0,8.81 6 | I,11.0,8.33 7 | I,14.0,9.96 8 | I,6.0,7.24 9 | I,4.0,4.26 10 | I,12.0,10.84 11 | I,7.0,4.82 12 | I,5.0,5.68 13 | II,10.0,9.14 14 | II,8.0,8.14 15 | II,13.0,8.74 16 | II,9.0,8.77 17 | II,11.0,9.26 18 | II,14.0,8.1 19 | II,6.0,6.13 20 | II,4.0,3.1 21 | II,12.0,9.13 22 | II,7.0,7.26 23 | II,5.0,4.74 24 | III,10.0,7.46 25 | III,8.0,6.77 26 | III,13.0,12.74 27 | III,9.0,7.11 28 | III,11.0,7.81 29 | III,14.0,8.84 30 | III,6.0,6.08 31 | III,4.0,5.39 32 | III,12.0,8.15 33 | III,7.0,6.42 34 | III,5.0,5.73 35 | IV,8.0,6.58 36 | IV,8.0,5.76 37 | IV,8.0,7.71 38 | IV,8.0,8.84 39 | IV,8.0,8.47 40 | IV,8.0,7.04 41 | IV,8.0,5.25 42 | IV,19.0,12.5 43 | IV,8.0,5.56 44 | IV,8.0,7.91 45 | IV,8.0,6.89 46 | -------------------------------------------------------------------------------- /ml/autogluon/requirements.txt: 
-------------------------------------------------------------------------------- 1 | jupyter 2 | autogluon 3 | -------------------------------------------------------------------------------- /ml/clearml-server/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | Set up the ClearML server locally on Mac: https://allegro.ai/docs/deploying_trains/trains_server_linux_mac/ 4 | 5 | For how to use ClearML, check out https://github.com/harrywang/tutorial-buffet/tree/master/clearml 6 | 7 | Note: I set Docker's memory limit to 6G 8 | 9 | - Make sure Docker is running correctly: `docker run hello-world` 10 | - Create the mounting folder: `sudo mkdir /opt/trains`, then open the Docker app. On the File Sharing tab, add `/opt/trains`. 11 | 12 | **NOTE: you have to restart the Docker app after this step!!** 13 | Screen Shot 2020-12-24 at 10 16 29 AM 14 | 15 | - By default, Elasticsearch is mounted at `/opt/trains/data/elastic_7`; you need to create the folder and then give it write permission as follows: 16 | 17 | ``` 18 | $ sudo mkdir -p /opt/trains/data/elastic_7 19 | $ chmod 777 /opt/trains/data/elastic_7 20 | ``` 21 | 22 | - Grant the Docker containers access to the folder (macOS shown): 23 | ``` 24 | sudo chown -R $(whoami):staff /opt/trains 25 | ``` 26 | 27 | - Download `docker-compose.yml` to the `/opt/trains` folder: 28 | ``` 29 | sudo curl https://raw.githubusercontent.com/allegroai/trains-server/master/docker-compose.yml -o /opt/trains/docker-compose.yml 30 | ``` 31 | - Start the server: `docker-compose -f /opt/trains/docker-compose.yml up -d` 32 | 33 | Then go to http://localhost:8080/ to log in 34 | 35 | Screen Shot 2020-12-25 at 11 26 45 AM 36 | 37 | - Restart: 38 | 39 | ``` 40 | docker-compose -f /opt/trains/docker-compose.yml down 41 | docker-compose -f /opt/trains/docker-compose.yml up -d 42 | ``` 43 | 44 | -------------------------------------------------------------------------------- /ml/clearml/matplotlib/matplotlib_example.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of Matplotlib and Seaborn integration and reporting 2 | # 3 | import matplotlib 4 | matplotlib.use('agg') # use agg instead of tkinter 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | from clearml import Task 9 | 10 | 11 | task = Task.init(project_name='examples', task_name='Matplotlib example by Harry') 12 | 13 | # Create a plot 14 | N = 50 15 | x = np.random.rand(N) 16 | y = np.random.rand(N) 17 | colors = np.random.rand(N) 18 | area = (30 * np.random.rand(N))**2 # 0 to 15 point radii 19 | plt.scatter(x, y, s=area, c=colors, alpha=0.5) 20 | # Plot will be reported automatically 21 | plt.show() 22 | 23 | # Alternatively, in order to report the plot with a more meaningful title/series and iteration number 24 | area = (40 * np.random.rand(N))**2 25 | plt.scatter(x, y, s=area, c=colors, alpha=0.5) 26 | task.logger.report_matplotlib_figure(title="My Plot Title", series="My Plot Series", iteration=10, figure=plt) 27 | plt.show() 28 | 29 | # Create another plot - with a name 30 | x = np.linspace(0, 10, 30) 31 | y = np.sin(x) 32 | plt.plot(x, y, 'o', color='black') 33 | # Plot will be reported automatically 34 | plt.show() 35 | 36 | # Create image plot 37 | m = np.eye(256, 256, dtype=np.uint8) 38 | plt.imshow(m) 39 | # Plot will be reported automatically 40 | plt.show() 41 | 42 | # Create image plot - with a name 43 | m = np.eye(256, 256, dtype=np.uint8) 44 | plt.imshow(m) 45 |
plt.title('Image Title') 46 | # Plot will be reported automatically 47 | plt.show() 48 | 49 | sns.set(style="darkgrid") 50 | # Load an example dataset with long-form data 51 | fmri = sns.load_dataset("fmri") 52 | # Plot the responses for different events and regions 53 | sns.lineplot(x="timepoint", y="signal", 54 | hue="region", style="event", 55 | data=fmri) 56 | # Plot will be reported automatically 57 | plt.show() 58 | 59 | print('This is a Matplotlib & Seaborn example') 60 | -------------------------------------------------------------------------------- /ml/clearml/matplotlib/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib >= 3.1.1 ; python_version >= '3.6' 2 | matplotlib >= 2.2.4 ; python_version < '3.6' 3 | seaborn 4 | clearml -------------------------------------------------------------------------------- /ml/clearml/matplotlib/mlp_grouped_errorbar.py: -------------------------------------------------------------------------------- 1 | # Grouped bar chart with percentage change bars and labels 2 | import matplotlib 3 | matplotlib.use('agg') # use agg instead of tkinter 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | plt.style.use('seaborn') 7 | 8 | from clearml import Task 9 | task = Task.init(project_name='examples', task_name='Matplotlib GroupedBar example by Harry') 10 | 11 | men_means = np.array([20, 35, 30, 35, 27]) 12 | women_means = np.array([25, 32, 34, 20, 25]) 13 | 14 | ind = np.arange(len(men_means)) # the x locations for the groups 15 | width = 0.35 # the width of the bars 16 | 17 | fig, ax = plt.subplots() 18 | 19 | rects1 = ax.bar(ind - width/2, men_means, width, 20 | label='Men') 21 | rects2 = ax.bar(ind + width/2, women_means, width, 22 | label='Women') 23 | 24 | # Add some text for labels, title and custom x-axis tick labels, etc. 25 | ax.set_ylabel('Scores') 26 | ax.set_title('Scores by group and gender') 27 | ax.set_xticks(ind) 28 | ax.set_xticklabels(('G1', 'G2', 'G3', 'G4', 'G5')) 29 | ax.legend() 30 | 31 | 32 | def autolabel(rects, xpos='center'): 33 | """ 34 | Attach a text label above each bar in *rects*, displaying its height. 35 | 36 | *xpos* indicates which side to place the text w.r.t. the center of 37 | the bar. It can be one of the following {'center', 'right', 'left'}.
38 | 39 | ha: horizontal alignment 40 | """ 41 | 42 | ha = {'center': 'center', 'right': 'left', 'left': 'right'} 43 | offset = {'center': 0, 'right': 1, 'left': -1} 44 | 45 | for rect in rects: 46 | height = rect.get_height() 47 | ax.annotate('{}'.format(height), 48 | xy=(rect.get_x() + rect.get_width() / 2, height), 49 | xytext=(offset[xpos]*3, 3), # use 3 points offset 50 | textcoords="offset points", # in both directions 51 | ha=ha[xpos], va='bottom') 52 | 53 | 54 | autolabel(rects1) 55 | autolabel(rects2) 56 | 57 | # custom error bar 58 | diff = (men_means - women_means)/2 59 | change_percentage = np.abs((men_means - women_means)/men_means) 60 | errorbar_y = men_means - diff # the y of the error bar 61 | errorbar_x_offset = 0.1 62 | 63 | 64 | # show the small caps on error bar ends: 65 | # capsize=3 (cap length) AND markeredgewidth=1 (cap line width - default is 0) 66 | # elinewidth=1 is the error bar line width 67 | ax.errorbar(ind + width + errorbar_x_offset, errorbar_y, 68 | yerr=diff, fmt='none', elinewidth=1, 69 | capsize=3, markeredgewidth=1) 70 | 71 | # show the change percentage labels 72 | 73 | errorbar_text_offset = 0.625 # the offset from the men's bar x location 74 | 75 | for i in range(len(rects1)): 76 | # find the higher bar to determine label height 77 | height1 = rects1[i].get_height() 78 | height2 = rects2[i].get_height() 79 | height = height1 if height1 > height2 else height2 80 | 81 | # add the percentage change labels 82 | ax.annotate(f'{change_percentage[i]:.1%}', # the text 83 | xy=(ind[i] - width/2 + errorbar_text_offset, height), # x y for the text 84 | xytext=(0, 3), # 0 points horizontal and 3 points vertical offsets 85 | textcoords="offset points", # in both directions 86 | ha='center', # horizontal alignment 87 | va='bottom') # vertical alignment 88 | 89 | fig.tight_layout() 90 | 91 | plt.show() -------------------------------------------------------------------------------- /ml/clearml/matplotlib/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib >= 3.1.1 ; python_version >= '3.6' 2 | matplotlib >= 2.2.4 ; python_version < '3.6' 3 | seaborn 4 | clearml -------------------------------------------------------------------------------- /ml/clearml/pytorch/manual_model_upload.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of manual model configuration and uploading 2 | # 3 | import os 4 | from tempfile import gettempdir 5 | 6 | import torch 7 | from trains import Task 8 | 9 | 10 | task = Task.init(project_name='examples', task_name='Model configuration and upload') 11 | 12 | # create a (stub) model instance 13 | model = torch.nn.Module() 14 | 15 | # Connect a local configuration file 16 | config_file = os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json') 17 | config_file = task.connect_configuration(config_file) 18 | # then read configuration as usual; the backend will contain a copy of it.
19 | # later when executing remotely, the returned `config_file` will be a temporary file 20 | # containing a new copy of the configuration retrieved from the backend 21 | # model_config_dict = json.load(open(config_file, 'rt')) 22 | 23 | # Or store a dictionary defining a specific network design 24 | model_config_dict = { 25 | 'value': 13.37, 26 | 'dict': {'sub_value': 'string', 'sub_integer': 11}, 27 | 'list_of_ints': [1, 2, 3, 4], 28 | } 29 | model_config_dict = task.connect_configuration(model_config_dict) 30 | 31 | # We now update the dictionary after connecting it, and the changes will be tracked as well. 32 | model_config_dict['new value'] = 10 33 | model_config_dict['value'] *= model_config_dict['new value'] 34 | 35 | # store the label enumeration of the training model 36 | labels = {'background': 0, 'cat': 1, 'dog': 2} 37 | task.connect_label_enumeration(labels) 38 | 39 | # store the model; it will have the task's network configuration and label enumeration 40 | print('Any model stored from this point onwards will contain both model_config and label_enumeration') 41 | 42 | torch.save(model, os.path.join(gettempdir(), "model.pt")) 43 | print('Model saved') 44 | -------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/audio/README.md: -------------------------------------------------------------------------------- 1 | The `audio_classifier_UrbanSound8K.ipynb` example uses a small dataset based on the [UrbanSound8K dataset](https://urbansounddataset.weebly.com/urbansound8k.html). -------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/audio/audio_preprocessing_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "! pip install -U pip\n", 12 | "! pip install -U torch==1.5.1\n", 13 | "! pip install -U torchaudio==0.5.1\n", 14 | "! pip install -U matplotlib==3.2.1\n", 15 | "! pip install -U trains>=0.16.1\n", 16 | "! 
pip install -U tensorboard==2.2.1" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import torch\n", 27 | "import torchaudio\n", 28 | "from torch.utils.tensorboard import SummaryWriter\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "\n", 31 | "from trains import Task\n", 32 | "\n", 33 | "%matplotlib inline" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "task = Task.init(project_name='Audio Example', task_name='data pre-processing')\n", 43 | "configuration_dict = {'number_of_samples': 3}\n", 44 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n", 45 | "print(configuration_dict) # printing actual configuration (after override in remote mode)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "tensorboard_writer = SummaryWriter('./tensorboard_logs')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "scrolled": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "if not os.path.isdir('./data'):\n", 66 | " os.mkdir('./data')\n", 67 | "yesno_data = torchaudio.datasets.YESNO('./data', download=True)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def plot_signal(signal, title, cmap=None):\n", 77 | " plt.figure()\n", 78 | " if signal.ndim == 1:\n", 79 | " plt.plot(signal)\n", 80 | " else:\n", 81 | " plt.imshow(signal, cmap=cmap) \n", 82 | " plt.title(title)\n", 83 | " plt.show()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "pycharm": { 91 | "name": "#%%\n" 92 | }, 93 | "scrolled": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "fixed_sample_rate = 22050\n", 98 | "for n in range(configuration_dict.get('number_of_samples', 3)):\n", 99 | " audio, sample_rate, labels = yesno_data[n]\n", 100 | " tensorboard_writer.add_audio('Audio samples/{}'.format(n), audio, n, sample_rate)\n", 101 | " \n", 102 | " resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=fixed_sample_rate)\n", 103 | " melspectogram_transform = torchaudio.transforms.MelSpectrogram(sample_rate=fixed_sample_rate, n_mels=128)\n", 104 | " \n", 105 | " audio_mono = torch.mean(resample_transform(audio), dim=0, keepdim=True)\n", 106 | " plot_signal(audio_mono[0,:], 'Original waveform')\n", 107 | " \n", 108 | " melspectogram = melspectogram_transform(audio_mono)\n", 109 | " plot_signal(melspectogram.squeeze().numpy(), 'Mel spectogram', 'hot')\n", 110 | " plot_signal(torchaudio.transforms.AmplitudeToDB()(melspectogram).squeeze().numpy(), 'Mel spectogram DB', 'hot')" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.4" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | 
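Stripped of the ClearML/TensorBoard logging, the transform chain in the notebook above reduces to a few torchaudio calls; a sketch against the same 0.5-era API pinned in the first cell (`some_clip.wav` is a hypothetical input file):

```python
# Stripped-down sketch of the notebook's resample -> mel spectrogram chain.
import torch
import torchaudio

waveform, sample_rate = torchaudio.load('some_clip.wav')  # hypothetical file
resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=22050)
to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=22050, n_mels=128)

mono = torch.mean(resample(waveform), dim=0, keepdim=True)  # collapse channels
mel_db = torchaudio.transforms.AmplitudeToDB()(to_mel(mono))
print(mel_db.shape)  # (1, n_mels, time_frames)
```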
-------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/table/download_and_split.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "! pip install -U pip\n", 10 | "! pip install -U trains==0.16.2rc0\n", 11 | "! pip install -U pandas==1.0.4\n", 12 | "! pip install -U scikit-learn==0.23.1\n", 13 | "! pip install -U pathlib2==2.3.5" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "from pathlib2 import Path\n", 24 | "from sklearn.model_selection import train_test_split\n", 25 | "\n", 26 | "from trains import Task" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "task = Task.init(project_name='Tabular Example', task_name='Download and split tabular dataset')\n", 36 | "logger = task.get_logger()\n", 37 | "configuration_dict = {'test_size': 0.1, 'split_random_state': 0}\n", 38 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n", 39 | "print(configuration_dict) # printing actual configuration (after override in remote mode)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# **Downloading**" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes)\n", 56 | "# and save it to your cloud storage or your mounted local storage\n", 57 | "# If the data is on your cloud storage, you can use trains' storage manager to get a local copy of it:\n", 58 | "# from trains.storage import StorageManager\n", 59 | "# path_to_ShelterAnimal = StorageManager.get_local_copy(\"https://allegro-datasets.s3.amazonaws.com/trains/UrbanSound8K.zip\", \n", 60 | "# extract_archive=True)\n", 61 | "path_to_ShelterAnimal = '/home/sam/Datasets/shelter-animal-outcomes'" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "train_set = pd.read_csv(Path(path_to_ShelterAnimal) / 'train.csv')\n", 71 | "logger.report_table(title='Trainset - raw',series='pandas DataFrame',iteration=0, table_plot=train_set.head())" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "# **Splitting to train and val**" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "X = train_set.drop(columns= ['OutcomeType'])\n", 88 | "Y = train_set['OutcomeType']\n", 89 | "X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=configuration_dict.get('test_size', 0.1), \n", 90 | " random_state=configuration_dict.get('split_random_state', 0))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "train_df = X_train.join(Y_train)\n", 100 | "val_df = X_val.join(Y_val)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | 
"task.upload_artifact('train_data', artifact_object=train_df)\n", 110 | "task.upload_artifact('val_data', artifact_object=val_df)" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.4" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/table/pick_best_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "! pip install -U pip\n", 10 | "! pip install -U trains==0.16.2rc0" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from trains import Task, OutputModel" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "task = Task.init(project_name='Tabular Example', task_name='pick best model')\n", 29 | "configuration_dict = {'train_tasks_ids': ['c9bff3d15309487a9e5aaa00358ff091', 'c9bff3d15309487a9e5aaa00358ff091']}\n", 30 | "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n", 31 | "print(configuration_dict) # printing actual configuration (after override in remote mode)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "results = {}\n", 41 | "for task_id in configuration_dict.get('train_tasks_ids'):\n", 42 | " train_task = Task.get_task(task_id)\n", 43 | " results[task_id] = train_task.get_last_scalar_metrics()['accuracy']['total']['last']" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "print(results)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "best_model_task_id = max(results.items(), key=lambda x: x[1])[0]\n", 62 | "best_model_id = Task.get_task(best_model_task_id).output_model_id" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "OutputModel(base_model_id=best_model_id)" 72 | ] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.7.4" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 4 96 | } 97 | -------------------------------------------------------------------------------- /ml/clearml/pytorch/notebooks/table/tabular_ml_pipeline.ipynb: -------------------------------------------------------------------------------- 
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# pip install with locked versions\n", 10 | "! pip install -U pip\n", 11 | "! pip install -U trains==0.16.2rc0" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from trains import Task\n", 21 | "from trains.automation.controller import PipelineController" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "task = Task.init(project_name='Tabular Example', task_name='tabular training pipeline', task_type=Task.TaskTypes.controller)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "pipe = PipelineController(default_execution_queue='dan_queue', add_pipeline_tags=True)\n", 40 | "pipe.add_step(name='preprocessing_1', base_task_project='Tabular Example', base_task_name='tabular preprocessing',\n", 41 | " parameter_override={'General/data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n", 42 | " 'General/fill_categorical_NA': 'True',\n", 43 | " 'General/fill_numerical_NA': 'True'})\n", 44 | "pipe.add_step(name='preprocessing_2', base_task_project='Tabular Example', base_task_name='tabular preprocessing',\n", 45 | " parameter_override={'General/data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n", 46 | " 'General/fill_categorical_NA': 'False',\n", 47 | " 'General/fill_numerical_NA': 'True'})\n", 48 | " \n", 49 | "pipe.add_step(name='train_1', parents=['preprocessing_1'],\n", 50 | " base_task_project='Tabular Example', base_task_name='tabular prediction',\n", 51 | " parameter_override={'General/data_task_id': '${preprocessing_1.id}'})\n", 52 | "pipe.add_step(name='train_2', parents=['preprocessing_2'],\n", 53 | " base_task_project='Tabular Example', base_task_name='tabular prediction',\n", 54 | " parameter_override={'General/data_task_id': '${preprocessing_2.id}'})\n", 55 | " \n", 56 | "pipe.add_step(name='pick_best', parents=['train_1', 'train_2'],\n", 57 | " base_task_project='Tabular Example', base_task_name='pick best model',\n", 58 | " parameter_override={'General/train_tasks_ids': '[${train_1.id}, ${train_2.id}]'}) " 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Starting the pipeline (in the background)\n", 68 | "pipe.start()\n", 69 | "# Wait until pipeline terminates\n", 70 | "pipe.wait()\n", 71 | "# cleanup everything\n", 72 | "pipe.stop()" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.7.4" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 4 97 | } 98 | -------------------------------------------------------------------------------- /ml/clearml/pytorch/pytorch_tensorboardx.py: -------------------------------------------------------------------------------- 1 | ../tensorboardx/pytorch_tensorboardX.py -------------------------------------------------------------------------------- 
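Stepping back to the tabular pipeline notebook above: each `pipe.add_step` clones an existing base task and rewrites its `General/...` parameters, so a step only receives the task ID of its parent. A minimal sketch of the consuming side, assuming the referenced 'tabular preprocessing' task follows the same `task.connect` pattern as the other notebooks in this folder (the parameter name is taken from the `parameter_override` calls above; the artifact name comes from `download_and_split.ipynb`):

```
from trains import Task

task = Task.init(project_name='Tabular Example', task_name='tabular preprocessing')
configuration_dict = {'data_task_id': ''}  # filled in by the controller via General/data_task_id
configuration_dict = task.connect(configuration_dict)

# fetch the artifact uploaded by the parent task
data_task = Task.get_task(task_id=configuration_dict['data_task_id'])
train_df = data_task.artifacts['train_data'].get()
```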
/ml/clearml/pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | tensorboardX 3 | tensorboard>=1.14.0 4 | torch>=1.1.0 5 | torchvision>=0.3.0 6 | clearml -------------------------------------------------------------------------------- /ml/clearml/pytorch/tensorboard_toy_pytorch.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tempfile import gettempdir 3 | 4 | import numpy as np 5 | from PIL import Image 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from trains import Task 9 | task = Task.init(project_name='examples', task_name='pytorch tensorboard toy example') 10 | 11 | 12 | writer = SummaryWriter(log_dir=os.path.join(gettempdir(), 'tensorboard_logs')) 13 | 14 | # convert to 4d [batch, col, row, RGB-channels] 15 | image_open = Image.open(os.path.join("..", "..", "reporting", "data_samples", "picasso.jpg")) 16 | image = np.asarray(image_open) 17 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis] 18 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2) 19 | image_rgba = image_rgba[np.newaxis, :, :, :] 20 | image = image[np.newaxis, :, :, :] 21 | 22 | writer.add_image("test/first", image[0], dataformats='HWC') 23 | writer.add_image("test_gray/second", image_gray[0], dataformats='HWC') 24 | writer.add_image("test_rgba/third", image_rgba[0], dataformats='HWC') 25 | # writer.add_image("image/first_series", image, max_outputs=10) 26 | # writer.add_image("image_gray/second_series", image_gray, max_outputs=10) 27 | # writer.add_image("image_rgba/third_series", image_rgba, max_outputs=10) 28 | 29 | print('Done!') 30 | -------------------------------------------------------------------------------- /ml/clearml/requirements.txt: -------------------------------------------------------------------------------- 1 | PyJWT==1.7.1 2 | -------------------------------------------------------------------------------- /ml/clearml/scikit-learn/model-harry.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/clearml/scikit-learn/model-harry.pkl -------------------------------------------------------------------------------- /ml/clearml/scikit-learn/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/clearml/scikit-learn/model.pkl -------------------------------------------------------------------------------- /ml/clearml/scikit-learn/requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.13.2 2 | matplotlib >= 3.1.1 ; python_version >= '3.6' 3 | matplotlib >= 2.2.4 ; python_version < '3.6' 4 | scikit-learn 5 | clearml -------------------------------------------------------------------------------- /ml/clearml/scikit-learn/sklearn_joblib_example.py: -------------------------------------------------------------------------------- 1 | try: 2 | import joblib 3 | except ImportError: 4 | from sklearn.externals import joblib 5 | 6 | from sklearn import datasets 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.model_selection import train_test_split 9 | import numpy as np 10 | import matplotlib 11 | matplotlib.use('agg') # use agg instead of tkinter 
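# (note) the non-interactive 'agg' backend renders figures without a display, so this
# script can run on headless machines or remote agents; the backend must be selected
# before pyplot is imported below.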
12 | import matplotlib.pyplot as plt 13 | 14 | 15 | 16 | from clearml import Task 17 | 18 | task = Task.init(project_name="examples", task_name="scikit-learn joblib example") 19 | 20 | iris = datasets.load_iris() 21 | X = iris.data 22 | y = iris.target 23 | 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 25 | 26 | model = LogisticRegression(solver='liblinear', multi_class='auto') # sklearn LogisticRegression class 27 | model.fit(X_train, y_train) 28 | 29 | joblib.dump(model, 'model-harry.pkl', compress=True) 30 | 31 | loaded_model = joblib.load('model.pkl') 32 | result = loaded_model.score(X_test, y_test) 33 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 34 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 35 | h = .02 # step size in the mesh 36 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 37 | plt.figure(1, figsize=(4, 3)) 38 | 39 | plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired) 40 | plt.xlabel('Sepal length') 41 | plt.ylabel('Sepal width') 42 | 43 | plt.xlim(xx.min(), xx.max()) 44 | plt.ylim(yy.min(), yy.max()) 45 | plt.xticks(()) 46 | plt.yticks(()) 47 | 48 | plt.show() 49 | -------------------------------------------------------------------------------- /ml/clearml/tensorflow/legacy/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard>=1.14.0 2 | tensorflow>=1.14.0 3 | -------------------------------------------------------------------------------- /ml/clearml/tensorflow/legacy/tensorboard_toy.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of tensorboard with tensorflow (without any actual training) 2 | # 3 | import os 4 | from tempfile import gettempdir 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | from PIL import Image 9 | 10 | from trains import Task 11 | task = Task.init(project_name='examples', task_name='tensorboard toy example') 12 | 13 | 14 | k = tf.placeholder(tf.float32) 15 | 16 | # Make a normal distribution, with a shifting mean 17 | mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1) 18 | # Record that distribution into a histogram summary 19 | tf.summary.histogram("normal/moving_mean", mean_moving_normal) 20 | tf.summary.scalar("normal/value", mean_moving_normal[-1]) 21 | 22 | # Make a normal distribution with shrinking variance 23 | variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k)) 24 | # Record that distribution too 25 | tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal) 26 | tf.summary.scalar("normal/variance_shrinking_normal", variance_shrinking_normal[-1]) 27 | 28 | # Let's combine both of those distributions into one dataset 29 | normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0) 30 | # We add another histogram summary to record the combined distribution 31 | tf.summary.histogram("normal/bimodal", normal_combined) 32 | tf.summary.scalar("normal/normal_combined", normal_combined[0]) 33 | 34 | # Add a gamma distribution 35 | gamma = tf.random_gamma(shape=[1000], alpha=k) 36 | tf.summary.histogram("gamma", gamma) 37 | 38 | # And a poisson distribution 39 | poisson = tf.random_poisson(shape=[1000], lam=k) 40 | tf.summary.histogram("poisson", poisson) 41 | 42 | # And a uniform distribution 43 | uniform = tf.random_uniform(shape=[1000], maxval=k*10) 44 | tf.summary.histogram("uniform", uniform) 45 | 46 | # Finally, 
combine everything together! 47 | all_distributions = [mean_moving_normal, variance_shrinking_normal, gamma, poisson, uniform] 48 | all_combined = tf.concat(all_distributions, 0) 49 | tf.summary.histogram("all_combined", all_combined) 50 | 51 | # Log text value 52 | tf.summary.text("this is a test", tf.make_tensor_proto("This is the content", dtype=tf.string)) 53 | 54 | # convert to 4d [batch, col, row, RGB-channels] 55 | image_open = Image.open(os.path.join("..", "..", "..", "reporting", "data_samples", "picasso.jpg")) 56 | image = np.asarray(image_open) 57 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis] 58 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2) 59 | image_rgba = image_rgba[np.newaxis, :, :, :] 60 | image = image[np.newaxis, :, :, :] 61 | 62 | tf.summary.image("test", image, max_outputs=10) 63 | tf.summary.image("test_gray", image_gray, max_outputs=10) 64 | tf.summary.image("test_rgba", image_rgba, max_outputs=10) 65 | 66 | # Set up a session and summary writer 67 | summaries = tf.summary.merge_all() 68 | sess = tf.Session() 69 | 70 | logger = task.get_logger() 71 | 72 | # Use the original FileWriter for comparison, run: 73 | # % tensorboard --logdir=/tmp/histogram_example 74 | writer = tf.summary.FileWriter(os.path.join(gettempdir(), "histogram_example")) 75 | 76 | # Set up a loop and write the summaries to disk 77 | N = 40 78 | for step in range(N): 79 | k_val = step/float(N) 80 | summ = sess.run(summaries, feed_dict={k: k_val}) 81 | writer.add_summary(summ, global_step=step) 82 | 83 | print('Done!') 84 | -------------------------------------------------------------------------------- /ml/clearml/tensorflow/manual_model_upload.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of manual model configuration and uploading 2 | # 3 | import os 4 | import tempfile 5 | 6 | import tensorflow as tf 7 | from trains import Task 8 | 9 | task = Task.init(project_name='examples', task_name='Model configuration and upload') 10 | 11 | model = tf.Module() 12 | 13 | # Connect a local configuration file 14 | config_file = os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json') 15 | config_file = task.connect_configuration(config_file) 16 | # then read the configuration as usual; the backend will keep a copy of it. 17 | # later when executing remotely, the returned `config_file` will be a temporary file 18 | # containing a new copy of the configuration retrieved from the backend 19 | # # model_config_dict = json.load(open(config_file, 'rt')) 20 | 21 | # Or store a dictionary that defines a specific network design 22 | model_config_dict = { 23 | 'value': 13.37, 24 | 'dict': {'sub_value': 'string', 'sub_integer': 11}, 25 | 'list_of_ints': [1, 2, 3, 4], 26 | } 27 | model_config_dict = task.connect_configuration(model_config_dict) 28 | 29 | # We now update the dictionary after connecting it, and the changes will be tracked as well. 
30 | model_config_dict['new value'] = 10 31 | model_config_dict['value'] *= model_config_dict['new value'] 32 | 33 | # store the label enumeration of the training model 34 | labels = {'background': 0, 'cat': 1, 'dog': 2} 35 | task.connect_label_enumeration(labels) 36 | 37 | # any model stored from this point on will include the task's network configuration and label enumeration 38 | print('Any model stored from this point onwards will contain both model_config and label_enumeration') 39 | 40 | tempdir = tempfile.mkdtemp() 41 | tf.saved_model.save(model, os.path.join(tempdir, "model")) 42 | print('Model saved') 43 | -------------------------------------------------------------------------------- /ml/clearml/tensorflow/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard>=2.0 2 | tensorflow>=2.0 3 | trains -------------------------------------------------------------------------------- /ml/clearml/tensorflow/tensorboard_toy.py: -------------------------------------------------------------------------------- 1 | # TRAINS - Example of tensorboard with tensorflow (without any actual training) 2 | # 3 | import os 4 | import tensorflow as tf 5 | import numpy as np 6 | from tempfile import gettempdir 7 | from PIL import Image 8 | 9 | from trains import Task 10 | 11 | 12 | def generate_summary(k, step): 13 | # Make a normal distribution, with a shifting mean 14 | mean_moving_normal = tf.random.normal(shape=[1000], mean=(5 * k), stddev=1) 15 | # Record that distribution into a histogram summary 16 | tf.summary.histogram("normal/moving_mean", mean_moving_normal, step=step) 17 | tf.summary.scalar("normal/value", mean_moving_normal[-1], step=step) 18 | 19 | # Make a normal distribution with shrinking variance 20 | variance_shrinking_normal = tf.random.normal(shape=[1000], mean=0, stddev=1-k) 21 | # Record that distribution too 22 | tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal, step=step) 23 | tf.summary.scalar("normal/variance_shrinking_normal", variance_shrinking_normal[-1], step=step) 24 | 25 | # Let's combine both of those distributions into one dataset 26 | normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0) 27 | # We add another histogram summary to record the combined distribution 28 | tf.summary.histogram("normal/bimodal", normal_combined, step=step) 29 | tf.summary.scalar("normal/normal_combined", normal_combined[0], step=step) 30 | 31 | # Add a gamma distribution 32 | gamma = tf.random.gamma(shape=[1000], alpha=k) 33 | tf.summary.histogram("gamma", gamma, step=step) 34 | 35 | # And a poisson distribution 36 | poisson = tf.random.poisson(shape=[1000], lam=k) 37 | tf.summary.histogram("poisson", poisson, step=step) 38 | 39 | # And a uniform distribution 40 | uniform = tf.random.uniform(shape=[1000], maxval=k*10) 41 | tf.summary.histogram("uniform", uniform, step=step) 42 | 43 | # Finally, combine everything together! 
44 | all_distributions = [mean_moving_normal, variance_shrinking_normal, gamma, poisson, uniform] 45 | all_combined = tf.concat(all_distributions, 0) 46 | tf.summary.histogram("all_combined", all_combined, step=step) 47 | 48 | # Log text value 49 | tf.summary.text("this is a test", "This is the content", step=step) 50 | 51 | # convert to 4d [batch, col, row, RGB-channels] 52 | image_open = Image.open(os.path.join('..', '..', 'reporting', 'data_samples', 'picasso.jpg')) 53 | image = np.asarray(image_open) 54 | image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis] 55 | image_rgba = np.concatenate((image, 255*np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))), axis=2) 56 | image_rgba = image_rgba[np.newaxis, :, :, :] 57 | image = image[np.newaxis, :, :, :] 58 | 59 | tf.summary.image("test", image, max_outputs=10, step=step) 60 | tf.summary.image("test_gray", image_gray, max_outputs=10, step=step) 61 | tf.summary.image("test_rgba", image_rgba, max_outputs=10, step=step) 62 | 63 | 64 | task = Task.init(project_name='examples', task_name='tensorboard toy example') 65 | 66 | # create the tensorboard file writer in a temp folder 67 | writer = tf.summary.create_file_writer(os.path.join(gettempdir(), "toy_tb_example")) 68 | 69 | # Setup a loop and write the summaries to disk 70 | N = 40 71 | for step in range(N): 72 | k_val = step/float(N) 73 | with writer.as_default(): 74 | generate_summary(k_val, tf.cast(step, tf.int64)) 75 | 76 | print('Tensorboard toy example done') 77 | -------------------------------------------------------------------------------- /ml/clearml/wandb/latest-run: -------------------------------------------------------------------------------- 1 | run-20210201_173509-jrmpee7z -------------------------------------------------------------------------------- /ml/clearml/wandb/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | tensorboardX 3 | tensorboard>=1.14.0 4 | torch>=1.1.0 5 | torchvision>=0.3.0 6 | clearml 7 | wandb -------------------------------------------------------------------------------- /ml/clearml/xgboost/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib >= 3.1.1 ; python_version >= '3.6' 2 | matplotlib >= 2.2.4 ; python_version < '3.6' 3 | sklearn 4 | trains 5 | xgboost>=0.90 ; python_version >= '3' 6 | xgboost>=0.82 ; python_version < '3' 7 | # sudo apt-get install graphviz 8 | graphviz>=0.8 9 | -------------------------------------------------------------------------------- /ml/clearml/xgboost/xgboost_sample.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import xgboost as xgb 3 | from sklearn import datasets 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.model_selection import train_test_split 6 | from xgboost import plot_tree 7 | 8 | from trains import Task 9 | 10 | task = Task.init(project_name='examples', task_name='XGBoost simple example') 11 | iris = datasets.load_iris() 12 | X = iris.data 13 | y = iris.target 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 15 | dtrain = xgb.DMatrix(X_train, label=y_train) 16 | dtest = xgb.DMatrix(X_test, label=y_test) 17 | param = { 18 | 'max_depth': 3, # the maximum depth of each tree 19 | 'eta': 0.3, # the training step for each iteration 20 | 'silent': 1, # logging mode - quiet 21 | 'objective': 'multi:softprob', # error evaluation for 
multiclass training 22 | 'num_class': 3} # the number of classes that exist in this dataset 23 | num_round = 20 # the number of training iterations 24 | 25 | # noinspection PyBroadException 26 | try: 27 | # try to load a model 28 | bst = xgb.Booster(params=param, model_file='xgb.01.model') 29 | bst.load_model('xgb.01.model') 30 | except Exception: 31 | bst = None 32 | 33 | # if we don't have one, train a model 34 | if bst is None: 35 | bst = xgb.train(param, dtrain, num_round) 36 | 37 | # store trained model v1 38 | bst.save_model('xgb.01.model') 39 | bst.dump_model('xgb.01.raw.txt') 40 | 41 | # build classifier 42 | model = xgb.XGBClassifier() 43 | model.fit(X_train, y_train) 44 | 45 | # store trained classifier model 46 | model.save_model('xgb.02.model') 47 | 48 | # make predictions for test data 49 | y_pred = model.predict(X_test) 50 | predictions = [round(value) for value in y_pred] 51 | 52 | # evaluate predictions 53 | accuracy = accuracy_score(y_test, predictions) 54 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) 55 | labels = dtest.get_label() 56 | 57 | # plot results 58 | xgb.plot_importance(model) 59 | plot_tree(model) 60 | plt.show() 61 | -------------------------------------------------------------------------------- /ml/document-clustering/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | Document Clustering with Python 3 4 | 5 | This is my revision of the great tutorial at http://brandonrose.org/clustering - many thanks to the author. 6 | 7 | ## TL;DR 8 | **Data**: Top 100 movies (http://www.imdb.com/list/ls055592025/) with title, genre, and synopsis (IMDB and Wiki) 9 | 10 | **Goal**: Put 100 movies into 5 clusters by text-mining their synopses and plot the result as follows 11 | 12 | (screenshot: the 100 movies plotted in 5 clusters) 13 | 14 | ## Setup 15 | 16 | First, clone the repo, go to the repo folder, set up the virtual environment, and install the required packages: 17 | 18 | ``` 19 | $ cd path_to_document-clustering 20 | $ virtualenv -p python3 venv 21 | $ source venv/bin/activate 22 | $ pip3 install -r requirements.txt 23 | ``` 24 | Second, use nltk.download() to download all nltk packages (a GUI will open and you can choose to install all packages: ~3.5G), which are saved to /Users/your_mac_username/nltk_data 25 | 26 | ``` 27 | ipython 28 | import nltk 29 | nltk.download() 30 | ``` 31 | 32 | Lastly, run `$ jupyter notebook` to go over the tutorial step-by-step. 33 | 34 | ## Key Steps 35 | 1. **Read data**: read titles, genres, synopses, rankings into four arrays 36 | 2. **Tokenize and stem**: break paragraphs into sentences, then into words, and stem the words (without removing stopwords) - each synopsis essentially becomes a bag of stemmed words. 37 | 3. **Generate tf-idf matrix**: each row is a term (unigram, bigram, trigram...generated from the bag of words in 2.), each column is a synopsis. 38 | 4. **Generate clusters**: based on the tf-idf matrix, 5 (or any number of) clusters are generated using k-means. The top key terms are selected for each cluster. 39 | 5. **Calculate similarity**: generate the cosine similarity matrix using the tf-idf matrix (100x100), then generate the distance matrix (1 - similarity matrix), so each pair of synopses has a distance number between 0 and 1. 40 | 6. **Plot clusters**: use multidimensional scaling (MDS) to convert the distance matrix to a 2-dimensional array; each synopsis gets an (x, y) that represents its relative location based on the distance matrix. 
Plot the 100 points with their (x, y) using matplotlib (I added an example on using plotly.js). 41 | -------------------------------------------------------------------------------- /ml/document-clustering/data/genres_list.txt: -------------------------------------------------------------------------------- 1 | [u' Crime', u' Drama'] 2 | [u' Crime', u' Drama'] 3 | [u' Biography', u' Drama', u' History'] 4 | [u' Biography', u' Drama', u' Sport'] 5 | [u' Drama', u' Romance', u' War'] 6 | [u' Drama'] 7 | [u' Drama', u' Romance', u' War'] 8 | [u' Drama', u' Mystery'] 9 | [u' Adventure', u' Family', u' Fantasy', u' Musical'] 10 | [u' Drama', u' Romance'] 11 | [u' Adventure', u' Biography', u' Drama', u' History', u' War'] 12 | [u' Crime', u' Drama'] 13 | [u' Horror', u' Mystery', u' Thriller'] 14 | [u' Drama', u' Film-Noir'] 15 | [u' Mystery', u' Romance', u' Thriller'] 16 | [u' Crime', u' Drama'] 17 | [u' Drama', u' Romance'] 18 | [u' Biography', u' Drama', u' Family', u' Musical', u' Romance'] 19 | [u' Crime', u' Drama', u' Musical', u' Romance', u' Thriller'] 20 | [u' Action', u' Adventure', u' Fantasy', u' Sci-Fi'] 21 | [u' Adventure', u' Family', u' Sci-Fi'] 22 | [u' Mystery', u' Sci-Fi'] 23 | [u' Crime', u' Drama', u' Thriller'] 24 | [u' Drama', u' Mystery', u' Thriller'] 25 | [u' Adventure', u' Drama', u' War'] 26 | [u' Comedy', u' Musical', u' Romance'] 27 | [u' Drama', u' Family', u' Fantasy'] 28 | [u' Comedy'] 29 | [u' Drama'] 30 | [u' Comedy', u' War'] 31 | [u' Biography', u' Drama', u' Music'] 32 | [u' Drama', u' War'] 33 | [u' Biography', u' Drama', u' History'] 34 | [u' Adventure', u' Fantasy'] 35 | [u' Action', u' Drama'] 36 | [u' Drama', u' Romance', u' War'] 37 | [u' Action', u' Drama', u' War'] 38 | [u' Western'] 39 | [u' Action', u' Adventure'] 40 | [u' Drama', u' Sport'] 41 | [u' Drama'] 42 | [u' Comedy', u' Romance'] 43 | [u' Drama'] 44 | [u' Musical', u' Romance'] 45 | [u' Drama', u' Romance', u' War'] 46 | [u' Drama', u' Family', u' Musical', u' Romance'] 47 | [u' Adventure', u' Drama'] 48 | [u' Drama', u' Romance', u' War'] 49 | [u' Biography', u' Drama', u' War'] 50 | [u' Drama', u' Thriller'] 51 | [u' Action', u' Biography', u' Drama', u' History', u' War'] 52 | [u' Western'] 53 | [u' Biography', u' Crime', u' Western'] 54 | [u' Action', u' Adventure', u' Drama', u' Western'] 55 | [u' Comedy', u' Drama', u' Romance'] 56 | [u' Drama', u' War'] 57 | [u' Western'] 58 | [u' Adventure', u' Drama', u' Western'] 59 | [u' Biography', u' Drama', u' War'] 60 | [u' Biography', u' Crime', u' Drama'] 61 | [u' Horror'] 62 | [u' Drama', u' War'] 63 | [u' Drama', u' War'] 64 | [u' Action', u' Crime', u' Thriller'] 65 | [u' Comedy', u' Drama', u' Romance'] 66 | [u' Biography', u' Drama', u' History'] 67 | [u' Comedy', u' Romance'] 68 | [u' Drama', u' Romance'] 69 | [u' Drama'] 70 | [u' Drama'] 71 | [u' Drama'] 72 | [u' Comedy', u' Drama', u' Romance'] 73 | [u' Biography', u' Drama', u' Romance'] 74 | [u' Drama'] 75 | [u' Comedy', u' Drama'] 76 | [u' Comedy', u' Drama', u' Romance'] 77 | [u' Crime', u' Drama', u' Thriller'] 78 | [u' Drama', u' Romance'] 79 | [u' Drama'] 80 | [u' Drama', u' Romance', u' Western'] 81 | [u' Crime', u' Drama', u' Fantasy', u' Mystery'] 82 | [u' Drama', u' Sci-Fi'] 83 | [u' Drama'] 84 | [u' Drama', u' Music'] 85 | [u' Comedy', u' Drama', u' Romance'] 86 | [u' Comedy', u' Drama'] 87 | [u' Crime', u' Drama', u' Thriller'] 88 | [u' Adventure', u' Romance', u' War'] 89 | [u' Adventure', u' Western'] 90 | [u' Adventure', u' Drama', u' History'] 91 | [u' Drama', 
u' Film-Noir', u' Mystery'] 92 | [u' Crime', u' Drama', u' Sci-Fi'] 93 | [u' Crime', u' Drama'] 94 | [u' Drama', u' Romance'] 95 | [u' Crime', u' Drama', u' Film-Noir', u' Thriller'] 96 | [u' Drama'] 97 | [u' Mystery', u' Thriller'] 98 | [u' Film-Noir', u' Mystery', u' Thriller'] 99 | [u' Mystery', u' Thriller'] 100 | [u' Biography', u' Drama', u' Musical'] 101 | -------------------------------------------------------------------------------- /ml/document-clustering/data/title_list.txt: -------------------------------------------------------------------------------- 1 | The Godfather 2 | The Shawshank Redemption 3 | Schindler's List 4 | Raging Bull 5 | Casablanca 6 | One Flew Over the Cuckoo's Nest 7 | Gone with the Wind 8 | Citizen Kane 9 | The Wizard of Oz 10 | Titanic 11 | Lawrence of Arabia 12 | The Godfather: Part II 13 | Psycho 14 | Sunset Blvd. 15 | Vertigo 16 | On the Waterfront 17 | Forrest Gump 18 | The Sound of Music 19 | West Side Story 20 | Star Wars 21 | E.T. the Extra-Terrestrial 22 | 2001: A Space Odyssey 23 | The Silence of the Lambs 24 | Chinatown 25 | The Bridge on the River Kwai 26 | Singin' in the Rain 27 | It's a Wonderful Life 28 | Some Like It Hot 29 | 12 Angry Men 30 | Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb 31 | Amadeus 32 | Apocalypse Now 33 | Gandhi 34 | The Lord of the Rings: The Return of the King 35 | Gladiator 36 | From Here to Eternity 37 | Saving Private Ryan 38 | Unforgiven 39 | Raiders of the Lost Ark 40 | Rocky 41 | A Streetcar Named Desire 42 | The Philadelphia Story 43 | To Kill a Mockingbird 44 | An American in Paris 45 | The Best Years of Our Lives 46 | My Fair Lady 47 | Ben-Hur 48 | Doctor Zhivago 49 | Patton 50 | Jaws 51 | Braveheart 52 | The Good, the Bad and the Ugly 53 | Butch Cassidy and the Sundance Kid 54 | The Treasure of the Sierra Madre 55 | The Apartment 56 | Platoon 57 | High Noon 58 | Dances with Wolves 59 | The Pianist 60 | Goodfellas 61 | The Exorcist 62 | The Deer Hunter 63 | All Quiet on the Western Front 64 | The French Connection 65 | City Lights 66 | The King's Speech 67 | It Happened One Night 68 | A Place in the Sun 69 | Midnight Cowboy 70 | Mr. 
Smith Goes to Washington 71 | Rain Man 72 | Annie Hall 73 | Out of Africa 74 | Good Will Hunting 75 | Terms of Endearment 76 | Tootsie 77 | Fargo 78 | Giant 79 | The Grapes of Wrath 80 | Shane 81 | The Green Mile 82 | Close Encounters of the Third Kind 83 | Network 84 | Nashville 85 | The Graduate 86 | American Graffiti 87 | Pulp Fiction 88 | The African Queen 89 | Stagecoach 90 | Mutiny on the Bounty 91 | The Maltese Falcon 92 | A Clockwork Orange 93 | Taxi Driver 94 | Wuthering Heights 95 | Double Indemnity 96 | Rebel Without a Cause 97 | Rear Window 98 | The Third Man 99 | North by Northwest 100 | Yankee Doodle Dandy 101 | -------------------------------------------------------------------------------- /ml/document-clustering/requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | jupyter 3 | matplotlib 4 | nltk 5 | pandas 6 | scikit-learn 7 | scipy 8 | -------------------------------------------------------------------------------- /ml/feature-importance/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is my revised code of the tutorial at: 4 | - https://machinelearningmastery.com/calculate-feature-importance-with-python/ 5 | - https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/ 6 | 7 | ## Setup 8 | 9 | For xgboost to work, run `brew install libomp` on Mac first. 10 | 11 | ``` 12 | python3 -m venv venv 13 | source venv/bin/activate 14 | pip install -r requirements.txt 15 | ``` 16 | Then, you can run `jupyter notebook` or use VS Code (`code .`) to open the notebooks. 17 | 18 | -------------------------------------------------------------------------------- /ml/feature-importance/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | matplotlib 3 | pandas 4 | scikit-learn 5 | xgboost 6 | -------------------------------------------------------------------------------- /ml/few-shot-learning/.gitignore: -------------------------------------------------------------------------------- 1 | /datasets/omniglot/data 2 | /model 3 | /logs -------------------------------------------------------------------------------- /ml/few-shot-learning/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is my revised code of the tutorial at: https://medium.com/@barnrang/re-implementation-of-the-prototypical-network-for-few-shot-learning-using-tensorflow-2-0-keras-b2adac8e49e0 4 | 5 | The related paper is: Jake Snell, Kevin Swersky, and Richard S. Zemel (2017). Prototypical Networks for Few-shot Learning. CoRR, abs/1703.05175. https://arxiv.org/abs/1703.05175 6 | ## Setup 7 | 8 | Set up the virtual environment and install the packages: 9 | 10 | ``` 11 | python3 -m venv venv 12 | source venv/bin/activate 13 | pip install -r requirements.txt 14 | ``` 15 | Then, you can run `jupyter notebook` or use VS Code (`code .`) to open the notebooks. 16 | 17 | ## Prepare Datasets 18 | Download the dataset at https://drive.google.com/file/d/1UQEdAv4g_Mh2t15YtorNHkoHfQZJfmoE/view?usp=sharing 19 | 20 | For the omniglot dataset: 21 | 22 | ``` 23 | cd datasets/omniglot 24 | mkdir data 25 | unzip images_background.zip -d data/ 26 | unzip images_evaluation.zip -d data/ 27 | mv data/images_evaluation/* data/images_background/ 28 | python dataloader_omniglot.py 29 | ``` 30 | 31 | Note that we split 1200 * 4 (each class rotated in 4 directions) classes for training and the rest for the test set. 
The datasets are collected into numpy (.npy) files. 32 | 33 | ## Train and Test 34 | 35 | To train: 36 | 37 | In the root folder of this repo, run `python train_omniglot.py` to train 2 epochs by default (about 10 minutes on a MacBook Pro). 38 | 39 | You can use different arguments: 40 | 41 | - `python train_omniglot.py --epochs 100` 42 | - `python train_omniglot.py --train_way 60 --train_query 5 --val_way 20 --shot 1 --gpu 0` (the `--gpu` flag specifies which GPU to use) 43 | 44 | Temporary checkpoints (with the format `omniglot_conv_{epoch}_{shot}_{val_way}`) and the final model `omniglot_conv` are saved in the `/model` folder (ignored by git). 45 | 46 | To visualize training with TensorBoard, run `tensorboard --logdir=./logs --port=6006` 47 | 48 | Then, you can access TensorBoard at http://localhost:6006/ 49 | 50 | (screenshot: TensorBoard training dashboard) 51 | 52 | To test: 53 | 54 | `python test_omniglot.py --model model/omniglot_conv --shot 1 --test_way 20` 55 | 56 | 57 | -------------------------------------------------------------------------------- /ml/few-shot-learning/datasets/mini_imagenet/dataloader_mini_imagenet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from skimage.io import imread 4 | from skimage.transform import resize as imresize 5 | import os 6 | 7 | train_label = pd.read_csv('train.csv') 8 | val_label = pd.read_csv('val.csv') 9 | test_label = pd.read_csv('test.csv') 10 | 11 | train_images = [] 12 | 13 | PATH = 'images' 14 | 15 | for name, df in train_label['filename'].groupby(train_label['label']): 16 | images = [] 17 | for image_name in df.values: 18 | image = imread(os.path.join(PATH, image_name)) 19 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8) 20 | images.append(image) 21 | 22 | train_images.append(images) 23 | 24 | val_images = [] 25 | 26 | PATH = 'images' 27 | 28 | for name, df in val_label['filename'].groupby(val_label['label']): 29 | images = [] 30 | for image_name in df.values: 31 | image = imread(os.path.join(PATH, image_name)) 32 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8) 33 | images.append(image) 34 | 35 | val_images.append(images) 36 | 37 | test_images = [] 38 | 39 | PATH = 'images' 40 | 41 | for name, df in test_label['filename'].groupby(test_label['label']): 42 | images = [] 43 | for image_name in df.values: 44 | image = imread(os.path.join(PATH, image_name)) 45 | image = (imresize(image, (84,84)) * 255.).astype(np.uint8) 46 | images.append(image) 47 | 48 | test_images.append(images) 49 | 50 | train_images = np.array(train_images) 51 | 52 | val_images = np.array(val_images) 53 | test_images = np.array(test_images) 54 | 55 | np.save('mini_train', train_images) 56 | np.save('mini_val', val_images) 57 | np.save('mini_test', test_images) 58 | 59 | -------------------------------------------------------------------------------- /ml/few-shot-learning/datasets/omniglot/dataloader_omniglot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import matplotlib.pyplot as plt 4 | from tqdm import tqdm 5 | from skimage.transform import resize as imresize 6 | from skimage.transform import rotate 7 | BASE_PATH = "data/images_background" 8 | TRAIN_CLASS = 1200 9 | 10 | 11 | def loader(path=None): 12 | index = 0 13 | train_images = [] 14 | eval_images = [] 15 | current_save = train_images 16 | if path is None: 17 | path = BASE_PATH 18 | folders_list = os.listdir(path) 19 | folders_list.sort() 20 | count = 
0 21 | loading_eval = False 22 | for folder in tqdm(folders_list): 23 | path1 = os.path.join(path, folder) 24 | try: #In case of invalid folder 25 | for char_type in os.listdir(path1): 26 | if not loading_eval and count >= 1200: 27 | loading_eval = True 28 | current_save = eval_images 29 | print("Start to collect eval") 30 | 31 | path2 = os.path.join(path1, char_type) 32 | try: 33 | for rot in [0,90,180,270]: 34 | class_image = [] 35 | for image_name in os.listdir(path2): 36 | image = plt.imread(os.path.join(path2, image_name)) 37 | image = imresize(image,(28,28), anti_aliasing=False) 38 | image = rotate(image, rot) 39 | image = np.expand_dims(image, axis=-1) 40 | class_image.append(image) 41 | current_save.append(class_image) 42 | count += 1 43 | except NotADirectoryError: 44 | print(f"Cannot load from {path2}") 45 | except NotADirectoryError: 46 | print(f"cannot load from {path1}") 47 | continue 48 | 49 | np.save(f"./data/train_omniglot.npy", (np.array(train_images) * 255).astype(np.uint8)) 50 | np.save(f"./data/test_omniglot.npy", (np.array(eval_images) * 255).astype(np.uint8)) 51 | 52 | 53 | if __name__ == "__main__": 54 | images = loader() 55 | -------------------------------------------------------------------------------- /ml/few-shot-learning/loader_omniglot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow 4 | from tensorflow import keras 5 | 6 | 7 | class DataGenerator(tensorflow.keras.utils.Sequence): 8 | 'Generates data for Keras' 9 | def __init__(self, data_type='train', dim=(28,28), n_channels=1, 10 | way=20, shot=1, query=1, num_batch=500): 11 | 'Initialization' 12 | self.type = data_type 13 | # if self.type == 'train': 14 | # self.is_training = np.array([True for _ in range(batch_size)]) 15 | # else: 16 | # self.is_training = np.array([False for _ in range(batch_size)]) 17 | self.dim = dim 18 | #self.batch_size = batch_size 19 | self.n_channels = n_channels 20 | self.num_per_class = 20 21 | self.num_batch = num_batch 22 | #self.y_target = np.zeros(self.batch_size) 23 | self.build_data(self.type) 24 | self.on_epoch_end() 25 | self.way = way 26 | self.shot = shot 27 | self.query = query 28 | #TODO!!!! 
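        # (descriptive note) each __getitem__ call yields one few-shot episode:
        # `way` classes are sampled, and for each class `shot` support images and
        # `query` query images are drawn, giving X_sample of shape (way, shot, 28, 28, 1),
        # X_query of shape (way, query, 28, 28, 1), and one-hot labels over the `way` classes.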
29 | #self.hard_batch = np.zeros(batch_size, *dim, n_channels) 30 | 31 | def build_data(self, data_type): 32 | if data_type == 'train': 33 | self.class_data = np.load('datasets/omniglot/data/train_omniglot.npy') 34 | else: 35 | self.class_data = np.load('datasets/omniglot/data/test_omniglot.npy') 36 | 37 | self.n_classes = len(self.class_data) 38 | 39 | def __len__(self): 40 | 'Denotes the number of batches per epoch' 41 | return self.num_batch 42 | 43 | def __getitem__(self, index): 44 | 'Generate one batch of data' 45 | # Generate data 46 | X_sample, X_query, label = self.__data_generation() 47 | #way = np.ones((self.way * self.shot, 1)) * self.way 48 | 49 | 50 | return [X_sample, X_query], label 51 | 52 | def on_epoch_end(self): 53 | 'Updates indexes after each epoch' 54 | pass 55 | 56 | def __data_generation(self): 57 | 'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels) 58 | # Initialization 59 | X_sample = np.empty((self.way, self.shot, *self.dim, self.n_channels)) 60 | X_query = np.empty((self.way, self.query, *self.dim, self.n_channels)) 61 | chosen_class = random.sample(range(self.n_classes), self.way) 62 | label = np.empty(self.way * self.query) 63 | # print(pos, neg) 64 | # print(self.class_data[pos][0].shape) 65 | # Generate data 66 | for i in range(self.way): 67 | sample_idx = random.sample(range(self.num_per_class), self.shot + self.query) 68 | sample_data = self.class_data[chosen_class[i]][sample_idx]/255. 69 | X_sample[i] = sample_data[:self.shot] 70 | X_query[i] = sample_data[self.shot:self.shot + self.query] 71 | label[i * self.query: (i+1) * self.query] = i 72 | return X_sample, X_query, keras.utils.to_categorical(label) 73 | #return X, keras.utils.to_categorical(y, num_classes=self.n_classes) 74 | -------------------------------------------------------------------------------- /ml/few-shot-learning/mini_imagenet/mini_proto_model.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, Conv2D, Lambda, Dense, Flatten,MaxPooling2D, Activation 2 | from tensorflow.keras.layers import BatchNormalization 3 | from tensorflow.keras.models import Model, Sequential 4 | from tensorflow.keras.regularizers import l2 5 | from tensorflow.keras import backend as K 6 | from tensorflow.keras.optimizers import SGD,Adam 7 | from tensorflow.keras.losses import binary_crossentropy 8 | import tensorflow as tf 9 | import numpy.random as rng 10 | import numpy as np 11 | import os 12 | import matplotlib.pyplot as plt 13 | eps = 1e-12 14 | 15 | def W_init(shape,name=None): 16 | """Initialize weights as in paper""" 17 | values = rng.normal(loc=0,scale=1e-2,size=shape) 18 | return K.variable(values,name=name) 19 | #//TODO: figure out how to initialize layer biases in tensorflow.keras. 
20 | def b_init(shape,name=None): 21 | """Initialize bias as in paper""" 22 | values=rng.normal(loc=0.5,scale=1e-2,size=shape) 23 | return K.variable(values,name=name) 24 | 25 | input_shape = (84,84, 3) 26 | 27 | #build convnet to use in each siamese 'leg' 28 | def conv_net(): 29 | convnet = Sequential() 30 | for i in range(4): 31 | convnet.add(Conv2D(64,(3,3),padding='same',input_shape=input_shape)) 32 | convnet.add(BatchNormalization()) 33 | convnet.add(Activation('relu')) 34 | convnet.add(MaxPooling2D()) 35 | convnet.add(Flatten()) 36 | return convnet 37 | 38 | def l1_distance(x,y): 39 | return tf.reduce_sum(tf.maximum(tf.abs(x-y),eps), axis=1, keep_dims=True) 40 | 41 | def l2_distance(x,y): 42 | return tf.sqrt(tf.reduce_sum(tf.maximum(tf.square(x-y),eps), axis=1, keep_dims=True)) 43 | 44 | def hinge_loss(target, pred, h=1.): 45 | loss = tf.reduce_mean(tf.maximum(pred + h, 0.)) 46 | return loss 47 | 48 | def acc(target, pred): 49 | result = tf.cast(tf.less(pred, target), dtype=tf.float32) 50 | return tf.reduce_mean(result) 51 | -------------------------------------------------------------------------------- /ml/few-shot-learning/mini_imagenet/mini_proto_test.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | def parser(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--test_way', dest='test_way', type=int, default=5) 7 | parser.add_argument('--shot', dest='shot', type=int, default=1) 8 | parser.add_argument('--gpu', dest='gpu', type=int, default=0) 9 | parser.add_argument('--model', dest='model') 10 | 11 | return parser.parse_args() 12 | 13 | args = parser() 14 | os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu) 15 | 16 | from tensorflow.keras import callbacks as cb 17 | from tensorflow.keras.optimizers import Adam 18 | from tensorflow.keras.models import load_model, Model, save_model 19 | from tensorflow.keras.layers import * 20 | from tensorflow.keras.models import Sequential 21 | from tensorflow.keras import regularizers as rg 22 | from tensorflow.keras.preprocessing.image import ImageDataGenerator 23 | from tensorflow.keras.applications.xception import Xception 24 | from tensorflow.keras import backend as K 25 | 26 | 27 | import numpy.random as rng 28 | 29 | import numpy as np 30 | import matplotlib.pyplot as plt 31 | import matplotlib.image as img 32 | import random 33 | from python.dataloader import loader 34 | from mini_protoloader import DataGenerator 35 | from mini_proto_model import conv_net, hinge_loss, l2_distance, acc, l1_distance 36 | #from transform import transform_gate 37 | from util.tensor_op import * 38 | from util.loss import * 39 | input_shape = (None,84,84,3) 40 | batch_size = 20 41 | test_way = args.test_way 42 | shot = args.shot 43 | model_path = args.model 44 | lr = 0.002 45 | 46 | def scheduler(epoch): 47 | global lr 48 | if epoch % 15 == 0: 49 | lr /= 2 50 | return lr 51 | 52 | class SaveConv(tf.keras.callbacks.Callback): 53 | def on_epoch_end(self, epoch, logs=None): 54 | if epoch % 5 == 0: 55 | save_model(conv, f"model/miniimage_conv_{epoch}_{shot}_{val_way}") 56 | 57 | 58 | if __name__ == "__main__": 59 | #conv = conv_net() 60 | conv = load_model(model_path) 61 | sample = Input(input_shape) 62 | conv_5d = TimeDistributed(conv) 63 | out_feature = conv_5d(sample) 64 | out_feature = Lambda(reduce_tensor)(out_feature) 65 | inp = Input(input_shape) 66 | map_feature = conv_5d(inp) 67 | map_feature = Lambda(reshape_query)(map_feature) 68 | pred = 
Lambda(proto_dist)([out_feature, map_feature]) #negative distance 69 | combine = Model([sample, inp], pred) 70 | 71 | optimizer = Adam(0.001) 72 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer, 73 | metrics=['categorical_accuracy']) 74 | test_loader = DataGenerator(data_type='test',way=test_way, shot=shot, num_batch=10000) 75 | 76 | combine.evaluate(test_loader) 77 | 78 | 79 | -------------------------------------------------------------------------------- /ml/few-shot-learning/mini_imagenet/mini_protoloader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.utils import np_utils 3 | from tensorflow.keras.preprocessing.image import ImageDataGenerator 4 | import tensorflow 5 | import keras 6 | import random 7 | from python.dataloader import loader 8 | 9 | class DataGenerator(tensorflow.keras.utils.Sequence): 10 | 'Generates data for Keras' 11 | def __init__(self, data_type='train', dim=(84,84), n_channels=3, 12 | way=5, shot=1, query=5, num_batch=500): 13 | 'Initialization' 14 | self.type = data_type 15 | # if self.type == 'train': 16 | # self.is_training = np.array([True for _ in range(batch_size)]) 17 | # else: 18 | # self.is_training = np.array([False for _ in range(batch_size)]) 19 | self.dim = dim 20 | #self.batch_size = batch_size 21 | self.n_channels = n_channels 22 | self.num_per_class = 600 23 | self.num_batch = num_batch 24 | #self.y_target = np.zeros(self.batch_size) 25 | self.build_data(self.type) 26 | self.on_epoch_end() 27 | self.way = way 28 | self.shot = shot 29 | self.query = query 30 | self.transformer = ImageDataGenerator( 31 | width_shift_range=0.1, 32 | height_shift_range=0.1, 33 | zoom_range=0.2, 34 | rotation_range=30, 35 | horizontal_flip=True, 36 | shear_range=0.1 37 | 38 | ) 39 | #TODO!!!! 40 | #self.hard_batch = np.zeros(batch_size, *dim, n_channels) 41 | 42 | def build_data(self, data_type): 43 | if data_type == 'train': 44 | self.class_data = np.load('python/mini_train.npy') 45 | elif data_type == 'val': 46 | self.class_data = np.load('python/mini_val.npy') 47 | else: 48 | self.class_data = np.load('python/mini_test.npy') 49 | 50 | self.n_classes = len(self.class_data) 51 | 52 | def __len__(self): 53 | 'Denotes the number of batches per epoch' 54 | return self.num_batch 55 | 56 | def __getitem__(self, index): 57 | 'Generate one batch of data' 58 | # Generate data 59 | X_sample, X_query, label = self.__data_generation() 60 | #way = np.ones((self.way * self.shot, 1)) * self.way 61 | 62 | 63 | return [X_sample, X_query], label 64 | 65 | def on_epoch_end(self): 66 | 'Updates indexes after each epoch' 67 | pass 68 | 69 | def __data_generation(self): 70 | 'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels) 71 | # Initialization 72 | X_sample = np.empty((self.way, self.shot, *self.dim, self.n_channels)) 73 | X_query = np.empty((self.way, self.query, *self.dim, self.n_channels)) 74 | chosen_class = random.sample(range(self.n_classes), self.way) 75 | label = np.empty(self.way * self.query) 76 | # print(pos, neg) 77 | # print(self.class_data[pos][0].shape) 78 | # Generate data 79 | for i in range(self.way): 80 | sample_idx = random.sample(range(self.num_per_class), self.shot + self.query) 81 | sample_data = self.class_data[chosen_class[i]][sample_idx]/255. 
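            # (descriptive note) the `if True:` below always takes the first branch, so the
            # ImageDataGenerator augmentations in the unreachable else-branch are effectively
            # disabled and the support/query images are used unaugmented.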
82 | if True: 83 | #if self.type != 'train': 84 | X_sample[i] = sample_data[:self.shot] 85 | X_query[i] = sample_data[self.shot:self.shot + self.query] 86 | else: 87 | for j in range(self.shot): 88 | params = self.transformer.get_random_transform(self.dim + (self.n_channels,)) 89 | x = self.transformer.apply_transform(sample_data[j], params) 90 | X_sample[i][j] = x 91 | 92 | for j in range(self.shot, self.shot + self.query): 93 | params = self.transformer.get_random_transform(self.dim + (self.n_channels,)) 94 | x = self.transformer.apply_transform(sample_data[j], params) 95 | X_query[i][j-self.shot] = x 96 | 97 | label[i * self.query: (i+1) * self.query] = i 98 | return X_sample, X_query, np_utils.to_categorical(label) 99 | #return X, keras.utils.to_categorical(y, num_classes=self.n_classes) 100 | -------------------------------------------------------------------------------- /ml/few-shot-learning/model_omniglot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.layers import Input, Conv2D, Lambda, Dense, Flatten, MaxPooling2D, Activation, BatchNormalization 4 | from tensorflow.keras.models import Model, Sequential 5 | 6 | 7 | eps = 1e-12 8 | 9 | def W_init(shape,name=None): 10 | """Initialize weights as in paper""" 11 | values = np.random.normal(loc=0, scale=1e-2, size=shape) 12 | return tf.variable(values, name=name) 13 | 14 | 15 | #//TODO: figure out how to initialize layer biases in tensorflow.keras. 16 | def b_init(shape, name=None): 17 | """Initialize bias as in paper""" 18 | values=np.random.normal(loc=0.5, scale=1e-2, size=shape) 19 | return tf.variable(values, name=name) 20 | 21 | input_shape = (28, 28, 1) 22 | 23 | 24 | #build convnet to use in each siamese 'leg' 25 | def conv_net(): 26 | convnet = Sequential() 27 | for i in range(4): 28 | convnet.add(Conv2D(64,(3,3),padding='same', input_shape=input_shape)) 29 | convnet.add(BatchNormalization()) 30 | convnet.add(Activation('relu')) 31 | convnet.add(MaxPooling2D()) 32 | convnet.add(Flatten()) 33 | return convnet 34 | 35 | 36 | def l1_distance(x,y): 37 | return tf.reduce_sum(tf.maximum(tf.abs(x-y),eps), axis=1, keep_dims=True) 38 | 39 | 40 | def l2_distance(x,y): 41 | return tf.sqrt(tf.reduce_sum(tf.maximum(tf.square(x-y),eps), axis=1, keep_dims=True)) 42 | 43 | 44 | def hinge_loss(target, pred, h=1.): 45 | loss = tf.reduce_mean(tf.maximum(pred + h, 0.)) 46 | return loss 47 | 48 | 49 | def acc(target, pred): 50 | result = tf.cast(tf.less(pred, target), dtype=tf.float32) 51 | return tf.reduce_mean(result) 52 | -------------------------------------------------------------------------------- /ml/few-shot-learning/notebooks/dataloader_notebook/images_background_small2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/few-shot-learning/notebooks/dataloader_notebook/images_background_small2.zip -------------------------------------------------------------------------------- /ml/few-shot-learning/notebooks/dataloader_notebook/loss_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/barnrang/.conda/envs/chatbot/lib/python3.6/importlib/_bootstrap.py:219: 
RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import tensorflow as tf\n", 19 | "from util.loss import prior_dist\n", 20 | "%load_ext autoreload\n", 21 | "%autoreload 2" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "x = tf.placeholder(shape=[None, 2], dtype=tf.float32)\n", 33 | "y = tf.placeholder(shape=[None, 2], dtype=tf.float32)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 6, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Tensor(\"Sum_4:0\", shape=(?, 1), dtype=float32) Tensor(\"Sum_5:0\", shape=(?, 1), dtype=float32)\n", 46 | "Tensor(\"MatMul_2:0\", shape=(?, ?), dtype=float32)\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "z = prior_dist([x,y])" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "[[ 9. 17. 29.]\n", 64 | " [ 1. 1. 5.]]\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "with tf.Session() as sess:\n", 70 | " print(sess.run(z, feed_dict={x:[[1,3],[2,4],[3,5]],\n", 71 | " y:[[1,0],[1,4]]}))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.6.3" 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 2 105 | } 106 | -------------------------------------------------------------------------------- /ml/few-shot-learning/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | numpy 3 | scikit-image 4 | tensorflow >= 2.0 5 | tqdm -------------------------------------------------------------------------------- /ml/few-shot-learning/test_omniglot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tensorflow.keras import callbacks as cb 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.models import load_model, Model, save_model 7 | from tensorflow.keras.layers import * 8 | 9 | from loader_omniglot import DataGenerator 10 | from model_omniglot import conv_net 11 | from util.tensor_op import * 12 | from util.loss import * 13 | 14 | 15 | def parser(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--test_way', dest='test_way', type=int, default=5) 18 | parser.add_argument('--shot', dest='shot', type=int, default=1) 19 | parser.add_argument('--gpu', dest='gpu', type=int, default=0) 20 | parser.add_argument('--model', dest='model') 21 | 22 | return parser.parse_args() 23 | 24 | args = parser() 25 | os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu) 26 | test_way = args.test_way 27 | shot = args.shot 28 | model_path = args.model 29 | 30 | 
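# the leading None axis below is the per-class image axis (shot support images, or the query images); TimeDistributed applies the convnet across it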
input_shape = (None, 28, 28, 1) 31 | batch_size = 20 32 | lr = 0.002 33 | 34 | 35 | def scheduler(epoch): 36 | global lr 37 | if epoch % 15 == 0: 38 | lr /= 2 39 | return lr 40 | 41 | 42 | class SaveConv(tf.keras.callbacks.Callback): 43 | def on_epoch_end(self, epoch, logs=None): 44 | if epoch % 5 == 0: 45 | save_model(conv, f"model/omniglot_conv_{epoch}_{shot}_{test_way}") 46 | 47 | 48 | if __name__ == "__main__": 49 | conv = load_model(model_path) 50 | sample = Input(input_shape) 51 | conv_5d = TimeDistributed(conv) 52 | out_feature = conv_5d(sample) 53 | out_feature = Lambda(reduce_tensor)(out_feature) # average support features into one prototype per class 54 | inp = Input(input_shape) 55 | map_feature = conv_5d(inp) 56 | map_feature = Lambda(reshape_query)(map_feature) # flatten query features to (n_queries, d) 57 | pred = Lambda(proto_dist)([out_feature, map_feature]) #negative distance 58 | combine = Model([sample, inp], pred) 59 | 60 | optimizer = Adam(0.001) 61 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer, 62 | metrics=['categorical_accuracy']) 63 | test_loader = DataGenerator(data_type='test', way=test_way, shot=shot, num_batch=10000) 64 | 65 | combine.evaluate(test_loader) -------------------------------------------------------------------------------- /ml/few-shot-learning/train_omniglot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tensorflow.keras import callbacks as cb 5 | from tensorflow.keras.optimizers import Adam 6 | from tensorflow.keras.models import load_model, Model, save_model 7 | from tensorflow.keras.layers import * 8 | 9 | # import from custom modules 10 | from loader_omniglot import DataGenerator 11 | from model_omniglot import conv_net 12 | from util.tensor_op import * 13 | from util.loss import * 14 | 15 | # command line argument parser 16 | def parser(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--train_way', dest='train_way', type=int, default=60) 19 | parser.add_argument('--train_query', dest='train_query', type=int, default=5) 20 | parser.add_argument('--val_way', dest='val_way', type=int, default=20) 21 | parser.add_argument('--shot', dest='shot', type=int, default=1) 22 | parser.add_argument('--gpu', dest='gpu', type=int, default=0) 23 | parser.add_argument('--epochs', dest='epochs', type=int, default=2) 24 | 25 | return parser.parse_args() 26 | 27 | args = parser() 28 | os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) 29 | 30 | # get values from the command line arguments 31 | train_way = args.train_way 32 | train_query = args.train_query 33 | val_way = args.val_way 34 | shot = args.shot 35 | epochs = args.epochs 36 | 37 | # specify model parameters 38 | input_shape = (None, 28, 28, 1) 39 | batch_size = 20 40 | lr = 0.002 41 | 42 | def scheduler(epoch): 43 | global lr 44 | if epoch % 100 == 0: 45 | lr /= 2 46 | return lr 47 | 48 | class SaveConv(tf.keras.callbacks.Callback): 49 | def on_epoch_end(self, epoch, logs=None): 50 | if epoch % 50 == 0: 51 | save_model(conv, f"model/omniglot_conv_{epoch}_{shot}_{val_way}") 52 | 53 | if __name__ == "__main__": 54 | conv = conv_net() 55 | sample = Input(input_shape) 56 | conv_5d = TimeDistributed(conv) 57 | out_feature = conv_5d(sample) 58 | out_feature = Lambda(reduce_tensor)(out_feature) 59 | inp = Input(input_shape) 60 | map_feature = conv_5d(inp) 61 | map_feature = Lambda(reshape_query)(map_feature) 62 | # proto_dist is from util/loss.py 63 | pred = Lambda(proto_dist)([out_feature, map_feature]) #negative distance 64 | combine = Model([sample, inp], pred) 65 | 66 | 
optimizer = Adam(0.001) 67 | combine.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy']) 68 | 69 | train_loader = DataGenerator(way=train_way, query=train_query, shot=shot, num_batch=1000) 70 | val_loader = DataGenerator(data_type='val', way=val_way, shot=shot) 71 | 72 | (x, y), z = train_loader[0] 73 | print(x.shape, y.shape, z.shape) 74 | print(combine.summary()) 75 | 76 | save_conv = SaveConv() 77 | reduce_lr = cb.ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=2, min_lr=1e-8) 78 | lr_sched = cb.LearningRateScheduler(scheduler) 79 | tensorboard = cb.TensorBoard() 80 | 81 | combine.fit( # fit handles Sequence generators directly in TF 2.x 82 | train_loader, 83 | epochs=epochs, 84 | validation_data=val_loader, 85 | use_multiprocessing=False, 86 | workers=4, 87 | shuffle=False, 88 | callbacks=[save_conv, lr_sched, tensorboard] 89 | ) 90 | 91 | save_model(conv, "model/omniglot_conv") 92 | -------------------------------------------------------------------------------- /ml/few-shot-learning/util/loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | # proto_dist: softmax over negative Euclidean distances from query features to class prototypes 3 | def proto_dist(x): 4 | feature, pred = x 5 | pred_dist = tf.reduce_sum(pred ** 2, axis=1, keepdims=True) 6 | feature_dist = tf.reduce_sum(feature ** 2, axis=1, keepdims=True) 7 | dot = tf.matmul(pred, tf.transpose(feature)) 8 | return tf.nn.softmax(-(tf.sqrt(pred_dist + tf.transpose(feature_dist) - 2 * dot))) # uses ||q - p||^2 = ||q||^2 + ||p||^2 - 2 q.p 9 | 10 | -------------------------------------------------------------------------------- /ml/few-shot-learning/util/tensor_op.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def slice_tensor_and_sum(x, way=20): 4 | sliced = tf.split(x, num_or_size_splits=way, axis=0) 5 | return tf.reduce_mean(sliced, axis=1) # note: despite the name, this averages each split 6 | 7 | def reduce_tensor(x): 8 | return tf.reduce_mean(x, axis=1) 9 | 10 | def reshape_query(x): 11 | return tf.reshape(x, [-1, tf.shape(x)[-1]]) 12 | -------------------------------------------------------------------------------- /ml/fine-tune-pegasus/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | Fine-tune pegasus-large using the XSUM dataset 3 | 4 | Adapted from https://towardsdatascience.com/how-to-perform-abstractive-summarization-with-pegasus-3dd74e48bafb 5 | 6 | Colab version (slightly different from this notebook: it includes the pip installs and uses a batch size of 2): https://colab.research.google.com/drive/1RyUsYDAo6bA1RZICMb-FxYLszBcDY81X?usp=sharing 7 | 8 | ## Setup 9 | 10 | ``` 11 | $ python3 -m venv venv 12 | $ source venv/bin/activate 13 | $ pip install -r requirements.txt 14 | ``` -------------------------------------------------------------------------------- /ml/fine-tune-pegasus/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | SentencePiece 3 | transformers>=4.3.3 4 | datasets>=1.4.1 5 | torch>=1.8.0 6 | -------------------------------------------------------------------------------- /ml/graph/.gitignore: -------------------------------------------------------------------------------- 1 | /logs -------------------------------------------------------------------------------- /ml/graph/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | https://www.analyticsvidhya.com/blog/2020/01/link-prediction-how-to-predict-your-future-connections-on-facebook/ 4 | 
https://www.analyticsvidhya.com/blog/2019/11/graph-feature-extraction-deepwalk/ 5 | https://www.tensorflow.org/tutorials/text/word2vec 6 | 7 | 8 | 9 | ## Setup 10 | 11 | ``` 12 | $ python3 -m venv venv 13 | $ source venv/bin/activate 14 | $ pip install -r requirements.txt 15 | ``` -------------------------------------------------------------------------------- /ml/graph/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas 3 | matplotlib 4 | tensorflow 5 | tqdm 6 | sklearn 7 | networkx 8 | node2vec 9 | lightgbm -------------------------------------------------------------------------------- /ml/greedy-layer-wise-pretraning/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is my revised code of the tutorial at https://machinelearningmastery.com/greedy-layer-wise-pretraining-tutorial/ 4 | 5 | ## Setup 6 | 7 | within the tutorial folder: 8 | 9 | ``` 10 | python3 -m venv venv 11 | source venv/bin/activate 12 | pip install -r requirements.txt 13 | ``` 14 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 15 | 16 | -------------------------------------------------------------------------------- /ml/greedy-layer-wise-pretraning/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | matplotlib 3 | pandas 4 | scikit-learn 5 | tensorflow 6 | -------------------------------------------------------------------------------- /ml/house-price-prediction/README.md: -------------------------------------------------------------------------------- 1 | ## Kaggle Kernel 2 | 3 | You can run this kernel directly at Kaggle.com: https://www.kaggle.com/harrywang/housing-price-prediction 4 | 5 | ## Run Locally 6 | 7 | Clone the repo, go to the repo folder, setup the virtual environment, and install the required packages: 8 | 9 | ``` 10 | $ cd path_to_this folder 11 | $ virtualenv -p python3 venv 12 | $ source venv/bin/activate 13 | $ pip3 install -r requirements.txt 14 | ``` 15 | 16 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 17 | 18 | ## Source 19 | 20 | This is the dataset used in this book: https://github.com/ageron/handson-ml/tree/master/datasets/housing to illustrate a sample end-to-end ML project workflow (pipeline). This is a great book - I highly recommend! 21 | 22 | The data is based on California Census in 1990. 23 | 24 | ### About the Data (from the book): 25 | 26 | "This dataset is a modified version of the California Housing dataset available from Luís Torgo's page (University of Porto). Luís Torgo obtained it from the StatLib repository (which is closed now). The dataset may also be downloaded from StatLib mirrors. 27 | 28 | The following is the description from the book author: 29 | 30 | This dataset appeared in a 1997 paper titled Sparse Spatial Autoregressions by Pace, R. Kelley and Ronald Barry, published in the Statistics and Probability Letters journal. They built it using the 1990 California census data. It contains one row per census block group. A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data (a block group typically has a population of 600 to 3,000 people). 31 | 32 | The dataset in this directory is almost identical to the original, with two differences: 33 | 207 values were randomly removed from the total_bedrooms column, so we can discuss what to do with missing data. 
34 | An additional categorical attribute called ocean_proximity was added, indicating (very roughly) whether each block group is near the ocean, near the Bay area, inland or on an island. This allows discussing what to do with categorical data. 35 | Note that the block groups are called "districts" in the Jupyter notebooks, simply because in some contexts the name "block group" was confusing." 36 | 37 | ### About the Data (From Luís Torgo's page): 38 | http://www.dcc.fc.up.pt/%7Eltorgo/Regression/cal_housing.html 39 | 40 | This is a dataset obtained from the StatLib repository. Here is the included description: 41 | 42 | "We collected information on the variables using all the block groups in California from the 1990 Census. In this sample a block group on average includes 1425.5 individuals living in a geographically compact area. Naturally, the geographical area included varies inversely with the population density. We computed distances among the centroids of each block group as measured in latitude and longitude. We excluded all the block groups reporting zero entries for the independent and dependent variables. The final data contained 20,640 observations on 9 variables. The dependent variable is ln(median house value)." 43 | 44 | 45 | ### End-to-End ML Project Steps (Chapter 2 of the book) 46 | 47 | 1. Look at the big picture 48 | 2. Get the data 49 | 3. Discover and visualize the data to gain insights 50 | 4. Prepare the data for Machine Learning algorithms 51 | 5. Select a model and train it 52 | 6. Fine-tune your model 53 | 7. Present your solution 54 | 8. Launch, monitor, and maintain your system 55 | 56 | ## The 10-Step Machine Learning Project Workflow (My Version) 57 | 58 | 1. Define the business objective 59 | 2. Make sense of the data from a high level 60 | - data types (number, text, object, etc.) 61 | - continuous/discrete 62 | - basic stats (min, max, std, median, etc.) using boxplot 63 | - frequency via histogram 64 | - scales and distributions of different features 65 | 3. Create the training and test sets using proper sampling methods, e.g., random vs. stratified 66 | 4. Correlation analysis (pair-wise and attribute combinations) 67 | 5. Data cleaning (missing data, outliers, data errors) 68 | 6. Data transformation via pipelines (categorical text to number using one hot encoding, feature scaling via normalization/standardization, feature combinations) 69 | 7. Train and cross-validate different models and select the most promising one (Linear Regression, Decision Tree, and Random Forest were tried in this tutorial) 70 | 8. Fine-tune the model by trying different combinations of hyperparameters 71 | 9. Evaluate the model with the best estimators on the test set 72 | 10. 
Launch, monitor, and refresh the model and system 73 | -------------------------------------------------------------------------------- /ml/house-price-prediction/input/anscombe.csv: -------------------------------------------------------------------------------- 1 | dataset,x,y 2 | I,10.0,8.04 3 | I,8.0,6.95 4 | I,13.0,7.58 5 | I,9.0,8.81 6 | I,11.0,8.33 7 | I,14.0,9.96 8 | I,6.0,7.24 9 | I,4.0,4.26 10 | I,12.0,10.84 11 | I,7.0,4.82 12 | I,5.0,5.68 13 | II,10.0,9.14 14 | II,8.0,8.14 15 | II,13.0,8.74 16 | II,9.0,8.77 17 | II,11.0,9.26 18 | II,14.0,8.1 19 | II,6.0,6.13 20 | II,4.0,3.1 21 | II,12.0,9.13 22 | II,7.0,7.26 23 | II,5.0,4.74 24 | III,10.0,7.46 25 | III,8.0,6.77 26 | III,13.0,12.74 27 | III,9.0,7.11 28 | III,11.0,7.81 29 | III,14.0,8.84 30 | III,6.0,6.08 31 | III,4.0,5.39 32 | III,12.0,8.15 33 | III,7.0,6.42 34 | III,5.0,5.73 35 | IV,8.0,6.58 36 | IV,8.0,5.76 37 | IV,8.0,7.71 38 | IV,8.0,8.84 39 | IV,8.0,8.47 40 | IV,8.0,7.04 41 | IV,8.0,5.25 42 | IV,19.0,12.5 43 | IV,8.0,5.56 44 | IV,8.0,7.91 45 | IV,8.0,6.89 46 | -------------------------------------------------------------------------------- /ml/house-price-prediction/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas 3 | sklearn 4 | matplotlib 5 | seaborn 6 | -------------------------------------------------------------------------------- /ml/imbalanced-multi-classification/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | This is my revised code of the tutorial at: https://machinelearningmastery.com/multi-class-imbalanced-classification/ 4 | 5 | ## Setup 6 | 7 | 8 | ``` 9 | python3 -m venv venv 10 | source venv/bin/activate 11 | pip install -r requirements.txt 12 | ``` 13 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 14 | 15 | -------------------------------------------------------------------------------- /ml/imbalanced-multi-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | imbalanced-learn 3 | matplotlib 4 | pandas 5 | scikit-learn -------------------------------------------------------------------------------- /ml/openml-csv-arff/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This is the pre-processing script to convert a csv to arff for openml data upload at https://www.openml.org/d/42634 4 | 5 | After a csv is created, you need to use Weka to load the csv, save it as an arff file, and then upload to openml.org: 6 | 7 | (screenshot: saving the csv as an arff file in Weka) 8 | 9 | 10 | NOTE: https://pypi.org/project/csv2arff/ does not work - lots of errors. 11 | 12 | 13 | 14 | ## Setup 15 | 16 | Tested with Python 3.6 via virtual environment: 17 | ```shell 18 | $ python3.6 -m venv venv 19 | $ source venv/bin/activate 20 | $ jupyter notebook 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /ml/process-mining/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A script to convert a txt log into process mining log format. The key idea in this notebook is using a pre-defined time gap to identify user sessions. 4 | 5 | ## Data 6 | 7 | The log file is very big; the following commands can help get a small sample of the file for EDA and other tasks. 
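The time-gap idea from the About section reduces to a few pandas operations once the log is in a dataframe: sort by user and timestamp, compute the gap to each user's previous event, and start a new session whenever the gap exceeds a threshold. A minimal sketch (the file name, the `user`/`timestamp` column names, and the 30-minute threshold are illustrative assumptions, not taken from the notebooks):

```python
import pandas as pd

# hypothetical sample file and column names; adjust to the actual log schema
df = pd.read_csv('search_sample.csv', parse_dates=['timestamp'])
df = df.sort_values(['user', 'timestamp'])

gap = df.groupby('user')['timestamp'].diff()                 # time since the user's previous event
new_session = gap.isna() | (gap > pd.Timedelta(minutes=30))  # first event, or a long pause
df['session_id'] = new_session.cumsum()                      # running count = global session id
```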
8 | 9 | ### Utility Commands 10 | 11 | Run `$ wc -l search.txt` to find out the total number of lines in the file. When the file is too big for efficient pandas analysis, you can split it into smaller files of 10,000 lines each using the GNU coreutils version of `split` (`gsplit` on macOS): 12 | 13 | ``` 14 | $ brew install coreutils 15 | $ gsplit -a 4 -d -l 10000 file.txt search_ 16 | ``` 17 | 18 | ## Setup 19 | 20 | Tested with Python 3.6 via virtual environment: 21 | ```shell 22 | $ python3.6 -m venv venv 23 | $ source venv/bin/activate 24 | $ pip install -r requirements.txt 25 | $ jupyter notebook 26 | ``` 27 | 28 | - log2csv notebook converts the text into a csv. 29 | - log-eda notebook converts the log into a process mining log 30 | -------------------------------------------------------------------------------- /ml/process-mining/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter==1.0.0 2 | pandas==0.25.2 3 | matplotlib==3.1.2 4 | seaborn==0.9.0 5 | -------------------------------------------------------------------------------- /ml/tf-serving/.gitignore: -------------------------------------------------------------------------------- 1 | models/ -------------------------------------------------------------------------------- /ml/tf-serving/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | My revised code based on: 4 | 5 | - https://thelongrun.blog/2020/01/12/rest-api-tensorflow-serving-pt1/ 6 | - https://thelongrun.blog/2020/01/26/rest-api-tensorflow-serving-pt2/ 7 | 8 | 9 | # Setup 10 | 11 | Set up the virtual environment and install packages: 12 | ``` 13 | python3 -m venv venv 14 | source venv/bin/activate 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | Pull the TF Serving Docker image (this assumes you have installed Docker: https://docs.docker.com/get-docker/): 19 | ``` 20 | docker pull tensorflow/serving:latest 21 | ``` 22 | 23 | Create TF Serving servables from tf functions and pre-trained models; two servables will be generated and saved in the `models/` folder: 24 | 25 | ``` 26 | python make_servables.py 27 | ``` 28 | 29 | # Start TF Serving Servers 30 | 31 | Serve the two models on different host ports (8501 and 8502) by running the following commands **in the repo root folder**: 32 | ``` 33 | docker run -t --rm -p 8501:8501 -v "$(pwd)/models/mobilenet_v2_test:/models/mobilenet_v2_test" -e MODEL_NAME=mobilenet_v2_test tensorflow/serving & 34 | 35 | docker run -t --rm -p 8502:8501 -v "$(pwd)/models/add_two:/models/add_two" -e MODEL_NAME=add_two tensorflow/serving & 36 | ``` 37 | 38 | You should see two Docker containers running: 39 | 40 | (screenshot: the two tensorflow/serving containers running in Docker) 41 | 42 | # Use REST APIs for Computing/Inference 43 | 44 | Call the `AddTwo()` function using `curl`, which will add 2 to each number in the tensor: 45 | ``` 46 | curl -H "Content-Type: application/json" -d '{"instances":[1.0, 5.0, 4.0]}' http://localhost:8502/v1/models/add_two:predict 47 | ``` 48 | 49 | Call the MobileNet classifier using `curl`: 50 | ``` 51 | chmod +x client_curl.sh 52 | ./client_curl.sh ./images/animal.jpg 53 | ``` 54 | 55 | Call the MobileNet classifier via Python: 56 | 57 | ``` 58 | python client.py 59 | ``` 60 | 61 | 62 | -------------------------------------------------------------------------------- /ml/tf-serving/client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import base64 4 | 5 | data = {} 6 | 
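# TF Serving's REST API represents binary inputs as base64 strings wrapped in {"b64": ...} objects; build that payload from the raw image bytes below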
with open('images/animal.jpg', mode='rb') as file: 7 | img = file.read() 8 | data = {"inputs":[{"b64":base64.encodebytes(img).decode("utf-8")}]} 9 | 10 | # Making the request 11 | r = requests.post("http://localhost:8501/v1/models/mobilenet_v2_test:predict", data=json.dumps(data)) 12 | print(r.content) 13 | # And returns: 14 | # b'{\n "outputs": [\n "giant panda"\n ]\n}' -------------------------------------------------------------------------------- /ml/tf-serving/client_curl.sh: -------------------------------------------------------------------------------- 1 | # $1 refers to the path where the image file is located 2 | ENCODED_IMG="$(base64 $1)" 3 | (echo '{"inputs": [{"b64": "'; echo "$ENCODED_IMG"; echo '"}]}') | curl -H "Content-Type: application/json" -d @- http://localhost:8501/v1/models/mobilenet_v2_test:predict -------------------------------------------------------------------------------- /ml/tf-serving/images/animal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/animal.jpg -------------------------------------------------------------------------------- /ml/tf-serving/images/clear.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/clear.jpg -------------------------------------------------------------------------------- /ml/tf-serving/images/ponds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/ml/tf-serving/images/ponds.png -------------------------------------------------------------------------------- /ml/tf-serving/make_servables.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_hub as hub 3 | 4 | 5 | class AddTwo(tf.Module): 6 | @tf.function(input_signature=[tf.TensorSpec(shape=[None, 3], dtype=tf.float32, name='x')]) 7 | def add_two(self, x): 8 | return x + 2 9 | 10 | 11 | class CustomMobileNet(tf.keras.Model): 12 | model_handler = "https://tfhub.dev/google/imagenet/mobilenet_v2_035_224/classification/4" 13 | 14 | def __init__(self): 15 | super(CustomMobileNet, self).__init__() 16 | self.model = hub.load(self.__class__.model_handler) 17 | self.labels = None 18 | 19 | @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.string)]) 20 | def call(self, input_img): 21 | def _preprocess(img_file): 22 | img_bytes = tf.reshape(img_file, []) 23 | img = tf.io.decode_jpeg(img_bytes, channels=3) 24 | img = tf.image.convert_image_dtype(img, tf.float32) 25 | return tf.image.resize(img, (224, 224)) 26 | 27 | labels = tf.io.read_file(self.labels) 28 | labels = tf.strings.split(labels, sep='\n') 29 | img = _preprocess(input_img)[tf.newaxis,:] 30 | logits = self.model(img) 31 | get_class = lambda x: labels[tf.argmax(x)] 32 | class_text = tf.map_fn(get_class, logits, fn_output_signature=tf.string) 33 | return class_text # the predicted class label as text 34 | 35 | # create a servable from a tf function 36 | tf_func_servable = AddTwo() 37 | tf.saved_model.save(tf_func_servable, "models/add_two/1") 38 | 39 | # create a servable from a pre-trained model downloaded from tf hub 40 | tf_model_servable = CustomMobileNet() 41 | tf_model_servable.labels = 
tf.saved_model.Asset("data/ImageNetLabels.txt") # save labels txt as an asset 42 | tf.saved_model.save(tf_model_servable, "models/mobilenet_v2_test/1/") 43 | -------------------------------------------------------------------------------- /ml/tf-serving/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=2.3.1 2 | tensorflow_hub>=0.10.0 -------------------------------------------------------------------------------- /ml/tfidf-bm25/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | tfidf and bm25 examples for document retrieval using the Cranfield dataset 4 | 5 | "The Cranfield collection. This was the pioneering test collection in allowing precise quantitative measures of information retrieval effectiveness, but is nowadays too small for anything but the most elementary pilot experiments. Collected in the United Kingdom starting in the late 1950s, it contains 1398 abstracts of aerodynamics journal articles, a set of 225 queries, and exhaustive relevance judgments of all (query, document) pairs." - 6 | https://nlp.stanford.edu/IR-book/html/htmledition/standard-test-collections-1.html 7 | 8 | ## Setup 9 | 10 | ``` 11 | $ python3 -m venv venv 12 | $ source venv/bin/activate 13 | $ pip install -r requirements.txt 14 | ``` 15 | ## Data 16 | 17 | The data is in the `data` folder in JSON format: 18 | - `cranfield_docs.json`: information about 1400 documents (aerodynamics paper abstracts), each with author, bibliography, body (abstract), and title fields: 19 | ``` 20 | { 21 | 22 | "id" : 1, 23 | "author" : "brenckman,m.", 24 | "bibliography" : "j. ae. scs. 25, 1958, 324.", 25 | "body" : "experimental investigation of the aerodynamics of a wing in a slipstream . an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios . the results were intended in part as an evaluation basis for different theoretical treatments of this problem . the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary-layer-control effect . the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory . an empirical evaluation of the destalling effects was made for the specific configuration of the experiment .", 26 | "title" : "experimental investigation of the aerodynamics of a wing in a slipstream ." 27 | 28 | } 29 | ``` 30 | - `cranfield_queries.json`: 225 queries representing users' information needs. 31 | ``` 32 | { 33 | "query_id": 1, 34 | "query": "what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft ." 35 | } 36 | ``` 37 | 38 | - `cranfield_relevance.json`: the relevance score (1, 2, 3, or 4, with 1 being the highest relevance) of each query and related documents. 
39 | ``` 40 | {"query_id": "1", "r_score": 2, "doc_id": "184"}, 41 | {"query_id": "2", "r_score": 1, "doc_id": "12"}, 42 | ``` 43 | - 1 : the document is the complete answer to the query 44 | - 2 : the document has a high degree of relevance to the query 45 | - 3 : the document is useful to the query as general background information 46 | - 4 : the document is of minimum interest to the query 47 | 48 | 49 | 50 | ## Evaluation Metrics 51 | 52 | Precision and Recall are used in the examples. See https://nlp.stanford.edu/IR-book/html/htmledition/information-retrieval-system-evaluation-1.html for more evaluation metrics. 53 | -------------------------------------------------------------------------------- /ml/tfidf-bm25/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab>=2.2.9 2 | matplotlib>=3.2.1 3 | nltk>=3.5 4 | pandas>=1.0.3 5 | scikit-learn>=0.22.2 6 | rank-bm25 7 | -------------------------------------------------------------------------------- /ml/topic-modeling/.gitignore: -------------------------------------------------------------------------------- 1 | /data 2 | -------------------------------------------------------------------------------- /ml/topic-modeling/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | My revised code for 4 | 5 | https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb 6 | https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24 7 | 8 | and 9 | 10 | LDA from Scratch 11 | My revised tutorial based on https://www.depends-on-the-definition.com/lda-from-scratch/ 12 | 13 | I also found another similar tutorial at https://gist.github.com/umbertogriffo/5041b9e4ec6c3478cef99b8653530032 14 | 15 | ## Setup 16 | 17 | within the tutorial folder: 18 | 19 | ``` 20 | python3 -m venv venv 21 | source venv/bin/activate 22 | pip install -r requirements.txt 23 | ``` 24 | Then, you can use `jupyter notebook` or use VSCode `code .` to open the notebooks. 25 | 26 | -------------------------------------------------------------------------------- /ml/topic-modeling/requirements.txt: -------------------------------------------------------------------------------- 1 | gensim 2 | jupyter 3 | matplotlib 4 | nltk 5 | pandas 6 | scikit-learn -------------------------------------------------------------------------------- /ml/tweet-sentiment-analysis/README.md: -------------------------------------------------------------------------------- 1 | ## Tweet Sentiment Analysis with Python 3 2 | 3 | This is my revision of the tutorial at https://dev.to/rodolfoferro/sentiment-analysis-on-trumpss-tweets-using-python - many thanks to the author. The original repo is at https://github.com/RodolfoFerro/pandas_twitter 4 | 5 | The original author provides a markdown version of his tutorial. I combined all files into one (tutorial.md) and created an English version of the Jupyter notebook (the author only had a Spanish version). 
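The scoring itself is a one-liner with TextBlob; the percentages in the Summary below come from mapping each tweet's polarity to a label. A minimal sketch of that mapping (the helper name and the exact thresholds are illustrative, not copied from the notebook):

```python
from textblob import TextBlob

def classify(tweet_text):
    # TextBlob polarity is in [-1, 1]; zero is treated as neutral
    polarity = TextBlob(tweet_text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

print(classify('What a great rally!'))  # positive
```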
6 | 7 | ## Summary 8 | **Data**: 200 Tweets from Donald Trump: https://twitter.com/realDonaldTrump 9 | 10 | **Goal**: Conduct a sentiment analysis of the tweets, with a sample result: 11 | 12 | - Percentage of positive tweets: 53.5% 13 | - Percentage of neutral tweets: 23.0% 14 | - Percentage of negative tweets: 23.5% 15 | 16 | Python packages used: jupyter, pandas, numpy, tweepy, textblob 17 | 18 | ## API Keys 19 | To get API keys for Twitter: in order to extract tweets for later analysis, we need to access our Twitter account and create an app. The website to do this is https://apps.twitter.com/. (If you don't know how to do this, you can follow this tutorial video https://www.youtube.com/watch?v=BOA7SD_09Qk to create an account and an application.) 20 | 21 | 22 | - Consumer Key (API Key) 23 | - Consumer Secret (API Secret) 24 | - Access Token 25 | - Access Token Secret 26 | 27 | **You should never put your real API keys in the code and push them to GitHub.** We use local environment variables for the API keys: 28 | 29 | ``` 30 | # Get the API key from local environment variable 31 | CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY') 32 | CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET') 33 | ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN') 34 | ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET') 35 | ``` 36 | 37 | You need to add the following lines to the `~/.bash_profile` file: 38 | ``` 39 | export TWITTER_CONSUMER_KEY='yourealkey' 40 | export TWITTER_CONSUMER_SECRET='yourealkey' 41 | export TWITTER_ACCESS_TOKEN='yourealkey' 42 | export TWITTER_ACCESS_SECRET='yourealkey' 43 | ``` 44 | 45 | Then use `vim` to edit the file, `source` to load it, and `env` to double-check: 46 | 47 | ``` 48 | $ vim ~/.bash_profile 49 | $ source ~/.bash_profile 50 | $ env 51 | ``` 52 | **NOTE: You may need to close the Terminal window and restart it for Jupyter Notebook to read the new variables you just added.** 53 | 54 | ## Setup 55 | 56 | Clone the repo, go to the repo folder, set up the virtual environment, and install the required packages: 57 | 58 | ``` 59 | $ cd path_to_this_folder 60 | $ virtualenv -p python3 venv 61 | $ source venv/bin/activate 62 | $ pip3 install -r requirements.txt 63 | ``` 64 | 65 | Run `$ jupyter notebook` to go over the tutorial step-by-step. 66 | -------------------------------------------------------------------------------- /ml/tweet-sentiment-analysis/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas 3 | numpy 4 | tweepy 5 | textblob 6 | -------------------------------------------------------------------------------- /other/chinese-to-pinyin/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | .DS_Store 104 | 105 | # PyCharm project settings 106 | .idea 107 | -------------------------------------------------------------------------------- /other/chinese-to-pinyin/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A script to convert Chinese folder names to pinyin, revised from: http://sunzhen.blogspot.com/2016/05/rename-chinese-filenames-to-pinyin.html 4 | 5 | 6 | ## Setup 7 | 8 | This script converts all file and folder names in the sub folder "data" into Pinyin. 9 | 10 | If different characters map to the same pinyin, e.g., both "利" and "立" map to "li", a "1" is appended to the filename. 11 | 12 | Tested with Python 3.6 via virtual environment: 13 | ```shell 14 | $ python3.6 -m venv venv 15 | $ source venv/bin/activate 16 | $ python ch-to-pinyin.py 17 | ``` 18 | 19 | ## An Example 20 | 21 | Before: 22 | 23 | (screenshot: folder names in Chinese before conversion) 24 | 25 | After: 26 | 27 | (screenshot: folder names in Pinyin after conversion) 28 | -------------------------------------------------------------------------------- /other/chinese-to-pinyin/ch-to-pinyin.py: -------------------------------------------------------------------------------- 1 | # renameCH2Pinyin.py 2 | # Rename filename from Chinese characters to capitalized pinyin using the 3 | # mapping file and taking out the tone numbers 4 | 5 | import os 6 | import re 7 | 8 | # File uni2pinyin is a mapping from hex to Pinyin with a tone number 9 | f = open('uni2pinyin') 10 | wf = f.read() # read the whole mapping file 11 | 12 | os.chdir('data') # to rename all files in sub folder 'data' 13 | filename_list = os.listdir(u'.') # read all file names in unicode mode 14 | print(filename_list) 15 | for filename_unicode in filename_list: # each file name 16 | filename_pinyin = '' 17 | for c in filename_unicode: # each character 18 | if 0x4e00 <= ord(c) <= 0x9fff: # Chinese Character Unicode range 19 | hexCH = (hex(ord(c))[2:]).upper() # strip leading '0x' and change 20 | # to uppercase 21 | p = re.compile(hexCH+'\t([a-z]+)[\d]*') # define the match pattern 22 | mp = p.search(wf) 23 | filename_pinyin+=mp.group(1).title() # get the pinyin without the tone 24 | # number and capitalize it 25 | else: 26 | filename_pinyin+=c 27 | print(filename_unicode, filename_pinyin) 28 | 29 | latest_filename_list = os.listdir(u'.') 30 | while filename_pinyin in latest_filename_list: 31 | filename_pinyin = filename_pinyin + '1' 32 | 
print(filename_pinyin) 33 | os.rename(filename_unicode, filename_pinyin) 34 | os.chdir('..') # go back to the parent folder 35 | -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/.DS_Store11: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store11 -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/.DS_Store111: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store111 -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/.DS_Store111111: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store111111 -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/.DS_Store1111111: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/.DS_Store1111111 -------------------------------------------------------------------------------- /other/chinese-to-pinyin/data/白/0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/chinese-to-pinyin/data/白/0a2afd0597d8e9c7e635012241bbc9eea6622c89.jpg -------------------------------------------------------------------------------- /other/color-palette/test-palette.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/color-palette/test-palette.png -------------------------------------------------------------------------------- /other/color-palette/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/color-palette/test.png -------------------------------------------------------------------------------- /other/csv-to-bert-text/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A script to convert csv to multiple txt based on a label column 4 | 5 | ## Data 6 | 7 | ``` 8 | 1 The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor. 5 POS 9 | 2 The food was definitely more "upscale" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas. 4 NEG 10 | 3 Another small bonus was the hard-to-find Sol beer , which was great with a lime. 4 POS 11 | 4 Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!. 
5 NEU 12 | ``` 13 | ## Setup 14 | 15 | Tested with Python 3.6 via virtual environment: 16 | ```shell 17 | $ python3.6 -m venv venv 18 | $ source venv/bin/activate 19 | $ pip install -r requirements.txt 20 | $ jupyter notebook 21 | ``` 22 | -------------------------------------------------------------------------------- /other/csv-to-bert-text/csv-to-txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[27]: 5 | 6 | 7 | import pandas as pd 8 | import os 9 | import shutil 10 | 11 | 12 | # In[20]: 13 | 14 | 15 | sent = pd.read_csv("real.csv", encoding='utf8') 16 | sent 17 | 18 | 19 | # In[21]: 20 | 21 | 22 | path = os.getcwd() 23 | print ("The current working directory is %s" % path) 24 | 25 | 26 | # In[32]: 27 | 28 | 29 | # clean data folder and create new folders 30 | shutil.rmtree('./data') 31 | os.makedirs("./data/pos") 32 | os.makedirs("./data/neg") 33 | os.makedirs("./data/neu") 34 | 35 | 36 | # In[16]: 37 | 38 | 39 | def write_sent(label, id, sent): 40 | filename = "./data/"+ label +"/" + id +".txt" 41 | file = open(filename,"w") 42 | file.writelines(sent) 43 | file.close() 44 | 45 | 46 | # In[17]: 47 | 48 | 49 | for index, row in sent.iterrows(): 50 | write_sent(row['SentiLabel_food'].lower(), str(row['SentenceID']), row['Sentences']) 51 | print("writing sentence") 52 | -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/neg/43.txt: -------------------------------------------------------------------------------- 1 | I was slightly saddened that either they do not offer chalula here or that they ran out. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/neg/44.txt: -------------------------------------------------------------------------------- 1 | Sometimes I feel that breakfast is just not complete without it and hope that they can stock up because that louisiana hot sauce they carried did not cut it. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/neu/29.txt: -------------------------------------------------------------------------------- 1 | Food was ok , -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/1.txt: -------------------------------------------------------------------------------- 1 | The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/2.txt: -------------------------------------------------------------------------------- 1 | The food was definitely more "upscale" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/20.txt: -------------------------------------------------------------------------------- 1 | We ordered 2 appetizers , 2 entrees , 2 bottles of wine... food was very good not great. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/3.txt: -------------------------------------------------------------------------------- 1 | Another small bonus was the hard-to-find Sol beer , which was great with a lime. 
-------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/4.txt: -------------------------------------------------------------------------------- 1 | Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/40.txt: -------------------------------------------------------------------------------- 1 | The food came out in about 10 minutes and my skillet was cooked very well. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/41.txt: -------------------------------------------------------------------------------- 1 | The sunny side up eggs here are some of the best I have ever had- they were slightly chewy and full of flavor. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/42.txt: -------------------------------------------------------------------------------- 1 | My skillet came with cheddar cheese mushroom , broccoli , and tomtaoes and it was a hearty meal that , when combined with ketchup and salt , delivered very satisfying feelings to my tastebuds and stomach. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/46.txt: -------------------------------------------------------------------------------- 1 | They were hot and had a soft , spongy texture. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/47.txt: -------------------------------------------------------------------------------- 1 | Pretty delicious and satisfied my sweet tooth. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/5.txt: -------------------------------------------------------------------------------- 1 | After shopping around , my husband and I both think they have the best pizza around Highland Park. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/50.txt: -------------------------------------------------------------------------------- 1 | Their sandwich specials looked great. -------------------------------------------------------------------------------- /other/csv-to-bert-text/data/pos/8.txt: -------------------------------------------------------------------------------- 1 | It's a homey place with good food. 
-------------------------------------------------------------------------------- /other/csv-to-bert-text/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter==1.0.0 2 | pandas==0.25.2 3 | -------------------------------------------------------------------------------- /other/csv-to-bert-text/sample.csv: -------------------------------------------------------------------------------- 1 | SentenceID,Sentences,SentiScore_food,SentiLabel_food 2 | 1,"The pork tenderloin with mole sauce was perfect , and the side dishes added a lot of nice , fresh flavor.",5,POS 3 | 2,"The food was definitely more ""upscale"" than traditional Mexican , so don't come here if you are looking for Qdoba style burritos and quesadillas.",4,POS 4 | 3,"Another small bonus was the hard-to-find Sol beer , which was great with a lime.",4,POS 5 | 4,"Speaking of lime , the guacamole starter was as fresh as it gets (made tableside) and had a nice kick to it!.",5,POS 6 | 5,"After shopping around , my husband and I both think they have the best pizza around Highland Park.",5,POS 7 | 8,It's a homey place with good food.,4,POS 8 | 20,"We ordered 2 appetizers , 2 entrees , 2 bottles of wine... food was very good not great.",4,POS 9 | 29,"Food was ok , ",3,NEU 10 | 40,The food came out in about 10 minutes and my skillet was cooked very well.,5,POS 11 | 41,The sunny side up eggs here are some of the best I have ever had- they were slightly chewy and full of flavor.,5,POS 12 | 42,"My skillet came with cheddar cheese mushroom , broccoli , and tomtaoes and it was a hearty meal that , when combined with ketchup and salt , delivered very satisfying feelings to my tastebuds and stomach.",5,POS 13 | 43,I was slightly saddened that either they do not offer chalula here or that they ran out.,2,NEG 14 | 44,Sometimes I feel that breakfast is just not complete without it and hope that they can stock up because that louisiana hot sauce they carried did not cut it.,2,NEG 15 | 46,"They were hot and had a soft , spongy texture.",4,POS 16 | 47,Pretty delicious and satisfied my sweet tooth.,4,POS 17 | 50,Their sandwich specials looked great.,5,POS -------------------------------------------------------------------------------- /other/list-like-to-list/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /other/list-like-to-list/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /other/list-like-to-list/.idea/movie-genre.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /other/list-like-to-list/README.md: -------------------------------------------------------------------------------- 1 | # Intro 2 | This is a program to do some data transformation. 
3 | 4 | The key challenge is converting the following "list-like" string into a real list: 5 | 6 | ``` 7 | "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]" 8 | ``` 9 | 10 | `ast.literal_eval` does the trick: https://docs.python.org/2/library/ast.html 11 | 12 | Another useful trick is converting a list of list: 13 | 14 | ``` 15 | [[862, 16], [862, 35], [862, 10751], [8844, 12]] 16 | ``` 17 | into a csv file: 18 | 19 | ``` 20 | 862,16 21 | 862,35 22 | 862,10751 23 | 8844,12 24 | ``` 25 | pandas makes it easy: 26 | ``` 27 | my_df = pd.DataFrame(my_list) 28 | my_df.to_csv('my_csv.csv', index=False, header=False) 29 | ``` 30 | 31 | # Run 32 | 33 | Python 2.x 34 | 35 | - create virtual environment: `$virtualenv venv` 36 | - activate virtual env: `$source venv/bin/activate` 37 | - install required packages: `pip install -r requirements.txt` 38 | -------------------------------------------------------------------------------- /other/list-like-to-list/movie.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import ast 3 | import pandas as pd 4 | 5 | movies = [] 6 | # how to read a csv into a list 7 | with open('input.csv', 'rb') as f: 8 | reader = csv.reader(f) 9 | movies = list(reader) 10 | 11 | my_list = [] 12 | 13 | for movie in movies: 14 | # print '****** one movie ******' 15 | for genre in ast.literal_eval(movie[1]): 16 | line = [] 17 | # print '****** one genre ******' 18 | # print movie[0] 19 | # print genre['id'] 20 | # print genre['name'] 21 | line.append(int(movie[0])) 22 | line.append(genre['id']) 23 | # print line 24 | my_list.append(line) 25 | 26 | print(my_list) 27 | my_df = pd.DataFrame(my_list) 28 | my_df.to_csv('movie_genre.csv', index=False, header=False) 29 | print 'See result in movie_genre.csv file' 30 | -------------------------------------------------------------------------------- /other/list-like-to-list/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==0.21.0 2 | -------------------------------------------------------------------------------- /other/list-of-dicts-to-columns/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/tutorial-buffet/b466fa10aa1d7f55d6f98feb177fa705d8bc87da/other/list-of-dicts-to-columns/README.md -------------------------------------------------------------------------------- /other/list-of-dicts-to-columns/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | pandas -------------------------------------------------------------------------------- /other/screenshot-gif-generation/.gitignore: -------------------------------------------------------------------------------- 1 | /screenshots/*.jpg 2 | /screenshots/*.gif 3 | -------------------------------------------------------------------------------- /other/screenshot-gif-generation/README.md: -------------------------------------------------------------------------------- 1 | # Generate Screenshots and Gifs via Python 2 | 3 | Code is revised based on: 4 | - https://blog.csdn.net/qq_38161040/article/details/91040640 5 | - https://medium.com/swlh/python-animated-images-6a85b9b68f86 -------------------------------------------------------------------------------- /other/screenshot-gif-generation/gif-generation.ipynb: -------------------------------------------------------------------------------- 1 
| { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3.7.7-final" 14 | }, 15 | "orig_nbformat": 2, 16 | "kernelspec": { 17 | "name": "python_defaultSpec_1600265837621", 18 | "display_name": "Python 3.7.7 64-bit ('venv': venv)" 19 | } 20 | }, 21 | "nbformat": 4, 22 | "nbformat_minor": 2, 23 | "cells": [ 24 | { 25 | "source": [ 26 | "# Generate Screenshots and Gifs via Python\n", 27 | "\n", 28 | "Code is revised based on:\n", 29 | "- https://blog.csdn.net/qq_38161040/article/details/91040640\n", 30 | "- https://medium.com/swlh/python-animated-images-6a85b9b68f86" 31 | ], 32 | "cell_type": "markdown", 33 | "metadata": {} 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 27, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from PIL import ImageGrab\n", 42 | "from PIL import Image\n", 43 | "import time" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 28, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# take a screenshot every 0.1 second, 10 jpg saved\n", 53 | "total_images = 10 # total screenshots\n", 54 | "interval = 0.1 # the interval to take a screenshot\n", 55 | "resize_ratio = 0.3 # the resize ratio to keep the screenshot smaller\n", 56 | "\n", 57 | "for i in range(total_images):\n", 58 | " time.sleep(interval)\n", 59 | " img = ImageGrab.grab()\n", 60 | " width = img.size[0]\n", 61 | " height = img.size[1]\n", 62 | "\n", 63 | " img = img.resize(\n", 64 | " (int(width*resize_ratio), int(height*resize_ratio)), \n", 65 | " Image.ANTIALIAS)\n", 66 | " \n", 67 | " img = img.convert('RGB') # if save to jpg\n", 68 | " img.save(f'./screenshots/screenshot{str(i+1)}.jpg')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 29, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# generate the gif\n", 78 | "import imageio\n", 79 | "\n", 80 | "gif_images = []\n", 81 | "for i in range(total_images):\n", 82 | " gif_images.append(imageio.imread(f'./screenshots/screenshot{str(i+1)}.jpg'))\n", 83 | "\n", 84 | "imageio.mimsave(\"./screenshots/screenshot.gif\", gif_images, fps=5)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 30, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# reduce the gif file size\n", 94 | "from pygifsicle import optimize\n", 95 | "\n", 96 | "gif_orginal = './screenshots/screenshot.gif'\n", 97 | "\n", 98 | "# create a new onegit \n", 99 | "optimize(gif_orginal, './screenshots/screenshot_optimized.gif')\n", 100 | "\n", 101 | "# overwrite the original one if needed\n", 102 | "# optimize(gif_orginal)" 103 | ] 104 | } 105 | ] 106 | } -------------------------------------------------------------------------------- /other/screenshot-gif-generation/git-gen.py: -------------------------------------------------------------------------------- 1 | # # Generate Screenshots and Gifs via Python 2 | # 3 | # Code is revised based on: 4 | # - https://blog.csdn.net/qq_38161040/article/details/91040640 5 | # - https://medium.com/swlh/python-animated-images-6a85b9b68f86 6 | # pip install the following three packages: imageio, pillow, pygifsicle 7 | 8 | # %% 9 | from PIL import ImageGrab 10 | from PIL import Image 11 | import time 12 | 13 | 14 | # %% 15 | # take a screenshot every 0.1 second, 10 jpg saved 16 | total_images = 10 # 
total screenshots 17 | interval = 0.1 # the interval to take a screenshot 18 | resize_ratio = 0.3 # the resize ratio to keep the screenshot smaller 19 | 20 | for i in range(total_images): 21 | time.sleep(interval) 22 | img = ImageGrab.grab() 23 | width = img.size[0] 24 | height = img.size[1] 25 | 26 | img = img.resize( 27 | (int(width*resize_ratio), int(height*resize_ratio)), 28 | Image.LANCZOS) # same filter as the older Image.ANTIALIAS alias 29 | 30 | img = img.convert('RGB') # if save to jpg 31 | img.save(f'./screenshots/screenshot{str(i+1)}.jpg') 32 | 33 | 34 | # %% 35 | import imageio 36 | 37 | gif_images = [] 38 | for i in range(total_images): 39 | gif_images.append(imageio.imread(f'./screenshots/screenshot{str(i+1)}.jpg')) 40 | 41 | imageio.mimsave("./screenshots/screenshot.gif", gif_images, fps=5) 42 | 43 | 44 | # %% 45 | from pygifsicle import optimize 46 | 47 | gif_original = './screenshots/screenshot.gif' 48 | 49 | # create a new optimized gif 50 | optimize(gif_original, './screenshots/screenshot_optimized.gif') 51 | 52 | # overwrite the original one if needed 53 | # optimize(gif_original) 54 | 55 | 56 | -------------------------------------------------------------------------------- /other/screenshot-gif-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | imageio 3 | pillow 4 | pygifsicle -------------------------------------------------------------------------------- /other/screenshot-gif-generation/screenshots/screenshot-folder.md: -------------------------------------------------------------------------------- 1 | Temp files are saved in this folder --------------------------------------------------------------------------------