├── .devcontainer.json
├── .dvc
│   ├── .gitignore
│   └── config
├── .dvcignore
├── .gitattributes
├── .github
│   └── workflows
│       └── cml.yaml
├── .gitignore
├── .gitlab-ci.yml
├── README.md
├── data
│   ├── .gitignore
│   └── data.xml.dvc
├── dvc.lock
├── dvc.yaml
├── params.yaml
└── src
    ├── evaluate.py
    ├── featurization.py
    ├── prepare.py
    ├── requirements.txt
    └── train.py

/.devcontainer.json:
--------------------------------------------------------------------------------
{
  "name": "example-get-started",
  "image": "mcr.microsoft.com/devcontainers/python:3.10",
  "extensions": ["Iterative.dvc", "ms-python.python", "redhat.vscode-yaml"],
  "features": {
    "ghcr.io/iterative/features/dvc:1": {}
  },
  "postCreateCommand": "pip3 install --user -r src/requirements.txt"
}

--------------------------------------------------------------------------------
/.dvc/.gitignore:
--------------------------------------------------------------------------------
/config.local
/tmp
/cache

--------------------------------------------------------------------------------
/.dvc/config:
--------------------------------------------------------------------------------
[core]
    remote = public-s3
['remote "public-s3"']
    url = https://remote.dvc.org/get-started

--------------------------------------------------------------------------------
/.dvcignore:
--------------------------------------------------------------------------------
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.dvc linguist-language=YAML
dvc.lock linguist-language=YAML

--------------------------------------------------------------------------------
/.github/workflows/cml.yaml:
--------------------------------------------------------------------------------
name: CML Report
on: pull_request
jobs:
  run:
    runs-on: [ubuntu-latest]
    steps:
      - uses: iterative/setup-cml@v2
      - uses: iterative/setup-dvc@v1
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      # Needed for https://github.com/iterative/example-repos-dev/issues/225
      - name: Install JSON5
        run: npm install -g json5
      - name: Generate metrics report
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml ci
          if [ $GITHUB_REF = refs/heads/main ]; then
            PREVIOUS_REF=HEAD~1
          else
            PREVIOUS_REF=main
            git fetch origin main:main
          fi

          dvc pull eval
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets ROC | json5 > vega.json
          vl2svg vega.json roc.svg

          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Precision-Recall | json5 > vega.json
          vl2svg vega.json prc.svg

          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Confusion-Matrix | json5 > vega.json
          vl2svg vega.json confusion.svg

          cp eval/plots/images/importance.png importance_workspace.png

          git checkout $PREVIOUS_REF -- dvc.lock
          cp eval/plots/images/importance.png importance_previous.png

          dvc_report=$(dvc exp diff $PREVIOUS_REF --md)

          cat <<EOF > report.md
          # CML Report
          ## Plots
          ![ROC](./roc.svg)
          ![Precision-Recall](./prc.svg)
          ![Confusion Matrix](./confusion.svg)
          #### Feature Importance: ${PREVIOUS_REF}
          ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
          #### Feature Importance: workspace
          ![Feature Importance: workspace](./importance_workspace.png)

          ## Metrics and Params
          ### ${PREVIOUS_REF} → workspace
          ${dvc_report}
          EOF

          cml comment create --publish --pr=false report.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.venv/
/model.pkl
/eval

--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
report:
  rules:
    - if: $CI_PIPELINE_SOURCE == 'merge_request_event'
    - if: $CI_COMMIT_BRANCH == 'main'
  image: dvcorg/cml:0-dvc3-base1
  before_script:
    - cml ci && cml --version
    - npm install -g json5
  script: |
    if [ $CI_COMMIT_REF_NAME = main ]; then
      PREVIOUS_REF=HEAD~1
      COMMIT_HASH1=$CI_COMMIT_BEFORE_SHA
      COMMIT_HASH2=$CI_COMMIT_SHA
    else
      PREVIOUS_REF=main
      git fetch --depth=1 origin main:main
      COMMIT_HASH1=$CI_MERGE_REQUEST_DIFF_BASE_SHA
      COMMIT_HASH2=$CI_COMMIT_SHA
    fi

    dvc pull eval
    dvc plots diff $PREVIOUS_REF workspace \
      --show-vega --targets ROC | json5 > vega.json
    vl2svg vega.json roc.svg

    dvc plots diff $PREVIOUS_REF workspace \
      --show-vega --targets Precision-Recall | json5 > vega.json
    vl2svg vega.json prc.svg

    dvc plots diff $PREVIOUS_REF workspace \
      --show-vega --targets Confusion-Matrix | json5 > vega.json
    vl2svg vega.json confusion.svg

    cp eval/plots/images/importance.png importance_workspace.png

    git checkout $PREVIOUS_REF -- dvc.lock
    cp eval/plots/images/importance.png importance_previous.png

    dvc_report=$(dvc exp diff $PREVIOUS_REF --md)

    cat <<EOF > report.md
    # CML Report
    [![DVC](https://img.shields.io/badge/-Open_in_Studio-grey?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/views/example-get-started-2gpv7kdqx2?panels=plots%2C%3Bcompare%2C&commits=${COMMIT_HASH2}%3B${COMMIT_HASH1}&activeCommits=${COMMIT_HASH1}%3Aprimary%3B${COMMIT_HASH2}%3Apurple)
    ## Plots
    ![ROC](./roc.svg)
    ![Precision-Recall](./prc.svg)
    ![Confusion Matrix](./confusion.svg)
    #### Feature Importance: ${PREVIOUS_REF}
    ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
    #### Feature Importance: workspace
    ![Feature Importance: workspace](./importance_workspace.png)

    ## Metrics and Params
    ### ${PREVIOUS_REF} → workspace
    ${dvc_report}
    EOF

    if [ $CI_COMMIT_REF_NAME = main ]; then
      cml comment create --target=commit report.md
    else
      cml comment update --target=pr report.md
    fi

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![DVC](https://img.shields.io/badge/-Open_in_Studio-grey.svg?style=flat-square&logo=dvc)](https://studio.iterative.ai/team/Iterative/views/example-get-started-zde16i6c4g)

# DVC Get Started

This is an auto-generated repository for use in [DVC](https://dvc.org)
[Get Started](https://dvc.org/doc/get-started). It is a step-by-step quick
introduction to basic DVC concepts.

![](https://static.iterative.ai/img/example-get-started/readme-head.png)

The project is a natural language processing (NLP) binary classification
problem: predicting tags for a given StackOverflow question. For example, we
want a classifier that can detect posts about the R language and tag them
`R`.

🐛 Please report any issues found in this project at
[example-repos-dev](https://github.com/iterative/example-repos-dev).

## Installation

Python 3.9+ is required to run code from this repo.

```console
$ git clone https://github.com/iterative/example-get-started
$ cd example-get-started
```

Now let's install the requirements. But before we do that, we **strongly**
recommend creating a virtual environment with a tool such as
[virtualenv](https://virtualenv.pypa.io/en/stable/):

```console
$ virtualenv -p python3 .venv
$ source .venv/bin/activate
$ pip install -r src/requirements.txt
```

> These instructions assume that DVC is already installed, as it is frequently
> used as a global tool like Git. If DVC is not installed, see the
> [DVC installation guide](https://dvc.org/doc/install).

This DVC project comes with a preconfigured DVC
[remote storage](https://dvc.org/doc/command-reference/remote) that holds the
raw input data as well as the intermediate and final results it produces. This
is a read-only HTTP remote.

```console
$ dvc remote list
storage https://remote.dvc.org/get-started
```

You can run [`dvc pull`](https://man.dvc.org/pull) to download the data:

```console
$ dvc pull
```

## Running in your environment

Run [`dvc exp run`](https://man.dvc.org/exp/run) to reproduce the
[pipeline](https://dvc.org/doc/user-guide/pipelines) and create a new
[experiment](https://dvc.org/doc/user-guide/experiment-management).

```console
$ dvc exp run
Ran experiment(s): rapid-cane
Experiment results have been applied to your workspace.
```

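To try out new values for one or more of the parameters from `params.yaml`,
you can pass them directly on the command line with `--set-param` (the value
here is just an illustration):

```console
$ dvc exp run --set-param train.n_est=100
```
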
If you'd like to test commands like [`dvc push`](https://man.dvc.org/push)
that require write access to the remote storage, the easiest way is to set up
a "local remote" on your file system:

> This kind of remote is located in the local file system, but is external to
> the DVC project.

```console
$ mkdir -p /tmp/dvc-storage
$ dvc remote add local /tmp/dvc-storage
```

You should now be able to run:

```console
$ dvc push -r local
```

## Existing stages

With the help of Git tags, this project reflects the sequence of actions that
are run in the DVC [get started](https://dvc.org/doc/get-started) guide. Feel
free to check out any of them and play with the DVC commands in a ready-made
playground.

- `0-git-init`: Empty Git repository initialized.
- `1-dvc-init`: DVC has been initialized. `.dvc/` with the cache directory
  created.
- `2-track-data`: Raw data file `data.xml` downloaded and tracked with DVC
  using [`dvc add`](https://man.dvc.org/add). First `.dvc` file created.
- `3-config-remote`: Remote HTTP storage initialized. It's a shared read-only
  storage that contains all data artifacts produced during the next steps.
- `4-import-data`: Use `dvc import` to get the same `data.xml` from the DVC
  data registry.
- `5-source-code`: Source code downloaded and put into Git.
- `6-prepare-stage`: Create `dvc.yaml` and the first pipeline stage with
  [`dvc run`](https://man.dvc.org/run). It transforms XML data into TSV.
- `7-ml-pipeline`: Feature extraction and train stages created. It takes data
  in TSV format and produces two `.pkl` files that contain serialized feature
  matrices. The train stage runs a random forest classifier and creates the
  `model.pkl` file.
- `8-evaluation`: Evaluation stage. Runs the model on a test dataset to
  produce its AUC performance value. The result is dumped into a DVC metrics
  file so that we can compare it with other experiments later.
- `9-bigrams-model`: Bigrams experiment; the code has been modified to extract
  more features. We run [`dvc repro`](https://man.dvc.org/repro) for the first
  time to illustrate how DVC can reuse cached files and detect changes along
  the computational graph, regenerating the model with the updated data.
- `10-bigrams-experiment`: Reproduce the evaluation stage with the
  bigrams-based model.
- `11-random-forest-experiments`: Reproduce experiments to tune the random
  forest classifier parameters and select the best experiment.

There are three additional tags:

- `baseline-experiment`: First end-to-end result that we have a performance
  metric for.
- `bigrams-experiment`: Second experiment (model trained using bigram
  features).
- `random-forest-experiments`: Best of the additional experiments tuning
  random forest parameters.

These tags can be used to illustrate `-a` or `-T` options across different
[DVC commands](https://man.dvc.org/).

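For example, `dvc exp diff` (which the CI scripts in this repo use to build
their reports) can compare the parameters and metrics of any two of the tags
listed above:

```console
$ dvc exp diff baseline-experiment bigrams-experiment
```
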
## Project structure

The data files, DVC files, and results change as stages are created one by
one. After cloning and using [`dvc pull`](https://man.dvc.org/pull) to
download data, models, and plots tracked by DVC, the workspace should look
like this:

```console
$ tree
.
├── README.md
├── data                    # <-- Directory with raw and intermediate data
│   ├── data.xml            # <-- Initial XML StackOverflow dataset (raw data)
│   ├── data.xml.dvc        # <-- .dvc file - a placeholder/pointer to raw data
│   ├── features            # <-- Extracted feature matrices
│   │   ├── test.pkl
│   │   └── train.pkl
│   └── prepared            # <-- Processed dataset (split and TSV formatted)
│       ├── test.tsv
│       └── train.tsv
├── dvc.lock
├── dvc.yaml                # <-- DVC pipeline file
├── eval
│   ├── metrics.json        # <-- Binary classifier final metrics (e.g. AUC)
│   └── plots
│       ├── images
│       │   └── importance.png  # <-- Feature importance plot
│       └── sklearn         # <-- Data points for ROC, confusion matrix
│           ├── cm
│           │   ├── test.json
│           │   └── train.json
│           ├── prc
│           │   ├── test.json
│           │   └── train.json
│           └── roc
│               ├── test.json
│               └── train.json
├── model.pkl               # <-- Trained model file
├── params.yaml             # <-- Parameters file
└── src                     # <-- Source code to run the pipeline stages
    ├── evaluate.py
    ├── featurization.py
    ├── prepare.py
    ├── requirements.txt    # <-- Python dependencies needed in the project
    └── train.py
```

--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
/data.xml
/prepared
/features

--------------------------------------------------------------------------------
/data/data.xml.dvc:
--------------------------------------------------------------------------------
md5: f5ea021eddd7b1df6de80b904cba1da6
frozen: true
deps:
- path: get-started/data.xml
  repo:
    url: https://github.com/iterative/dataset-registry
    rev_lock: f59388cd04276e75d70b2136597aaa27e7937cc3
outs:
- md5: 22a1a2931c8370d3aeedd7183606fd7f
  size: 14445097
  hash: md5
  path: data.xml

--------------------------------------------------------------------------------
/dvc.lock:
--------------------------------------------------------------------------------
schema: '2.0'
stages:
  prepare:
    cmd: python src/prepare.py data/data.xml
    deps:
    - path: data/data.xml
      hash: md5
      md5: 22a1a2931c8370d3aeedd7183606fd7f
      size: 14445097
    - path: src/prepare.py
      hash: md5
      md5: f54d670ac8a4f63206781fc31d1f2651
      size: 2231
    params:
      params.yaml:
        prepare.seed: 20170428
        prepare.split: 0.2
    outs:
    - path: data/prepared
      hash: md5
      md5: 153aad06d376b6595932470e459ef42a.dir
      size: 8437363
      nfiles: 2
  featurize:
    cmd: python src/featurization.py data/prepared data/features
    deps:
    - path: data/prepared
      hash: md5
      md5: 153aad06d376b6595932470e459ef42a.dir
      size: 8437363
      nfiles: 2
    - path: src/featurization.py
      hash: md5
      md5: e22789fc9581cad11ef7a6fa3aa3f17b
      size: 4158
    params:
      params.yaml:
        featurize.max_features: 200
        featurize.ngrams: 2
    outs:
    - path: data/features
      hash: md5
      md5: f35d4cc2c552ac959ae602162b8543f3.dir
      size: 2232588
      nfiles: 2
  train:
    cmd: python src/train.py data/features model.pkl
    deps:
    - path: data/features
      hash: md5
      md5: f35d4cc2c552ac959ae602162b8543f3.dir
      size: 2232588
      nfiles: 2
    - path: src/train.py
      hash: md5
      md5: 324001573ed724e5ae092226fcf9ca30
      size: 1666
    params:
      params.yaml:
        train.min_split: 0.01
        train.n_est: 50
        train.seed: 20170428
    outs:
    - path: model.pkl
      hash: md5
      md5: d1f6e055f7f5e2827fcfae68d9b64d4c
      size: 1958115
  evaluate:
    cmd: python src/evaluate.py model.pkl data/features
    deps:
    - path: data/features
      hash: md5
      md5: f35d4cc2c552ac959ae602162b8543f3.dir
      size: 2232588
      nfiles: 2
    - path: model.pkl
      hash: md5
      md5: d1f6e055f7f5e2827fcfae68d9b64d4c
      size: 1958115
    - path: src/evaluate.py
      hash: md5
      md5: a1a59f55636170fb56e0c6afd3e28fa4
      size: 3315
    outs:
    - path: eval
      hash: md5
      md5: 80a081570c800c60b9b98ca4b3c91dd7.dir
      size: 1292342
      nfiles: 8

--------------------------------------------------------------------------------
/dvc.yaml:
--------------------------------------------------------------------------------
artifacts:
  stackoverflow-dataset:
    path: data/data.xml
    type: dataset
    desc: Initial XML StackOverflow dataset (raw data)
  text-classification:
    path: model.pkl
    desc: Detect whether the given StackOverflow question should have the R language tag
    type: model
    labels:
    - nlp
    - classification
    - stackoverflow
stages:
  prepare:
    cmd: python src/prepare.py data/data.xml
    deps:
    - data/data.xml
    - src/prepare.py
    params:
    - prepare.seed
    - prepare.split
    outs:
    - data/prepared
  featurize:
    cmd: python src/featurization.py data/prepared data/features
    deps:
    - data/prepared
    - src/featurization.py
    params:
    - featurize.max_features
    - featurize.ngrams
    outs:
    - data/features
  train:
    cmd: python src/train.py data/features model.pkl
    deps:
    - data/features
    - src/train.py
    params:
    - train.min_split
    - train.n_est
    - train.seed
    outs:
    - model.pkl
  evaluate:
    cmd: python src/evaluate.py model.pkl data/features
    deps:
    - data/features
    - model.pkl
    - src/evaluate.py
    outs:
    - eval
metrics:
- eval/metrics.json
plots:
- ROC:
    template: simple
    x: fpr
    y:
      eval/plots/sklearn/roc/train.json: tpr
      eval/plots/sklearn/roc/test.json: tpr
- Confusion-Matrix:
    template: confusion
    x: actual
    y:
      eval/plots/sklearn/cm/train.json: predicted
      eval/plots/sklearn/cm/test.json: predicted
- Precision-Recall:
    template: simple
    x: recall
    y:
      eval/plots/sklearn/prc/train.json: precision
      eval/plots/sklearn/prc/test.json: precision
- eval/plots/images/importance.png

--------------------------------------------------------------------------------
/params.yaml:
--------------------------------------------------------------------------------
prepare:
  split: 0.20
  seed: 20170428

featurize:
  max_features: 200
  ngrams: 2

train:
  seed: 20170428
  n_est: 50
  min_split: 0.01

--------------------------------------------------------------------------------
/src/evaluate.py:
--------------------------------------------------------------------------------
import json
import math
import os
import pickle
import sys

import pandas as pd
from sklearn import metrics
from sklearn import tree
from dvclive import Live
from matplotlib import pyplot as plt


def evaluate(model, matrix, split, live, save_path):
    """
    Dump all evaluation metrics and plots for given datasets.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        split (str): Dataset name.
        live (dvclive.Live): Dvclive instance.
        save_path (str): Path to save the metrics.
    """
    labels = matrix[:, 1].toarray().astype(int)
    x = matrix[:, 2:]

    predictions_by_class = model.predict_proba(x)
    predictions = predictions_by_class[:, 1]

    # Use dvclive to log a few simple metrics...
    avg_prec = metrics.average_precision_score(labels, predictions)
    roc_auc = metrics.roc_auc_score(labels, predictions)
    if not live.summary:
        live.summary = {"avg_prec": {}, "roc_auc": {}}
    live.summary["avg_prec"][split] = avg_prec
    live.summary["roc_auc"][split] = roc_auc

    # ... and plots...
    # ... like an ROC plot...
    live.log_sklearn_plot("roc", labels, predictions, name=f"roc/{split}")
    # ... and a precision-recall plot...
    # ... which passes `drop_intermediate=True` to the sklearn method...
    live.log_sklearn_plot(
        "precision_recall",
        labels,
        predictions,
        name=f"prc/{split}",
        drop_intermediate=True,
    )
    # ... and a confusion matrix plot
    live.log_sklearn_plot(
        "confusion_matrix",
        labels.squeeze(),
        predictions_by_class.argmax(-1),
        name=f"cm/{split}",
    )


def save_importance_plot(live, model, feature_names):
    """
    Save feature importance plot.

    Args:
        live (dvclive.Live): DVCLive instance.
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        feature_names (list): List of feature names.
    """
    fig, axes = plt.subplots(dpi=100)
    fig.subplots_adjust(bottom=0.2, top=0.95)
    axes.set_ylabel("Mean decrease in impurity")

    importances = model.feature_importances_
    forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30)
    forest_importances.plot.bar(ax=axes)

    live.log_image("importance.png", fig)


def main():
    EVAL_PATH = "eval"

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython evaluate.py model features\n")
        sys.exit(1)

    model_file = sys.argv[1]
    train_file = os.path.join(sys.argv[2], "train.pkl")
    test_file = os.path.join(sys.argv[2], "test.pkl")

    # Load model and data.
    with open(model_file, "rb") as fd:
        model = pickle.load(fd)

    with open(train_file, "rb") as fd:
        train, feature_names = pickle.load(fd)

    with open(test_file, "rb") as fd:
        test, _ = pickle.load(fd)

    # Evaluate train and test datasets.
    with Live(EVAL_PATH, dvcyaml=False) as live:
        evaluate(model, train, "train", live, save_path=EVAL_PATH)
        evaluate(model, test, "test", live, save_path=EVAL_PATH)

        # Dump feature importance plot.
        save_importance_plot(live, model, feature_names)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/src/featurization.py:
--------------------------------------------------------------------------------
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import yaml
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_df(data):
    """Read the input data file and return a data frame."""
    df = pd.read_csv(
        data,
        encoding="utf-8",
        header=None,
        delimiter="\t",
        names=["id", "label", "text"],
    )
    sys.stderr.write(f"The input data frame {data} size is {df.shape}\n")
    return df


def save_matrix(df, matrix, names, output):
    """
    Save the matrix to a pickle file.

    Args:
        df (pandas.DataFrame): Input data frame.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        names (list): List of feature names.
        output (str): Output file name.
    """
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = "The output matrix {} size is {} and data type is {}\n"
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)


def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Generate train feature matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    train_words = np.array(df_train.text.str.lower().values)

    bag_of_words.fit(train_words)

    train_words_binary_matrix = bag_of_words.transform(train_words)
    feature_names = bag_of_words.get_feature_names_out()

    tfidf.fit(train_words_binary_matrix)
    train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)

    save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)


def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Generate test feature matrix.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_test = get_df(test_input)
    test_words = np.array(df_test.text.str.lower().values)

    test_words_binary_matrix = bag_of_words.transform(test_words)
    test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
    feature_names = bag_of_words.get_feature_names_out()

    save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)


def main():
    params = yaml.safe_load(open("params.yaml"))["featurize"]

    np.set_printoptions(suppress=True)

    if len(sys.argv) != 3 and len(sys.argv) != 5:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

    max_features = params["max_features"]
    ngrams = params["ngrams"]

    os.makedirs(out_path, exist_ok=True)

    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )

    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/src/prepare.py:
--------------------------------------------------------------------------------
import os
import random
import re
import sys
import xml.etree.ElementTree

import yaml


def process_posts(input_lines, fd_out_train, fd_out_test, target_tag, split):
    """
    Process the input lines and write the output to the output files.

    Args:
        input_lines (list): List of input lines.
        fd_out_train (file): Output file for the training data set.
        fd_out_test (file): Output file for the test data set.
        target_tag (str): Target tag.
        split (float): Test data set split ratio.
    """
    num = 1
    for line in input_lines:
        try:
            fd_out = fd_out_train if random.random() > split else fd_out_test
            attr = xml.etree.ElementTree.fromstring(line).attrib

            pid = attr.get("Id", "")
            label = 1 if target_tag in attr.get("Tags", "") else 0
            title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
            body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
            text = title + " " + body

            fd_out.write("{}\t{}\t{}\n".format(pid, label, text))

            num += 1
        except Exception as ex:
            sys.stderr.write(f"Skipping the broken line {num}: {ex}\n")


def main():
    params = yaml.safe_load(open("params.yaml"))["prepare"]

    if len(sys.argv) != 2:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython prepare.py data-file\n")
        sys.exit(1)

    # Test data set split ratio
    split = params["split"]
    random.seed(params["seed"])

    input = sys.argv[1]
    output_train = os.path.join("data", "prepared", "train.tsv")
    output_test = os.path.join("data", "prepared", "test.tsv")

    os.makedirs(os.path.join("data", "prepared"), exist_ok=True)

    input_lines = []
    with open(input) as fd_in:
        input_lines = fd_in.readlines()

    fd_out_train = open(output_train, "w", encoding="utf-8")
    fd_out_test = open(output_test, "w", encoding="utf-8")

    process_posts(
        input_lines=input_lines,
        fd_out_train=fd_out_train,
        fd_out_test=fd_out_test,
        target_tag="<r>",  # R tag marker as it appears in the StackOverflow "Tags" attribute
        split=split,
    )

    fd_out_train.close()
    fd_out_test.close()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/src/requirements.txt:
--------------------------------------------------------------------------------
dvclive>=3.0
pandas
pyaml
scikit-learn>=1.3
scipy
matplotlib

--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
import os
import pickle
import sys

import numpy as np
import yaml
from sklearn.ensemble import RandomForestClassifier


def train(seed, n_est, min_split, matrix):
    """
    Train a random forest classifier.

    Args:
        seed (int): Random seed.
        n_est (int): Number of trees in the forest.
        min_split (float): Minimum fraction of samples required to split an internal node.
        matrix (scipy.sparse.csr_matrix): Input matrix.

    Returns:
        sklearn.ensemble.RandomForestClassifier: Trained classifier.
    """
    labels = np.squeeze(matrix[:, 1].toarray())
    x = matrix[:, 2:]

    sys.stderr.write("Input matrix size {}\n".format(matrix.shape))
    sys.stderr.write("X matrix size {}\n".format(x.shape))
    sys.stderr.write("Y matrix size {}\n".format(labels.shape))

    clf = RandomForestClassifier(
        n_estimators=n_est, min_samples_split=min_split, n_jobs=2, random_state=seed
    )

    clf.fit(x, labels)

    return clf


def main():
    params = yaml.safe_load(open("params.yaml"))["train"]

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython train.py features model\n")
        sys.exit(1)

    input = sys.argv[1]
    output = sys.argv[2]
    seed = params["seed"]
    n_est = params["n_est"]
    min_split = params["min_split"]

    # Load the data
    with open(os.path.join(input, "train.pkl"), "rb") as fd:
        matrix, _ = pickle.load(fd)

    clf = train(seed=seed, n_est=n_est, min_split=min_split, matrix=matrix)

    # Save the model
    with open(output, "wb") as fd:
        pickle.dump(clf, fd)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------