├── .gitignore
├── Dockerfile
├── README.md
├── api.py
├── cli.py
├── models
    └── classification
    │   └── commands.py
├── requirements.txt
├── utils.py
└── valohai.yaml


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # Data
132 | data/
133 | 
134 | # VScode
135 | .vscode/


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.6
 2 | 
 3 | RUN apt-get update && apt-get install -y \
 4 |   build-essential \
 5 |   git \
 6 |   python-dev \
 7 |   python-numpy \
 8 |   python-scipy
 9 | 
10 | RUN git clone https://github.com/facebookresearch/fastText.git /tmp/fastText && \
11 |   rm -rf /tmp/fastText/.git* && \
12 |   cd /tmp/fastText && \
13 |   make && \
14 |   pip install .
15 | 
16 | COPY requirements.txt .
17 | 
18 | RUN pip install -r requirements.txt
19 | 
20 | WORKDIR /
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Production Machine Learning Pipeline for Text Classification
 2 | 
 3 | This repository contains the supporting code for three articles:
 4 | - [Production Machine Learning Pipeline for Text Classification with fastText](https://blog.valohai.com/production-machine-learning-pipeline-text-classification-fasttext)
 5 | - [Classifying 4M Reddit posts in 4k subreddits: an end-to-end machine learning pipeline](https://blog.valohai.com/machine-learning-pipeline-classifying-reddit-posts)
 6 | - [What did I Learn about CI/CD for Machine Learning](https://valohai.com/blog/cicd-for-machine-learning/)
 7 | 
 8 | ![ml-pipeline](https://valohai.com/blog/machine-learning-pipeline-classifying-reddit-posts/end-to-end-ml-pipeline.jpg)
 9 | 
10 | ## Libraries and code structure
11 | - [fastText](https://fasttext.cc/) is a library for efficient text classification and representation learning.
12 | 
13 | Check the code in [commands.py](https://github.com/arimbr/valohai-fasttext-example/blob/master/models/classification/commands.py) to see how to use fastText's Python bindings.
14 | - [Valohai](https://valohai.com) is a machine learning platform that automates MLOps and record keeping.
15 | 
16 | Check the code in [valohai.yaml](https://github.com/arimbr/valohai-fasttext-example/blob/master/valohai.yaml) to see how to integrate your custom ML code with Valohai.
17 | - [FastAPI](https://fastapi.tiangolo.com/) is a high-performance web framework that is easy to learn, fast to code with, and ready for production.
18 | 
19 | Check the code in [api.py](https://github.com/arimbr/valohai-fasttext-example/blob/master/api.py) to see how to create models and prediction endpoints.
20 | 


--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | import uuid
 4 | from typing import List, Dict
 5 | 
 6 | import fasttext
 7 | from fastapi import FastAPI
 8 | from pydantic import BaseModel
 9 | 
10 | from models.classification.commands import process_text, format_label
11 | 
12 | MODEL_PATH = 'model.bin'
13 | 
14 | logger = logging.getLogger(__name__)
15 | 
16 | 
# Request body of the POST predict endpoint: the text to classify.
class FeaturesModel(BaseModel):
    text: str
19 | 
20 | 
# One predicted label paired with its probability.
class PredictionModel(BaseModel):
    label: str
    probability: float
24 | 
25 | 
# Response body of the POST predict endpoint: a list of PredictionModel items.
class PredictionsModel(BaseModel):
    predictions: List[PredictionModel]
28 | 
29 | 
# Initialize model
# The model is loaded when this module is imported, so the file at MODEL_PATH
# must already exist at that point.
model = fasttext.load_model(MODEL_PATH)

# Initialize API
app = FastAPI()
35 | 
36 | 
# Answers GET requests on the predict path with the constant body 'OK'.
# NOTE(review): presumably used as a liveness check — confirm with the deployment setup.
@app.get(".*/predict")
def hello():
    return 'OK'
40 | 
41 | 
@app.post(".*/predict", response_model=PredictionsModel)
def predict(features: FeaturesModel, k: int = 10, decimals: int = 2):
    # A fresh id per call ties the two log records below together.
    request_id = uuid.uuid4().hex

    # First log record: the incoming features.
    features_record = {'request_id': request_id, 'features': features.dict()}
    logger.info(json.dumps(features_record))

    # Run the text through process_text before handing it to the model,
    # then ask the model for its k best labels.
    cleaned_text = process_text(features.text)
    labels, probas = model.predict(cleaned_text, k=k)

    # Pair every label (passed through format_label) with its
    # probability rounded to `decimals` places.
    predictions = []
    for label, proba in zip(labels, probas):
        predictions.append({
            'label': format_label(label),
            'probability': round(proba, decimals),
        })

    # Second log record: the outgoing predictions.
    predictions_record = {'request_id': request_id, 'predictions': predictions}
    logger.info(json.dumps(predictions_record))

    return {'predictions': predictions}
67 | 


--------------------------------------------------------------------------------
/cli.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | 
 3 | from models.classification.commands import classification
 4 | 
 5 | 
# Root command group; it carries no behavior of its own.
@click.group()
def cli():
    pass


# Register the `classification` group so its commands are reachable as
# `python cli.py classification <command>`.
cli.add_command(classification)

if __name__ == '__main__':
    cli()
15 | 


--------------------------------------------------------------------------------
/models/classification/commands.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import json
  3 | import string
  4 | import os
  5 | import multiprocessing
  6 | import random
  7 | 
  8 | import click
  9 | import pandas as pd
 10 | import fasttext
 11 | 
 12 | from utils import get_input_path, get_output_path
 13 | 
 14 | 
# Column names used for the DataFrames and CSV files built in this module.
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'
# Marker placed in front of the label on every line written by `preprocess`.
LABEL_SEPARATOR = '__label__'
# Prefix of the probability columns produced by get_predictions_df.
PROBABILITY_COLUMN = 'p'
# Seed applied by `split` before shuffling.
RANDOM_SEED = 42
# Verbosity passed to fasttext.train_supervised by `autotune`.
VERBOSE = 3

# get_model_parameters iterates over the KEYS of this dict to decide which
# attributes to read back from a trained model.
# NOTE(review): the values are not read anywhere in the visible code —
# presumably they mirror fastText's defaults; confirm before relying on them.
# NOTE(review): 'loss' is not a key here, so the `param == 'loss'` branch in
# get_model_parameters is never reached — confirm whether that is intended.
train_parameters = {
    'lr': 0.1,
    'dim': 100,
    'ws': 5,
    'epoch': 5,
    'minCount': 1,
    'minCountLabel': 0,
    'minn': 0,
    'maxn': 0,
    'neg': 5,
    'wordNgrams': 1,
    'bucket': 2000000,
    'thread': multiprocessing.cpu_count() - 1,
    'lrUpdateRate': 100,
    't': 1e-4,
    'label': LABEL_SEPARATOR,
    'verbose': 2,
    'pretrainedVectors': '',
    'seed': 0,
}


# Pattern matching LABEL_SEPARATOR; the separator is inserted into the
# pattern verbatim (it is not passed through re.escape).
CLEAN_LABEL_REGEX = re.compile(r'{}'.format(LABEL_SEPARATOR))
 45 | 
 46 | 
 47 | def format_label(label):
 48 |     return re.sub(CLEAN_LABEL_REGEX, '', label)
 49 | 
 50 | 
 51 | def format_labels(labels):
 52 |     return [format_label(label) for label in labels]
 53 | 
 54 | 
 55 | def not_empty_str(x):
 56 |     return isinstance(x, str) and x != ''
 57 | 
 58 | 
 59 | def get_model_parameters(model):
 60 |     args_getter = model.f.getArgs()
 61 | 
 62 |     parameters = {}
 63 |     for param in train_parameters:
 64 |         attr = getattr(args_getter, param)
 65 |         if param == 'loss':
 66 |             attr = attr.name
 67 |         parameters[param] = attr
 68 | 
 69 |     return parameters
 70 | 
 71 | 
 72 | def split_text(text):
 73 |     text, label = text.split(LABEL_SEPARATOR)
 74 |     return text.strip(), label.strip()
 75 | 
 76 | 
 77 | def process_text(text):
 78 |     # Transform multiple spaces and \n to a single space
 79 |     text = re.sub(r'\s{1,}', ' ', text)
 80 |     # Remove punctuation
 81 |     remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
 82 |     text = text.translate(remove_punct_map)
 83 |     # Transform to lowercase
 84 |     text = text.lower()
 85 |     return text
 86 | 
 87 | 
 88 | def get_predictions_df(all_labels, all_probs, k):
 89 |     labels_columns = [f'{LABEL_COLUMN}@{i}' for i in range(1, k+1)]
 90 |     probs_columns = [f'{PROBABILITY_COLUMN}@{i}' for i in range(1, k+1)]
 91 | 
 92 |     return pd.DataFrame((
 93 |         format_labels(labels) + list(probs)
 94 |         for labels, probs in zip(all_labels, all_probs)
 95 |     ), columns=labels_columns + probs_columns)
 96 | 
 97 | 
# Command group that the commands below attach to through
# `@classification.command()`; it has no behavior of its own.
@click.group()
def classification():
    pass
101 | 
102 | 
@classification.command()
@click.option('--input_dir', default='input_dir')
@click.option('--output_file', default='output_file')
def collect_bbc_data(input_dir, output_file):
    # Walk input_dir, read every '.txt' file and write a CSV to output_file
    # with the columns TEXT_COLUMN and LABEL_COLUMN (no index column).
    # The label of a file is the name of the directory that contains it.

    def rows_generator():
        for root, _, files in os.walk(input_dir):
            # os.path.basename handles the platform's separator, unlike a
            # manual split on '/'.
            category = os.path.basename(root)
            for fname in files:
                if fname.endswith('.txt'):
                    # Read as bytes and decode explicitly; the context
                    # manager closes each file as soon as it has been read.
                    with open(os.path.join(root, fname), 'rb') as f:
                        raw = f.read()
                    yield raw.decode('latin-1'), category

    df = pd.DataFrame(rows_generator(), columns=[TEXT_COLUMN, LABEL_COLUMN])

    df.to_csv(output_file, index=False)
119 | 
120 | 
@classification.command()
@click.option('--input_data', default='data')
@click.option('--output_data', default='preprocessed.txt')
@click.option('--text_column', default=TEXT_COLUMN)
@click.option('--label_column', default=LABEL_COLUMN)
@click.option('--engine', default='python')
def preprocess(input_data, output_data, text_column, label_column, engine):
    # TODO: make it work also with prediction data without label
    # Reads a CSV and writes one '<processed text> __label__<label>' line
    # per usable row.
    source_path = get_input_path(input_data)
    target_path = get_output_path(output_data)

    # Missing cells become empty strings so they are filtered out below.
    frame = pd.read_csv(source_path, engine=engine)
    frame = frame.fillna('')

    # A comma in text_column means several columns: join their values with
    # a space into a single column named after the full option value.
    if ',' in text_column:
        column_names = text_column.split(',')
        frame[text_column] = frame[column_names].agg(' '.join, axis=1)

    with open(target_path, 'w') as output:
        for text, label in zip(frame[text_column], frame[label_column]):
            # Rows without a non-empty string text and label are skipped.
            if not (not_empty_str(text) and not_empty_str(label)):
                continue
            output.write(f'{process_text(text)} {LABEL_SEPARATOR}{label}\n')
144 | 
145 | 
@classification.command()
@click.option('--input_data', default='data')
@click.option('--output_train', default='train.txt')
@click.option('--output_validation', default='validation.txt')
@click.option('--output_test', default='test.txt')
@click.option('--train_ratio', default=0.8)
@click.option('--validation_ratio', default=0.1)
@click.option('--test_ratio', default=0.1)
@click.option('--shuffle', is_flag=True)
def split(input_data, output_train, output_validation, output_test,
    train_ratio, validation_ratio, test_ratio, shuffle):
    # Cuts the lines of the input file into consecutive train / validation /
    # test slices according to the three ratios.
    source_path = get_input_path(input_data)
    train_path = get_output_path(output_train)
    validation_path = get_output_path(output_validation)
    test_path = get_output_path(output_test)

    with open(source_path, 'r') as f:
        data = f.read().strip().split('\n')

    # Optional shuffle with a fixed seed, so the order is repeatable.
    if shuffle:
        print('Shuffling data')
        random.seed(RANDOM_SEED)
        random.shuffle(data)

    # Slice boundaries are the rounded cumulative ratios.
    total = len(data)
    train_end = round(total * train_ratio)
    validation_end = round(total * (train_ratio + validation_ratio))
    test_end = round(total * (train_ratio + validation_ratio + test_ratio))

    partitions = (
        (train_path, data[:train_end]),
        (validation_path, data[train_end:validation_end]),
        (test_path, data[validation_end:test_end]),
    )
    for path, lines in partitions:
        with open(path, 'w') as f:
            f.write('\n'.join(lines))
184 | 
185 | 
@classification.command()
@click.option('--input_train', default='train')
@click.option('--input_validation', default='validation')
@click.option('--output_model', default='train_model.bin')
@click.option('--output_parameters', default='parameters.json')
@click.option('--metric', default='f1')
@click.option('--k', default=1)
@click.option('--duration', default=1200)
@click.option('--model_size', default='2000M')
def autotune(input_train, input_validation, output_model, output_parameters,
    metric, k, duration, model_size):
    # Trains with fastText's autotune options, prints the validation
    # metrics, then saves the model's parameters and the model itself.
    train_path = get_input_path(input_train)
    validation_path = get_input_path(input_validation)
    model_path = get_output_path(output_model)
    parameters_path = get_output_path(output_parameters)

    autotune_options = {
        'autotuneValidationFile': validation_path,
        'autotuneMetric': metric,
        'autotuneDuration': duration,
        'autotuneModelSize': model_size,
        'verbose': VERBOSE,
    }
    model = fasttext.train_supervised(input=train_path, **autotune_options)

    # Report the metrics on the validation file as one JSON line.
    n, p, r = model.test(validation_path, k=k)
    metrics = {'n': n, 'precision': p, 'recall': r, 'k': k}
    print(json.dumps(metrics))

    # Write the parameters read back from the model, then the model.
    best_parameters = get_model_parameters(model)
    with open(parameters_path, 'w') as f:
        json.dump(best_parameters, f)

    model.save_model(model_path)
222 | 
223 | 
@classification.command()
@click.option('--input_data', default='data')
@click.option('--input_parameters', default='parameters')
@click.option('--output_model', default='model.bin')
def train(input_data, input_parameters, output_model):
    # Trains a supervised model on input_data using the keyword arguments
    # stored in the JSON parameters file, then saves it.
    data_path = get_input_path(input_data)
    parameters_path = get_input_path(input_parameters)
    model_path = get_output_path(output_model)

    with open(parameters_path) as f:
        training_kwargs = json.load(f)

    # Every key of the JSON object is forwarded as a keyword argument.
    model = fasttext.train_supervised(input=data_path, **training_kwargs)

    model.save_model(model_path)
244 | 
245 | 
@classification.command()
@click.option('--input_test', default='test')
@click.option('--input_model', default='model')
@click.option('--output_predictions', default='test_predictions.csv')
@click.option('--k', default=1)
def test(input_test, input_model, output_predictions, k):
    # Prints the model's metrics on the test file and writes a CSV holding,
    # per test line, the text, the true label, the k predictions and an
    # 'error' flag.
    test_path = get_input_path(input_test)
    model_path = get_input_path(input_model)
    predictions_path = get_output_path(output_predictions)

    model = fasttext.load_model(model_path)

    # Report the metrics as one JSON line.
    n, p, r = model.test(test_path, k=k)
    metrics = {'n': n, 'precision': p, 'recall': r, 'k': k}
    print(json.dumps(metrics))

    # Each line is split into its text and its label.
    with open(test_path) as f:
        rows = [split_text(line) for line in f]
    df = pd.DataFrame(rows, columns=[TEXT_COLUMN, LABEL_COLUMN])

    texts = list(df[TEXT_COLUMN])
    all_labels, all_probs = model.predict(texts, k=k)

    # Append the prediction columns next to the text and label columns.
    df = df.join(get_predictions_df(all_labels, all_probs, k))

    # A row is an error when the first predicted label differs from the
    # true label.
    top_label_column = f'{LABEL_COLUMN}@1'
    df['error'] = (df[LABEL_COLUMN] != df[top_label_column])

    df.to_csv(predictions_path, index=False)
282 | 
283 | 
@classification.command()
@click.option('--input_data', default='data')
@click.option('--input_model', default='model')
@click.option('--output_predictions', default='predictions.csv')
@click.option('--k', default=1)
def predict(input_data, input_model, output_predictions, k):
    # Predicts the k best labels for every line of input_data and writes a
    # CSV with the text followed by the prediction columns.
    input_data_path = get_input_path(input_data)
    input_model_path = get_input_path(input_model)
    output_predictions_path = get_output_path(output_predictions)

    model = fasttext.load_model(input_model_path)

    # Create text DataFrame
    # Iterating a file yields lines that still end with '\n', and fastText's
    # predict rejects any text containing a newline, so the line terminator
    # is removed here.
    with open(input_data_path) as f:
        df = pd.DataFrame(
            [line.rstrip('\n') for line in f],
            columns=[TEXT_COLUMN])

    # Get predictions
    all_labels, all_probs = model.predict(
        list(df[TEXT_COLUMN]), k=k)

    # Add formatted predictions
    predictions_df = get_predictions_df(all_labels, all_probs, k)
    df = df.join(predictions_df)

    # Save predictions
    df.to_csv(output_predictions_path, index=False)
312 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Click==7.0
2 | fastapi==0.49.0
3 | pandas==0.25.3
4 | uvicorn==0.11.3


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def get_first_file(path):
 5 |     filename = os.listdir(path)[0]
 6 |     return os.path.join(path, filename)
 7 | 
 8 | 
 9 | def get_input_path(input_name):
10 |     '''
11 |     Args:
12 |         input_name (str): the input name in the valohai.yaml.
13 |         Locally pass the relative path to the input file.
14 |     '''
15 |     inputs_dir = os.getenv('VH_INPUTS_DIR')
16 |     if inputs_dir:
17 |         input_dir = os.path.join(inputs_dir, input_name)
18 |         return get_first_file(input_dir)
19 |     return input_name
20 | 
21 | 
def get_output_path(output_file):
    '''
    Resolve an output file name to the path it should be written to.

    When the VH_OUTPUTS_DIR environment variable is set to a non-empty
    value, the file name is joined onto it; otherwise ``output_file`` is
    returned unchanged.

    Args:
        output_file (str): the output file name.
        Locally pass the relative path to the output file.
    '''
    outputs_dir = os.environ.get('VH_OUTPUTS_DIR')
    if not outputs_dir:
        return output_file
    return os.path.join(outputs_dir, output_file)
32 | 


--------------------------------------------------------------------------------
/valohai.yaml:
--------------------------------------------------------------------------------
  1 | ---
  2 | 
  3 | - step:
  4 |     name: preprocess
  5 |     image: arimbr/valohai-fasttext-example
  6 |     command: python cli.py classification preprocess {parameters}
  7 |     inputs:
  8 |       - name: data
  9 |         default: s3://valohai-fasttext-example/bbc/data.csv
 10 |     parameters:
 11 |      - name: output_data
 12 |        description: name of the output file
 13 |        type: string
 14 |        default: preprocessed.txt
 15 |      - name: text_column
 16 |        description: name of the text column (pass multiple column names separated with a ,)
 17 |        type: string
 18 |        default: text
 19 |      - name: label_column
 20 |        description: name of label column
 21 |        type: string
 22 |        default: label
 23 |      - name: engine
 24 |        description: CSV parser engine to use (python, python-fwf, c)
 25 |        type: string
 26 |        default: python
 27 | 
 28 | - step:
 29 |     name: split
 30 |     image: arimbr/valohai-fasttext-example
 31 |     command: python cli.py classification split {parameters}
 32 |     inputs:
 33 |       - name: data
 34 |     parameters:
 35 |       - name: output_train
 36 |         description: name of the train output file
 37 |         type: string
 38 |         default: train.txt
 39 |       - name: output_validation
 40 |         description: name of the validation output file
 41 |         type: string
 42 |         default: validation.txt
 43 |       - name: output_test
 44 |         description: name of the test output file
 45 |         type: string
 46 |         default: test.txt
 47 |       - name: train_ratio
 48 |         description: ratio of the train data
 49 |         type: float
 50 |         default: 0.8
 51 |       - name: validation_ratio
 52 |         description: ratio of the validation data
 53 |         type: float
 54 |         default: 0.1
 55 |       - name: test_ratio
 56 |         description: ratio of the test data
 57 |         type: float
 58 |         default: 0.1
 59 |       - name: shuffle
 60 |         description: shuffle data
 61 |         type: flag
 62 |         default: true
 63 | 
 64 | - step:
 65 |     name: autotune
 66 |     image: arimbr/valohai-fasttext-example
 67 |     command: python cli.py classification autotune {parameters}
 68 |     inputs:
 69 |       - name: train
 70 |       - name: validation
 71 |     parameters:
 72 |      - name: metric
 73 |        description: metric objective {f1, f1:labelname}
 74 |        type: string
 75 |        default: f1
 76 |      - name: k
 77 |        description: number of predictions used for evaluation
 78 |        type: integer
 79 |        default: 1
 80 |      - name: duration
 81 |        description: maximum duration in seconds
 82 |        type: integer
 83 |        default: 1200
 84 |      - name: model_size
 85 |        description: constraint model file size (empty = do not quantize)
 86 |        type: string
 87 |        default: 2000M
 88 | 
 89 | - step:
 90 |     name: train
 91 |     image: arimbr/valohai-fasttext-example
 92 |     command: python cli.py classification train {parameters}
 93 |     inputs:
 94 |       - name: data
 95 |       - name: parameters
 96 | 
 97 | - step:
 98 |     name: test
 99 |     image: arimbr/valohai-fasttext-example
100 |     command: python cli.py classification test {parameters}
101 |     inputs:
102 |       - name: test
103 |       - name: model
104 |     parameters:
105 |      - name: k
106 |        description: number of labels to predict
107 |        type: integer
108 |        default: 1
109 | 
110 | - step:
111 |     name: predict
112 |     image: arimbr/valohai-fasttext-example
113 |     command: python cli.py classification predict {parameters}
114 |     inputs:
115 |       - name: data
116 |       - name: model
117 |     parameters:
118 |      - name: k
119 |        description: number of labels to predict
120 |        type: integer
121 |        default: 1
122 | 
123 | - pipeline:
124 |     name: fasttext-train
125 |     nodes:
126 |       - name: split
127 |         type: execution
128 |         step: split
129 |       - name: preprocess
130 |         type: execution
131 |         step: preprocess
132 |       - name: autotune
133 |         type: execution
134 |         step: autotune
135 |       - name: train
136 |         type: execution
137 |         step: train
138 |       - name: test
139 |         type: execution
140 |         step: test
141 |     edges:
142 |       - [preprocess.output.preprocessed.txt, split.input.data]
143 |       - [preprocess.output.preprocessed.txt, train.input.data]
144 |       - [split.output.train.txt, autotune.input.train]
145 |       - [split.output.validation.txt, autotune.input.validation]
146 |       - [split.output.test.txt, test.input.test]
147 |       - [autotune.output.parameters.json, train.input.parameters]
148 |       - [autotune.output.train_model.bin, test.input.model]
149 | 
150 | - endpoint:
151 |     name: predict
152 |     description: Predict labels from text
153 |     image: arimbr/valohai-fasttext-example:deployment
154 |     port: 8000
155 |     server-command: uvicorn api:app --host 0.0.0.0 --port 8000
156 |     files:
157 |       - name: model
158 |         description: Model output file from training step.
159 |         path: model.bin


--------------------------------------------------------------------------------