├── cedar ├── __init__.py ├── utils │ ├── __init__.py │ └── timer.py ├── client │ ├── __init__.py │ ├── constants.py │ └── logger.py ├── compose │ ├── __init__.py │ └── constants.py ├── pipes │ ├── custom │ │ ├── simclrv2_pytorch.py │ │ ├── wikitext103.py │ │ ├── wikitext103_tf_service.py │ │ ├── coco.py │ │ ├── simclrv2.py │ │ └── commonvoice.py │ ├── tf.py │ ├── optimize │ │ ├── __init__.py │ │ ├── registry.py │ │ └── noop.py │ └── __init__.py ├── sources │ └── __init__.py ├── service │ ├── task.py │ ├── __init__.py │ ├── multiprocess.py │ ├── actor.py │ └── multithread.py └── config.py ├── tests ├── __init__.py ├── .gitignore └── data │ ├── test_text.txt │ ├── test_text_2.txt │ ├── .gitignore │ ├── test_tf_string.txt │ ├── example_image.jpeg │ ├── images │ ├── image_1.jpg │ ├── image_2.jpg │ ├── image_3.jpg │ ├── image_4.jpg │ ├── image_5.jpg │ ├── image_6.jpg │ ├── image_7.jpg │ ├── image_8.jpg │ ├── image_9.jpg │ ├── image_10.jpg │ └── image_11.jpg │ ├── t10k-images-idx3-ubyte.gz │ ├── t10k-labels-idx1-ubyte.gz │ ├── test_text_3.txt │ ├── test_profile_stats.yml │ ├── config_tf_string.yml │ ├── test_optimizer_stats.yml │ ├── config_fuse_reorder_tf.yml │ ├── test_full_optimizer_stats.yml │ ├── config_ref.yml │ ├── config_ref_variant.yml │ ├── test_cache_optimizer_stats_expensive_io.yml │ ├── test_cache_optimizer_stats.yml │ ├── config_ref_prefetch.yml │ ├── config_ref_mp.yml │ ├── config_fuse_ray.yml │ └── insert_config_ref.yml ├── evaluation ├── __init__.py ├── .gitignore ├── pipelines │ ├── __init__.py │ ├── coco │ │ ├── __init__.py │ │ ├── download.sh │ │ └── configs │ │ │ ├── ablation_p.yml │ │ │ ├── ablation_pr.yml │ │ │ ├── ablation_baseline.yml │ │ │ ├── cedar_local_plan.yml │ │ │ ├── ablation_tf_p.yml │ │ │ ├── ablation_tf_pr.yml │ │ │ ├── ablation_tf_pro.yml │ │ │ ├── ablation_tf_baseline.yml │ │ │ ├── ablation_pro.yml │ │ │ ├── cedar_tf_local_plan.yml │ │ │ ├── cedar_tf_remote_plan.yml │ │ │ └── cedar_remote_plan.yml │ ├── simclrv2 │ │ ├── __init__.py 
│ │ ├── download.py │ │ ├── configs │ │ │ ├── ablation_tf_p.yaml │ │ │ ├── ablation_tf_p_r.yaml │ │ │ ├── ablation_tf_p_r_o.yaml │ │ │ ├── eval_controller_local.yaml │ │ │ ├── ablation_p.yaml │ │ │ ├── ablation_p_r.yaml │ │ │ ├── ablation_baseline.yaml │ │ │ ├── eval_ember_remote.yaml │ │ │ ├── eval_controller_remote.yaml │ │ │ ├── eval_ember_local.yaml │ │ │ ├── eval_ember_remote_tf.yaml │ │ │ ├── ablation_tf_baseline.yaml │ │ │ ├── ablation_p_r_o.yaml │ │ │ └── eval_ember_local_tf.yaml │ │ ├── cache_results │ │ │ └── configs │ │ │ │ ├── no_cache_plan.yml │ │ │ │ ├── cache_after_grayscale.yml │ │ │ │ ├── new_simclrv2_optimized_plan.yml │ │ │ │ ├── cache_after_list.yml │ │ │ │ └── cache_after_read.yml │ │ ├── torch_dataset.py │ │ ├── ray_dataset.py │ │ └── tf_dataset.py │ ├── wikitext103 │ │ ├── configs │ │ │ ├── ablation_tf_service_p.yaml │ │ │ ├── ablation_tf_service_p_r.yaml │ │ │ ├── ablation_tf_service_baseline.yaml │ │ │ ├── eval_local_tf_service.yaml │ │ │ ├── eval_remote_tf_service.yaml │ │ │ ├── ablation_tf_p.yaml │ │ │ ├── ablation_tf_p_r.yaml │ │ │ ├── ablation_tf_baseline.yaml │ │ │ ├── ablation_tf_service_p_r_o.yaml │ │ │ ├── eval_local_tf.yaml │ │ │ ├── eval_remote_tf.yaml │ │ │ ├── ablation_tf_p_r_o.yaml │ │ │ ├── ablation_p.yaml │ │ │ ├── ablation_p_r.yaml │ │ │ ├── ablation_baseline.yaml │ │ │ ├── eval_remote.yaml │ │ │ ├── eval_local.yaml │ │ │ └── ablation_p_r_o.yaml │ │ ├── cache_results │ │ │ └── configs │ │ │ │ ├── wikitext_no_caching_plan.yml │ │ │ │ ├── wikitext_cache_after_truncate.yml │ │ │ │ ├── new_wikitext_optimal_cache_plan.yml │ │ │ │ ├── wikitext_cache_after_tensor_conversion.yml │ │ │ │ ├── wikitext_cache_after_tokenizer_one_offload.yml │ │ │ │ ├── wikitext_cache_after_truncate_one_offload.yml │ │ │ │ ├── wikitext_cache_after_tokenizer_two_offloads.yml │ │ │ │ └── wikitext_cache_after_truncate_two_offloads.yml │ │ ├── download.py │ │ ├── tf_dataset.py │ │ ├── torch_dataset.py │ │ └── tf_service_dataset.py │ └── commonvoice │ │ ├── 
configs │ │ ├── eval_remote.yaml │ │ ├── eval_local.yaml │ │ ├── ablation_p.yaml │ │ ├── ablation_p_r.yaml │ │ ├── ablation_baseline.yaml │ │ └── ablation_p_r_o.yaml │ │ ├── download.py │ │ ├── torch_dataset.py │ │ └── tf_dataset.py ├── fastflow │ └── examples │ │ ├── .gitignore │ │ ├── default_config.yaml │ │ ├── requirements_common.txt │ │ ├── config.yaml │ │ ├── requirements_cuda10.txt │ │ ├── requirements_cuda11.txt │ │ ├── run_fastflow.sh │ │ ├── test_app.py │ │ ├── nlp_hf_app.py │ │ ├── nlp_app.py │ │ └── simclr_app.py ├── plumber │ ├── coco │ │ ├── .gitignore │ │ ├── run_plumber.sh │ │ ├── dataset_flags.py │ │ └── graph_rewrites.py │ └── simclr │ │ ├── .gitignore │ │ ├── run_plumber.sh │ │ ├── dataset_flags.py │ │ ├── graph_rewrites.py │ │ └── show_bneck.py ├── datasets │ └── .gitignore ├── tf_requirements.txt ├── run_tf_service.sh ├── run_torch.sh ├── run_ray_remote.sh ├── run_ray_local.sh ├── torch_utils.py ├── run_tf.sh ├── tf_utils.py ├── run_autotuning.sh ├── run_tf_service.py ├── plots │ ├── ablation.csv │ ├── aggregate_data.csv │ ├── plot_ablation.py │ └── plot_scaling.py ├── run_caching.sh ├── cedar_utils.py ├── run_cedar_local.sh └── run_cedar_remote.sh ├── setup.py ├── requirements.txt └── LICENSE /cedar/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cedar/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/.gitignore: -------------------------------------------------------------------------------- 1 | *.png -------------------------------------------------------------------------------- /evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | tf_env/ -------------------------------------------------------------------------------- /evaluation/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/test_text.txt: -------------------------------------------------------------------------------- 1 | hello 2 | world 3 | -------------------------------------------------------------------------------- /evaluation/fastflow/examples/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | -------------------------------------------------------------------------------- /tests/data/test_text_2.txt: -------------------------------------------------------------------------------- 1 | hello 2 | world 3 | this 4 | is 5 | ember 6 | speaking 7 | ! 
8 | -------------------------------------------------------------------------------- /tests/data/.gitignore: -------------------------------------------------------------------------------- 1 | results.json 2 | config_output.yml 3 | insert_config_output.yml 4 | *.png -------------------------------------------------------------------------------- /evaluation/plumber/coco/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | logdir/ 3 | *.pb 4 | *.txt 5 | *.pdf 6 | *.dot 7 | *.ps -------------------------------------------------------------------------------- /tests/data/test_tf_string.txt: -------------------------------------------------------------------------------- 1 | HELLO 2 | WORLD 3 | HELLO 4 | WORLD 5 | HELLO 6 | WORLD 7 | HELLO 8 | WORLD -------------------------------------------------------------------------------- /tests/data/example_image.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/example_image.jpeg -------------------------------------------------------------------------------- /tests/data/images/image_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_1.jpg -------------------------------------------------------------------------------- /tests/data/images/image_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_2.jpg -------------------------------------------------------------------------------- /tests/data/images/image_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_3.jpg 
-------------------------------------------------------------------------------- /tests/data/images/image_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_4.jpg -------------------------------------------------------------------------------- /tests/data/images/image_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_5.jpg -------------------------------------------------------------------------------- /tests/data/images/image_6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_6.jpg -------------------------------------------------------------------------------- /tests/data/images/image_7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_7.jpg -------------------------------------------------------------------------------- /tests/data/images/image_8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_8.jpg -------------------------------------------------------------------------------- /tests/data/images/image_9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_9.jpg -------------------------------------------------------------------------------- /evaluation/plumber/simclr/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | logdir/ 3 | *.csv 4 | *.pb 5 | *.txt 6 | *.pdf 7 | *.dot 8 | *.ps 
-------------------------------------------------------------------------------- /tests/data/images/image_10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_10.jpg -------------------------------------------------------------------------------- /tests/data/images/image_11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/images/image_11.jpg -------------------------------------------------------------------------------- /evaluation/datasets/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in datasets dir. 2 | # Use scripts to fetch datasets 3 | * 4 | !.gitignore -------------------------------------------------------------------------------- /tests/data/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /tests/data/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-mast/cedar/HEAD/tests/data/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /evaluation/plumber/coco/run_plumber.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python tf_dataset.py --profile && python graph_rewrites.py --skip_baseline=False -------------------------------------------------------------------------------- /evaluation/plumber/simclr/run_plumber.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python 
tf_dataset.py --profile && python graph_rewrites.py --skip_baseline=False -------------------------------------------------------------------------------- /tests/data/test_text_3.txt: -------------------------------------------------------------------------------- 1 | hello 2 | world 3 | this 4 | is 5 | ember 6 | speaking 7 | ! 8 | 1 9 | 2 10 | 3 11 | 4 12 | 5 13 | -------------------------------------------------------------------------------- /cedar/client/__init__.py: -------------------------------------------------------------------------------- 1 | from cedar.client.dataset import DataSet 2 | 3 | __all__ = ["DataSet"] 4 | 5 | assert __all__ == sorted(__all__) 6 | -------------------------------------------------------------------------------- /evaluation/fastflow/examples/default_config.yaml: -------------------------------------------------------------------------------- 1 | dispatcher_addr: 0.0.0.0 2 | parallel: 0 3 | num_profile_steps: 100 4 | num_initial_steps: 10 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name="cedar", 6 | version="0.0.1", 7 | packages=find_packages(), 8 | ) 9 | -------------------------------------------------------------------------------- /evaluation/fastflow/examples/requirements_common.txt: -------------------------------------------------------------------------------- 1 | # Common 2 | matplotlib==3.5.3 3 | pandas 4 | tensorflow_datasets 5 | tensorflow_io 6 | tensorflow_addons 7 | imgaug 8 | -------------------------------------------------------------------------------- /evaluation/fastflow/examples/config.yaml: -------------------------------------------------------------------------------- 1 | dispatcher_addr: 10.138.0.14 2 | dispatcher_port: 5000 # dispatcher port 3 | num_profile_steps: 100 # number of profiling steps 4 | 
num_initial_steps: 10 # number of initial steps to skip metric profiling -------------------------------------------------------------------------------- /cedar/compose/__init__.py: -------------------------------------------------------------------------------- 1 | from cedar.compose.feature import Feature 2 | from cedar.compose.optimizer import OptimizerOptions, PhysicalPlan 3 | 4 | __all__ = ["Feature", "OptimizerOptions", "PhysicalPlan"] 5 | 6 | assert __all__ == sorted(__all__) 7 | -------------------------------------------------------------------------------- /evaluation/fastflow/examples/requirements_cuda10.txt: -------------------------------------------------------------------------------- 1 | -r requirements_common.txt 2 | 3 | # Dependencies to install DALI for CUDA 10.2 4 | --extra-index-url https://developer.download.nvidia.com/compute/redist 5 | nvidia-dali-cuda102 6 | nvidia-dali-tf-plugin-cuda102 7 | -------------------------------------------------------------------------------- /evaluation/fastflow/examples/requirements_cuda11.txt: -------------------------------------------------------------------------------- 1 | -r requirements_common.txt 2 | 3 | # Dependencies to install DALI for CUDA 11.0 4 | --extra-index-url https://developer.download.nvidia.com/compute/redist 5 | nvidia-dali-cuda110 6 | nvidia-dali-tf-plugin-cuda110 7 | -------------------------------------------------------------------------------- /evaluation/fastflow/examples/run_fastflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #cv-tf 4 | python eval_app_runner.py simclr_app.py simclr ff config.yaml 5 | #nlp-tf 6 | python eval_app_runner.py nlp_app.py nlp ff config.yaml 7 | #ssd-tf 8 | python eval_app_runner.py coco_app.py coco ff config.yaml -------------------------------------------------------------------------------- /evaluation/tf_requirements.txt: 
-------------------------------------------------------------------------------- 1 | tensorflow==2.14.0 2 | tensorflow-addons[tensorflow] 3 | pyyaml 4 | ray[default]==2.7.0 5 | Pympler 6 | 7 | --extra-index-url https://download.pytorch.org/whl/cpu 8 | torch==2.0.1 9 | torchvision==0.15.2 10 | torchaudio==2.0.2 11 | torchdata 12 | -------------------------------------------------------------------------------- /tests/data/test_profile_stats.yml: -------------------------------------------------------------------------------- 1 | baseline: 2 | input_sizes: 3 | 0: 100 4 | 1: 100 5 | 2: 10 6 | 3: 0 7 | latencies: 8 | 0: 1 9 | 1: 1 10 | 2: 1 11 | 3: 1 12 | output_sizes: 13 | 0: 10 14 | 1: 100 15 | 2: 100 16 | 3: 10 17 | throughput: 10 18 | -------------------------------------------------------------------------------- /tests/data/config_tf_string.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 1: '' 4 | pipes: 5 | 0: 6 | name: MapperPipe_string_lower 7 | variant: TF 8 | 1: 9 | name: TFLocalLinePipe 10 | variant: TF 11 | variant_ctx: 12 | num_parallel_calls: -1 13 | fused_pipes: 14 | - 0 -------------------------------------------------------------------------------- /cedar/pipes/custom/simclrv2_pytorch.py: -------------------------------------------------------------------------------- 1 | from torchvision.io import ImageReadMode, read_image 2 | import torch 3 | 4 | IMG_HEIGHT = 244 5 | IMG_WIDTH = 244 6 | GAUSSIAN_BLUR_KERNEL_SIZE = 11 7 | 8 | 9 | def read_image_pytorch(x): 10 | return read_image(x, mode=ImageReadMode.RGB) 11 | 12 | 13 | def to_float(x): 14 | return x.to(torch.float32) 15 | -------------------------------------------------------------------------------- /cedar/pipes/tf.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class TFTensorDontCare: 7 | pass 8 | 9 | 10 | class 
TFOutputHint: 11 | def __init__( 12 | self, 13 | shape: Union[List, TFTensorDontCare], 14 | dtype: Union[tf.dtypes.DType, TFTensorDontCare], 15 | ): 16 | self.shape = shape 17 | self.dtype = dtype 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchvision 3 | torchaudio 4 | torchdata 5 | torchtext 6 | pytest 7 | google-cloud-storage 8 | google-api-python-client 9 | pytest-mock 10 | transformers 11 | pyyaml 12 | h5py 13 | pandas 14 | responses 15 | ray[default]==2.7.0 16 | Pympler 17 | tensorflow==2.14.0 18 | tensorflow-addons[tensorflow] 19 | tensorflow-text==2.14.0 20 | keras-nlp 21 | librosa 22 | pyarrow==13.0.0 23 | numpy==1.26.0 -------------------------------------------------------------------------------- /tests/data/test_optimizer_stats.yml: -------------------------------------------------------------------------------- 1 | baseline: 2 | input_sizes: 3 | 0: 32.0 4 | 1: 32.0 5 | 2: 32.0 6 | 3: 32.0 7 | 4: 0.0 8 | latencies: 9 | 0: 6928.421568627452 10 | 1: 16207.529411764706 11 | 2: 15714.558823529413 12 | 3: 16007.382352941177 13 | 4: 4067.4411764705883 14 | output_sizes: 15 | 0: 32.0 16 | 1: 32.0 17 | 2: 32.0 18 | 3: 32.0 19 | 4: 32.0 20 | throughput: 11503.603334229632 21 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/download.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | mkdir ~/cedar/evaluation/datasets/coco 4 | 5 | wget http://images.cocodataset.org/zips/val2017.zip 6 | unzip val2017.zip 7 | mv val2017/ ~/cedar/evaluation/datasets/coco 8 | 9 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip 10 | 11 | unzip annotations_trainval2017.zip 12 | mv annotations/ ~/cedar/evaluation/datasets/coco 13 | 14 | rm val2017.zip 15 | rm annotations_trainval2017.zip 16 | -------------------------------------------------------------------------------- /evaluation/run_tf_service.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #cv-tf 4 | python eval_tf.py --dataset_file pipelines/simclrv2/tf_dataset.py --service_addr 10.138.0.90:38655 --num_parallel_calls -1 5 | #nlp-tf 6 | python eval_tf.py --dataset_file pipelines/wikitext103/tf_service_dataset.py --service_addr 10.138.0.90:38655 --num_parallel_calls -1 --num_total_samples 200000 7 | #ssd-tf 8 | python eval_tf.py --dataset_file pipelines/coco/tf_service_dataset.py --service_addr 10.138.0.90:38655 --num_parallel_calls -1 -------------------------------------------------------------------------------- /evaluation/run_torch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #cv-torch 4 | python eval_torch.py --dataset_file pipelines/simclrv2/torch_dataset.py --num_workers 8 5 | #nlp-torch 6 | python eval_torch.py --dataset_file pipelines/wikitext103/torch_dataset.py --num_workers 8 --num_total_samples 100000 7 | #asr 8 | python eval_torch.py --dataset_file pipelines/commonvoice/torch_dataset.py --num_workers 8 --num_total_samples 10000 9 | # ssd 10 | python eval_torch.py --dataset_file pipelines/coco/torch_dataset.py --num_workers 8 -------------------------------------------------------------------------------- /evaluation/plumber/coco/dataset_flags.py: -------------------------------------------------------------------------------- 1 | 
from absl import app 2 | from absl import flags 3 | 4 | FLAGS = flags.FLAGS 5 | 6 | flags.DEFINE_integer( 7 | 'benchmark_num_elements', default=1000, 8 | help=('The number of elements to use for the benchmark')) 9 | 10 | flags.DEFINE_integer( 11 | 'dataset_threadpool_size', default=8, 12 | help=('The size of the private datapool size in dataset.')) 13 | 14 | flags.DEFINE_bool( 15 | 'map_and_batch_fusion', default=True, 16 | help=('tf.data options')) 17 | -------------------------------------------------------------------------------- /evaluation/plumber/simclr/dataset_flags.py: -------------------------------------------------------------------------------- 1 | from absl import app 2 | from absl import flags 3 | 4 | FLAGS = flags.FLAGS 5 | 6 | flags.DEFINE_integer( 7 | 'benchmark_num_elements', default=1000, 8 | help=('The number of elements to use for the benchmark')) 9 | 10 | flags.DEFINE_integer( 11 | 'dataset_threadpool_size', default=8, 12 | help=('The size of the private datapool size in dataset.')) 13 | 14 | flags.DEFINE_bool( 15 | 'map_and_batch_fusion', default=True, 16 | help=('tf.data options')) 17 | -------------------------------------------------------------------------------- /evaluation/run_ray_remote.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #cv-torch 3 | python pipelines/simclrv2/ray_dataset.py 4 | #cv-tf 5 | python pipelines/simclrv2/ray_tf_dataset.py 6 | #nlp-torch 7 | python pipelines/wikitext103/ray_dataset.py 8 | #nlp-hf-tf 9 | python pipelines/wikitext103/ray_tf_dataset.py 10 | #nlp-tf 11 | python pipelines/wikitext103/ray_tf_service_dataset.py 12 | #asr 13 | python pipelines/commonvoice/ray_dataset.py 14 | #ssd 15 | python pipelines/coco/ray_dataset.py 16 | #ssd-tf 17 | python pipelines/coco/ray_tf_dataset.py -------------------------------------------------------------------------------- /evaluation/run_ray_local.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #cv-torch 4 | python pipelines/simclrv2/ray_dataset.py 5 | #cv-tf 6 | python pipelines/simclrv2/ray_tf_dataset.py 7 | #nlp-torch 8 | python pipelines/wikitext103/ray_dataset.py 9 | #nlp-hf-tf 10 | python pipelines/wikitext103/ray_tf_dataset.py 11 | #nlp-tf 12 | python pipelines/wikitext103/ray_tf_service_dataset.py 13 | #asr 14 | python pipelines/commonvoice/ray_dataset.py 15 | # ssd-torch 16 | python pipelines/coco/ray_dataset.py 17 | # ssd-tf 18 | python pipelines/coco/ray_tf_dataset.py -------------------------------------------------------------------------------- /cedar/sources/__init__.py: -------------------------------------------------------------------------------- 1 | from cedar.sources.iterable import IterSource 2 | from cedar.sources.local import LocalFSSource, LocalLineSource 3 | from cedar.sources.source import Source 4 | from cedar.sources.tf_sources import TFLocalLineSource 5 | from cedar.sources.coco import COCOSource, COCOFileSource 6 | 7 | __all__ = [ 8 | "COCOFileSource", 9 | "COCOSource", 10 | "IterSource", 11 | "LocalFSSource", 12 | "LocalLineSource", 13 | "Source", 14 | "TFLocalLineSource", 15 | ] 16 | 17 | assert __all__ == sorted(__all__) 18 | -------------------------------------------------------------------------------- /cedar/pipes/optimize/__init__.py: -------------------------------------------------------------------------------- 1 | from cedar.pipes.optimize.fuse import FusedOptimizerPipe 2 | from cedar.pipes.optimize.noop import NoopOptimizerPipe 3 | from cedar.pipes.optimize.io import ObjectDiskCachePipe 4 | from cedar.pipes.optimize.registry import OptimizerPipeRegistry 5 | from cedar.pipes.optimize.prefetch import PrefetcherPipe 6 | 7 | __all__ = [ 8 | "FusedOptimizerPipe", 9 | "NoopOptimizerPipe", 10 | "ObjectDiskCachePipe", 11 | "OptimizerPipeRegistry", 12 | "PrefetcherPipe", 13 | ] 14 | 15 | assert __all__ == 
sorted(__all__) 16 | -------------------------------------------------------------------------------- /evaluation/plumber/coco/graph_rewrites.py: -------------------------------------------------------------------------------- 1 | from absl import app 2 | from absl import flags 3 | 4 | import tensorflow as tf 5 | 6 | import pandas as pd 7 | try: 8 | import dataloader 9 | except ImportError: 10 | try: 11 | import resnet_flags 12 | except ImportError: 13 | import dataset_flags 14 | from plumber_analysis import graph_rewrites 15 | 16 | 17 | FLAGS = flags.FLAGS 18 | graph_rewrites.apply_default_flags() 19 | 20 | def main(_): 21 | graph_rewrites.default_main(_) 22 | 23 | if __name__ == '__main__': 24 | app.run(main) -------------------------------------------------------------------------------- /evaluation/plumber/simclr/graph_rewrites.py: -------------------------------------------------------------------------------- 1 | from absl import app 2 | from absl import flags 3 | 4 | import tensorflow as tf 5 | 6 | import pandas as pd 7 | try: 8 | import dataloader 9 | except ImportError: 10 | try: 11 | import resnet_flags 12 | except ImportError: 13 | import dataset_flags 14 | from plumber_analysis import graph_rewrites 15 | 16 | 17 | FLAGS = flags.FLAGS 18 | graph_rewrites.apply_default_flags() 19 | 20 | def main(_): 21 | graph_rewrites.default_main(_) 22 | 23 | if __name__ == '__main__': 24 | app.run(main) -------------------------------------------------------------------------------- /evaluation/torch_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | 4 | class TorchEvalSpec: 5 | def __init__( 6 | self, 7 | batch_size: int, 8 | num_workers: int, 9 | num_epochs: int = 1, 10 | num_total_samples: Optional[int] = None, 11 | iteration_time: Optional[float] = None, 12 | ): 13 | self.batch_size = batch_size 14 | self.num_workers = num_workers 15 | self.num_total_samples = num_total_samples 16 | 
self.num_epochs = num_epochs 17 | self.iteration_time = iteration_time 18 | -------------------------------------------------------------------------------- /cedar/utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class Timer: 5 | def __init__(self): 6 | self._start = None 7 | self._end = None 8 | 9 | def __enter__(self): 10 | self._start = time.perf_counter() 11 | 12 | def __exit__(self, exc_type, exc_val, exc_tb): 13 | self._end = time.perf_counter() 14 | 15 | def reset(self): 16 | self._start = time.perf_counter() 17 | 18 | def delta(self): 19 | if self._start is None or self._end is None: 20 | raise RuntimeError() 21 | return self._end - self._start 22 | -------------------------------------------------------------------------------- /cedar/service/task.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Any 3 | 4 | 5 | class Task(abc.ABC): 6 | """ 7 | A Task represents a discrete unit of processing, meant to be offloaded 8 | to an executor. 
9 | """ 10 | 11 | @abc.abstractmethod 12 | def process(self) -> Any: 13 | pass 14 | 15 | 16 | class MultiprocessTask(Task): 17 | def __init__(self, input_data: Any) -> None: 18 | self.input_data = input_data 19 | 20 | 21 | class MultithreadedTask(Task): 22 | def __init__(self, input_data: Any) -> None: 23 | self.input_data = input_data 24 | -------------------------------------------------------------------------------- /cedar/compose/constants.py: -------------------------------------------------------------------------------- 1 | LOCAL_PARALLELISM_SCALING_FACTOR = 0.8 2 | OFFLOAD_THRESHOLD_FRAC = 0.05 3 | FUSED_PIPE_NAME = "FusedPipe" 4 | 5 | RAY_SUBMIT_BATCH_SIZE = 30 6 | RAY_AVAILABLE_PARALLELISM = 32 7 | # RAY_SUBMIT_BATCH_SCALING_FACTOR = 2000 8 | RAY_SUBMIT_BATCH_SCALING_FACTOR = 2000000 9 | 10 | # Threshold at which we forbid local workers due to serialization bottlenecks 11 | LOCAL_PARALLELISM_THRESHOLD = 100000000 12 | 13 | SMP_AVAILABLE_PARALLELISM = 8 14 | 15 | 16 | # Threshold for samples/s at which local parallelism is forbidden 17 | LOCAL_PARALLELISM_SAMPLES_PER_SEC_THRESHOLD = 100 18 | -------------------------------------------------------------------------------- /tests/data/config_fuse_reorder_tf.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 4: '1' 4 | 1: '2' 5 | 2: '5' 6 | 5: '' 7 | pipes: 8 | 0: 9 | name: MapperPipe__add_one 10 | variant: TF 11 | 1: 12 | name: MapperPipe__cast 13 | variant: TF 14 | 2: 15 | name: MapperPipe__fill_tensor 16 | variant: TF 17 | 3: 18 | name: MapperPipe__add_one 19 | variant: TF 20 | 4: 21 | name: IterSourcePipe 22 | variant: INPROCESS 23 | 5: 24 | name: FusedPipe 25 | variant: TF 26 | fused_pipes: 27 | - 3 28 | - 0 -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_tf_service_p.yaml: -------------------------------------------------------------------------------- 
1 | physical_plan: 2 | graph: 3 | 0: '4' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '' 8 | n_local_workers: 1 9 | pipes: 10 | 0: 11 | name: MapperPipe__embedding 12 | variant: TF 13 | 1: 14 | name: MapperPipe__truncate 15 | variant: TF 16 | 2: 17 | name: MapperPipe__tokenize 18 | variant: TF 19 | 3: 20 | name: TFLocalLinePipe 21 | variant: TF 22 | 4: 23 | name: PrefetcherPipe 24 | variant: INPROCESS 25 | variant_ctx: 26 | variant_type: INPROCESS 27 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_tf_service_p_r.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '4' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '' 8 | n_local_workers: 1 9 | pipes: 10 | 0: 11 | name: MapperPipe__embedding 12 | variant: TF 13 | 1: 14 | name: MapperPipe__truncate 15 | variant: TF 16 | 2: 17 | name: MapperPipe__tokenize 18 | variant: TF 19 | 3: 20 | name: TFLocalLinePipe 21 | variant: TF 22 | 4: 23 | name: PrefetcherPipe 24 | variant: INPROCESS 25 | variant_ctx: 26 | variant_type: INPROCESS 27 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_tf_service_baseline.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '4' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '' 8 | n_local_workers: 1 9 | pipes: 10 | 0: 11 | name: MapperPipe__embedding 12 | variant: TF 13 | 1: 14 | name: MapperPipe__truncate 15 | variant: TF 16 | 2: 17 | name: MapperPipe__tokenize 18 | variant: TF 19 | 3: 20 | name: TFLocalLinePipe 21 | variant: TF 22 | 4: 23 | name: PrefetcherPipe 24 | variant: INPROCESS 25 | variant_ctx: 26 | variant_type: INPROCESS 27 | -------------------------------------------------------------------------------- /cedar/pipes/custom/wikitext103.py: 
@tf.py_function(Tout=tf.int32)
def _tokenize(x):
    """Tokenize the text held in tensor ``x`` and return its input ids.

    Runs eagerly through ``tf.py_function`` so that the module-level
    ``tokenizer`` can be called on a Python string.

    Args:
        x: Eager tensor whose ``numpy()`` value is the text to tokenize.

    Returns:
        The ``"input_ids"`` entry of the tokenizer output, requested as
        TensorFlow tensors (``return_tensors="tf"``).
    """
    raw = x.numpy()
    if isinstance(raw, bytes):
        # str(bytes) would produce the repr "b'...'" (prefix, quotes and
        # escape sequences included), so decode to get the actual text.
        # errors="replace" keeps this best-effort, as str() never raised.
        sample = raw.decode("utf-8", errors="replace")
    else:
        sample = str(raw)
    return tokenizer(sample, return_tensors="tf")["input_ids"]


def _embedding(x):
    """Look up the rows of the module-level ``embedding`` indexed by ``x``."""
    return tf.nn.embedding_lookup(embedding, x)


def _truncate(x):
    """Keep at most the first 254 entries along axis 1 of row 0 of ``x``.

    ``x`` is sliced with a two-element begin/size, i.e. as a rank-2 tensor.
    """
    dim = tf.shape(x)[1]
    slice_size = tf.minimum(dim, 254)
    x = tf.slice(x, [0, 0], [1, slice_size])
    return x
RAY_PROFILE_N_ACTORS = 8
RAY_PROFILE_INFLIGHT = 100
RAY_PROFILE_PREFETCH = 100
RAY_PROFILE_SUBMIT_BATCH_SIZE = 10
CONTROLLER_PERIOD_SEC = 3
# Derived from the controller period (ten periods' worth).
MAX_HISTORY = CONTROLLER_PERIOD_SEC * 10
THROUGHPUT_LOG_TIME_SEC = 1
SCALE_ATTEMPTS = 3
THROUGHPUT_THRESHOLD = 1.01
EMPTY_BUFFER_THRESHOLD = 500  # set around half of max buffer size
AVAILABLE_RAY_SCALE = 32
SMP_PROFILE_N_PROCS = 8
SMP_TASKSET_MASK = 0xFF  # should match the taskset cpu mask of smp n_procs
SMP_PROFILE_INFLIGHT = 100
SMP_PROFILE_PREFETCH = 100
CONTROLLER_SCALE_DOWN_COUNTER = 10
'0' 5 | 2: '1' 6 | 3: '2' 7 | pipes: 8 | 0: 9 | name: NoopPipe 10 | 1: 11 | name: NoopPipe 12 | 2: 13 | name: NoopPipe 14 | 3: 15 | name: IterSourcePipe 16 | physical_plan: 17 | graph: 18 | 0: '' 19 | 1: '0' 20 | 2: '1' 21 | 3: '2' 22 | pipes: 23 | 0: 24 | name: NoopPipe 25 | variant: INPROCESS 26 | 1: 27 | name: NoopPipe 28 | variant: SMP 29 | variant_ctx: 30 | n_procs: 10 31 | 2: 32 | name: NoopPipe 33 | variant: INPROCESS 34 | 3: 35 | name: IterSourcePipe 36 | variant: INPROCESS 37 | -------------------------------------------------------------------------------- /tests/data/test_cache_optimizer_stats_expensive_io.yml: -------------------------------------------------------------------------------- 1 | baseline: 2 | input_sizes: 3 | 0: 1 4 | 1: 100 5 | 2: 100 6 | 3: 100 7 | 4: 100 8 | 5: 1 9 | 6: 1 10 | latencies: 11 | 0: 1 12 | 1: 1 13 | 2: 1 14 | 3: 1 15 | 4: 1 16 | 5: 1 17 | 6: 1 18 | output_sizes: 19 | 0: 1 20 | 1: 1 21 | 2: 100 22 | 3: 100 23 | 4: 100 24 | 5: 100 25 | 6: 1 26 | throughput: 100 27 | offloads: 28 | RAY: 29 | 1: 30 | throughput: 100 31 | 2: 32 | throughput: 1 33 | 3: 34 | throughput: 1 35 | 4: 36 | throughput: 100 37 | 5: 38 | throughput: 150 39 | disk_info: 40 | read_latency: 100 41 | write_latency: 100 42 | -------------------------------------------------------------------------------- /tests/data/test_cache_optimizer_stats.yml: -------------------------------------------------------------------------------- 1 | baseline: 2 | input_sizes: 3 | 0: 1 4 | 1: 100 5 | 2: 100 6 | 3: 100 7 | 4: 100 8 | 5: 1 9 | 6: 1 10 | latencies: 11 | 0: 1 12 | 1: 1 13 | 2: 1 14 | 3: 1 15 | 4: 1 16 | 5: 1 17 | 6: 1 18 | output_sizes: 19 | 0: 1 20 | 1: 1 21 | 2: 100 22 | 3: 100 23 | 4: 100 24 | 5: 100 25 | 6: 1 26 | throughput: 100 27 | offloads: 28 | RAY: 29 | 1: 30 | throughput: 100 31 | 2: 32 | throughput: 1 33 | 3: 34 | throughput: 1 35 | 4: 36 | throughput: 100 37 | 5: 38 | throughput: 150 39 | disk_info: 40 | read_latency: 1.6408648662036286e-09 41 | 
write_latency: 2.5914232537616046e-09 42 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/eval_local_tf_service.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 3: '4' 4 | 4: '' 5 | n_local_workers: 1 6 | pipes: 7 | 0: 8 | name: MapperPipe__embedding 9 | variant: INPROCESS 10 | 1: 11 | name: MapperPipe__truncate 12 | variant: INPROCESS 13 | 2: 14 | name: MapperPipe__tokenize 15 | variant: INPROCESS 16 | 3: 17 | fused_pipes: 18 | - 2 19 | - 1 20 | - 0 21 | name: TFLocalLinePipe 22 | variant: TF 23 | variant_ctx: 24 | num_parallel_calls: -1 25 | variant_type: TF 26 | 4: 27 | name: PrefetcherPipe 28 | variant: INPROCESS 29 | variant_ctx: 30 | variant_type: INPROCESS 31 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/eval_remote_tf_service.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 3: '4' 4 | 4: '' 5 | n_local_workers: 1 6 | pipes: 7 | 0: 8 | name: MapperPipe__embedding 9 | variant: INPROCESS 10 | 1: 11 | name: MapperPipe__truncate 12 | variant: INPROCESS 13 | 2: 14 | name: MapperPipe__tokenize 15 | variant: INPROCESS 16 | 3: 17 | fused_pipes: 18 | - 2 19 | - 1 20 | - 0 21 | name: TFLocalLinePipe 22 | variant: TF 23 | variant_ctx: 24 | num_parallel_calls: -1 25 | variant_type: TF 26 | 4: 27 | name: PrefetcherPipe 28 | variant: INPROCESS 29 | variant_ctx: 30 | variant_type: INPROCESS 31 | -------------------------------------------------------------------------------- /tests/data/config_ref_prefetch.yml: -------------------------------------------------------------------------------- 1 | logical_plan: 2 | graph: 3 | 0: '' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | pipes: 8 | 0: 9 | name: NoopPipe 10 | 1: 11 | name: NoopPipe 12 | 2: 13 | name: NoopPipe 14 | 3: 15 | name: 
# Load the recorded stats file and build the performance model from it.
filename = "stats.pb"
model = tf.data.experimental.analysis.PlumberPerformanceModel(
    filename
).model()

# Report the node that the model's recommendation names as the bottleneck.
bottleneck = model.recommendation().bottleneck_node()
print("Slowest node: {}".format(bottleneck.name))

# Query the resource figures in the original order, then print one summary.
cpu_seconds = model.total_CPU_time()
wallclock_seconds = model.total_wallclock_time()
cpu_util = model.CPU_Util()
disk_util = model.Disk_Util()
utilization_template = (
    "Resource utilization: CPU Util {} ({}s CPU time,{}s wallclock time),"
    " Disk Util {}"
)
print(
    utilization_template.format(
        cpu_util, cpu_seconds, wallclock_seconds, disk_util
    )
)
"RayActor", 19 | "RayService", 20 | "SMPActor", 21 | "SMPRequest", 22 | "SMPResponse", 23 | "SMPService", 24 | "Task", 25 | ] 26 | 27 | assert __all__ == sorted(__all__) 28 | -------------------------------------------------------------------------------- /evaluation/tf_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | 4 | class TFEvalSpec: 5 | def __init__( 6 | self, 7 | batch_size: int, 8 | num_parallel_calls: Optional[int], 9 | num_epochs: int = 1, 10 | num_total_samples: Optional[int] = None, 11 | iteration_time: Optional[float] = None, 12 | service_addr: Optional[str] = None, 13 | read_from_remote: bool = False, 14 | ): 15 | self.batch_size = batch_size 16 | self.num_parallel_calls = num_parallel_calls 17 | self.num_total_samples = num_total_samples 18 | self.num_epochs = num_epochs 19 | self.iteration_time = iteration_time 20 | self.service_addr = service_addr 21 | self.read_from_remote = read_from_remote 22 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_tf_p.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '5' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: MapperPipe__embedding 13 | variant: INPROCESS 14 | 1: 15 | name: MapperPipe__truncate 16 | variant: INPROCESS 17 | 2: 18 | name: MapperPipe__tokenize 19 | variant: INPROCESS 20 | 3: 21 | name: MapperPipe_convert_to_tensor_v2_with_dispatch 22 | variant: INPROCESS 23 | 4: 24 | name: LocalLinePipe 25 | variant: INPROCESS 26 | variant_ctx: 27 | variant_type: INPROCESS 28 | 5: 29 | name: PrefetcherPipe 30 | variant: INPROCESS 31 | variant_ctx: 32 | variant_type: INPROCESS 33 | -------------------------------------------------------------------------------- 
/evaluation/pipelines/wikitext103/configs/ablation_tf_p_r.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '5' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: MapperPipe__embedding 13 | variant: INPROCESS 14 | 1: 15 | name: MapperPipe__truncate 16 | variant: INPROCESS 17 | 2: 18 | name: MapperPipe__tokenize 19 | variant: INPROCESS 20 | 3: 21 | name: MapperPipe_convert_to_tensor_v2_with_dispatch 22 | variant: INPROCESS 23 | 4: 24 | name: LocalLinePipe 25 | variant: INPROCESS 26 | variant_ctx: 27 | variant_type: INPROCESS 28 | 5: 29 | name: PrefetcherPipe 30 | variant: INPROCESS 31 | variant_ctx: 32 | variant_type: INPROCESS 33 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_tf_baseline.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '5' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: MapperPipe__embedding 13 | variant: INPROCESS 14 | 1: 15 | name: MapperPipe__truncate 16 | variant: INPROCESS 17 | 2: 18 | name: MapperPipe__tokenize 19 | variant: INPROCESS 20 | 3: 21 | name: MapperPipe_convert_to_tensor_v2_with_dispatch 22 | variant: INPROCESS 23 | 4: 24 | name: LocalLinePipe 25 | variant: INPROCESS 26 | variant_ctx: 27 | variant_type: INPROCESS 28 | 5: 29 | name: PrefetcherPipe 30 | variant: INPROCESS 31 | variant_ctx: 32 | variant_type: INPROCESS 33 | -------------------------------------------------------------------------------- /tests/data/config_ref_mp.yml: -------------------------------------------------------------------------------- 1 | logical_plan: 2 | graph: 3 | 0: '' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | pipes: 9 | 0: 10 | name: BatcherPipe(batch_size=3) 11 | 1: 12 | name: NoopPipe 13 | 2: 14 | 
# from https://github.com/cirquit/presto/blob/master/openwebtext_pipeline_modern.py # noqa: E501
# vocabulary size 50001, GPT2 originally used 50257
vocabulary_size = 50001
bpe_model_path = tf.keras.utils.get_file(
    "bpe_en_50k.model",
    "https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs50000.model",
)
# Read the serialized model inside a context manager so the file handle is
# closed as soon as the bytes are in memory, not whenever it is collected.
with open(bpe_model_path, "rb") as bpe_model_file:
    bpe_model = bpe_model_file.read()

embedding_dimension = 768
bpe_tokernizer = text.SentencepieceTokenizer(
    model=bpe_model, out_type=tf.dtypes.int32
)

embedding = tf.Variable(
    tf.random.uniform([vocabulary_size, embedding_dimension], -1.0, 1.0)
)


def _truncate(x):
    """Keep at most the first 254 entries of ``x`` along axis 0.

    ``x`` is sliced with a one-element begin/size, i.e. as a rank-1 tensor.
    """
    dim = tf.shape(x)[0]
    slice_size = tf.minimum(dim, 254)
    x = tf.slice(x, [0], [slice_size])
    return x


def _embedding(x):
    """Look up the rows of the module-level ``embedding`` indexed by ``x``."""
    return tf.nn.embedding_lookup(embedding, x)


def _tokenize(x):
    """Tokenize ``x`` with the module-level SentencePiece tokenizer."""
    return bpe_tokernizer.tokenize(x)
def to_float(x):
    """Return ``x`` converted to ``torch.float32`` through ``x.to``."""
    target_dtype = torch.float32
    return x.to(target_dtype)


def to_tensor(x):
    """Replace ``x["image"]`` with its ``v2.ToTensor`` output; return ``x``."""
    transform = v2.ToTensor()
    x["image"] = transform(x["image"])
    return x


def distort(x):
    """Apply ``v2.RandomPhotometricDistort(p=1)`` to ``x["image"]`` in place."""
    transform = v2.RandomPhotometricDistort(p=1)
    x["image"] = transform(x["image"])
    return x


def zoom_out(x):
    """Run ``v2.RandomZoomOut`` (p=1) jointly on the image and its boxes.

    Both ``x["image"]`` and ``x["boxes"]`` are overwritten with the
    transform's outputs, and the same mapping ``x`` is returned.
    """
    transform = v2.RandomZoomOut(fill=[123.0, 117.0, 104.0], p=1)
    image, boxes = transform(x["image"], x["boxes"])
    x["image"] = image
    x["boxes"] = boxes
    return x


def crop(x):
    """Run ``v2.RandomIoUCrop`` jointly on ``x["image"]`` and ``x["boxes"]``."""
    transform = v2.RandomIoUCrop()
    image, boxes = transform(x["image"], x["boxes"])
    x["image"] = image
    x["boxes"] = boxes
    return x


def read_image(x):
    """Open ``x["image"]`` as an RGB image and wrap ``x["boxes"]``.

    ``x["image"]`` is replaced by the opened image converted to RGB, and
    ``x["boxes"]`` by a ``BoundingBox`` in XYXY format whose spatial size is
    the image's (height, width). The same mapping ``x`` is returned.
    """
    rgb = Image.open(x["image"]).convert("RGB")
    x["image"] = rgb
    x["boxes"] = BoundingBox(
        x["boxes"],
        format=BoundingBoxFormat.XYXY,
        spatial_size=(rgb.height, rgb.width),
    )

    return x
/tests/data/insert_config_ref.yml: -------------------------------------------------------------------------------- 1 | logical_plan: 2 | graph: 3 | 0: '' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | pipes: 8 | 0: 9 | name: NoopPipe 10 | 1: 11 | name: NoopPipe 12 | 2: 13 | name: NoopPipe 14 | 3: 15 | name: IterSourcePipe 16 | physical_plan: 17 | graph: 18 | 0: '' 19 | 1: '0' 20 | 2: '4' 21 | 3: '2' 22 | 4: '1' 23 | n_local_workers: 1 24 | pipes: 25 | 0: 26 | name: NoopPipe 27 | variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 1: 31 | name: NoopPipe 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 2: 36 | name: NoopPipe 37 | variant: INPROCESS 38 | variant_ctx: 39 | variant_type: INPROCESS 40 | 3: 41 | name: IterSourcePipe 42 | variant: INPROCESS 43 | variant_ctx: 44 | variant_type: INPROCESS 45 | 4: 46 | name: NoopOptimizerPipe 47 | variant: INPROCESS 48 | variant_ctx: 49 | variant_type: INPROCESS 50 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_tf_service_p_r_o.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '4' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '' 8 | n_local_workers: 1 9 | pipes: 10 | 0: 11 | name: MapperPipe__embedding 12 | variant: TF 13 | variant_ctx: 14 | num_parallel_calls: null 15 | variant_type: TF 16 | 1: 17 | name: MapperPipe__truncate 18 | variant: TF 19 | variant_ctx: 20 | num_parallel_calls: null 21 | variant_type: TF 22 | 2: 23 | name: MapperPipe__tokenize 24 | variant: TF_RAY 25 | variant_ctx: 26 | max_inflight: 48000 27 | max_prefetch: 48000 28 | n_actors: 32 29 | num_parallel_calls: null 30 | submit_batch_size: 500 31 | use_threads: true 32 | variant_type: TF_RAY 33 | 3: 34 | name: TFLocalLinePipe 35 | variant: TF 36 | variant_ctx: 37 | num_parallel_calls: null 38 | variant_type: TF 39 | 4: 40 | name: PrefetcherPipe 41 | variant: INPROCESS 
logger = logging.getLogger(__name__)


class OptimizerPipeRegistry:
    """Name-keyed registry of optimizer pipe classes.

    The mapping is a class attribute, so all callers share one registry.
    """

    _registered_pipes = {}

    @classmethod
    def register_pipe(cls, name: str, pipe_cls: Type):
        """Store ``pipe_cls`` under ``name``.

        Registering a name that already exists logs a warning and replaces
        the previous entry.
        """
        if name in cls._registered_pipes:
            logger.warning(
                f"Optimizer Pipe {name} already registered. Overwriting."
            )
        logger.info(f"Registering Optimizer Pipe {name}.")
        cls._registered_pipes[name] = pipe_cls

    @classmethod
    def get_pipe(cls, name: str):
        """Return the class registered under ``name``.

        Raises:
            ValueError: If nothing is registered under ``name``.
        """
        try:
            return cls._registered_pipes[name]
        except KeyError:
            # Callers expect ValueError; hide the internal KeyError context.
            raise ValueError(f"Pipe {name} not registered.") from None


def register_optimizer_pipe(name: str) -> Callable[[Type], Type]:
    """
    Decorator to register an optimizer pipe.

    Args:
        name: Key under which the decorated class is registered.

    Returns:
        A decorator that registers the class and returns it unchanged.
    """

    def decorator(optimizer_cls: Type) -> Type:
        OptimizerPipeRegistry.register_pipe(name, optimizer_cls)
        return optimizer_cls

    return decorator
class MultiprocessService:
    """
    A multiprocess service that executes preprocessing tasks
    using a pool of workers.

    Args:
        num_workers: Number of workers in the pool

    Raises:
        ValueError: If ``num_workers`` is less than 1.
    """

    def __init__(self, num_workers: int):
        if num_workers < 1:
            raise ValueError(
                "Cannot create a MultiprocessService with {} workers".format(
                    num_workers
                )
            )
        self.executor = ProcessPoolExecutor(max_workers=num_workers)
        logger.info(
            f"Started Multiprocess Service with {num_workers} workers."
        )

    def shutdown(self):
        """Shut down the worker pool, if one was created."""
        # __del__ also runs on instances whose __init__ raised before the
        # executor existed; without this guard that path would raise
        # AttributeError during finalization.
        executor = getattr(self, "executor", None)
        if executor is not None:
            executor.shutdown()

    def submit(self, task: "MultiprocessTask") -> Future:
        """Schedule ``task.process`` on the pool and return its future."""
        return self.executor.submit(task.process)

    def __del__(self):
        self.shutdown()
-------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '5' 4 | 4: '6' 5 | 5: '' 6 | 6: '0' 7 | n_local_workers: 8 8 | pipes: 9 | 0: 10 | name: MapperPipe__embedding 11 | variant: TF 12 | variant_ctx: 13 | num_parallel_calls: null 14 | variant_type: TF 15 | 1: 16 | name: MapperPipe__truncate 17 | variant: INPROCESS 18 | 2: 19 | name: MapperPipe__tokenize 20 | variant: INPROCESS 21 | 3: 22 | name: MapperPipe_convert_to_tensor_v2_with_dispatch 23 | variant: INPROCESS 24 | 4: 25 | name: LocalLinePipe 26 | variant: INPROCESS 27 | variant_ctx: 28 | variant_type: INPROCESS 29 | 5: 30 | name: PrefetcherPipe 31 | variant: INPROCESS 32 | variant_ctx: 33 | variant_type: INPROCESS 34 | 6: 35 | fused_pipes: 36 | - 3 37 | - 2 38 | - 1 39 | name: FusedPipe 40 | variant: TF_RAY 41 | variant_ctx: 42 | max_inflight: 1500 43 | max_prefetch: 1500 44 | n_actors: 4 45 | num_parallel_calls: null 46 | submit_batch_size: 500 47 | use_threads: true 48 | variant_type: TF_RAY 49 | -------------------------------------------------------------------------------- /evaluation/run_tf_service.py: -------------------------------------------------------------------------------- 1 | """ 2 | Launches a tf.data.service worker and dispatcher on this machine. 
def main():
    """Start a tf.data service dispatcher and worker, then idle forever.

    Reads the required ``--ip_addr`` option, starts a dispatcher on
    ``DISPATCHER_PORT`` and a worker on ``WORKER_PORT`` that points at the
    dispatcher, prints the dispatcher target, and then sleeps in a loop.
    """
    parser = argparse.ArgumentParser(description="Server for tf.data.service")
    parser.add_argument(
        "--ip_addr",
        type=str,
        help="IP Address of local host",
        required=True,
    )
    ip_addr = parser.parse_args().ip_addr

    service = tf.data.experimental.service

    dispatcher = service.DispatchServer(
        service.DispatcherConfig(port=DISPATCHER_PORT)
    )
    # The target looks like "<scheme>://<address>"; keep the address part.
    dispatcher_address = dispatcher.target.split("://")[1]

    print("Started tf.data service at address {}".format(dispatcher.target))

    worker_config = service.WorkerConfig(
        dispatcher_address=dispatcher_address,
        worker_address=":".join((ip_addr, str(WORKER_PORT))),
        port=WORKER_PORT,
    )
    # Keep a reference to the worker for as long as the loop below runs.
    worker = service.WorkerServer(worker_config)  # noqa:F841

    while True:
        time.sleep(1)
def gaussian_blur(x):
    """Filter ``x`` with ``tfa.image.gaussian_filter2d``.

    The filter is square, with both sides equal to
    ``GAUSSIAN_BLUR_KERNEL_SIZE``.
    """
    kernel_shape = [GAUSSIAN_BLUR_KERNEL_SIZE] * 2
    return tfa.image.gaussian_filter2d(x, filter_shape=kernel_shape)
-------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '7' 4 | 1: '5' 5 | 2: '0' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '1' 10 | 7: '' 11 | n_local_workers: 8 12 | pipes: 13 | 0: 14 | name: MapperPipe_to_tensor 15 | variant: INPROCESS 16 | variant_ctx: 17 | variant_type: INPROCESS 18 | 1: 19 | name: MapperPipe_distort 20 | variant: INPROCESS 21 | variant_ctx: 22 | variant_type: INPROCESS 23 | 2: 24 | name: MapperPipe_RandomHorizontalFlip(p=1) 25 | variant: INPROCESS 26 | variant_ctx: 27 | variant_type: INPROCESS 28 | 3: 29 | name: MapperPipe_SanitizeBoundingBox(min_size=1.0, labels_getter=boxes) 30 | variant: INPROCESS 31 | variant_ctx: 32 | variant_type: INPROCESS 33 | 4: 34 | name: MapperPipe_crop 35 | variant: INPROCESS 36 | variant_ctx: 37 | variant_type: INPROCESS 38 | 5: 39 | name: MapperPipe_zoom_out 40 | variant: INPROCESS 41 | variant_ctx: 42 | variant_type: INPROCESS 43 | 6: 44 | name: COCOSourcePipe 45 | variant: INPROCESS 46 | variant_ctx: 47 | variant_type: INPROCESS 48 | 7: 49 | name: PrefetcherPipe 50 | variant: INPROCESS 51 | variant_ctx: 52 | variant_type: INPROCESS 53 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/ablation_baseline.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '7' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '' 11 | n_local_workers: 1 12 | pipes: 13 | 0: 14 | name: MapperPipe_to_tensor 15 | variant: INPROCESS 16 | variant_ctx: 17 | variant_type: INPROCESS 18 | 1: 19 | name: MapperPipe_distort 20 | variant: INPROCESS 21 | variant_ctx: 22 | variant_type: INPROCESS 23 | 2: 24 | name: MapperPipe_RandomHorizontalFlip(p=1) 25 | variant: INPROCESS 26 | variant_ctx: 27 | variant_type: INPROCESS 28 | 3: 29 | name: MapperPipe_SanitizeBoundingBox(min_size=1.0, labels_getter=boxes) 30 | variant: INPROCESS 
31 | variant_ctx: 32 | variant_type: INPROCESS 33 | 4: 34 | name: MapperPipe_crop 35 | variant: INPROCESS 36 | variant_ctx: 37 | variant_type: INPROCESS 38 | 5: 39 | name: MapperPipe_zoom_out 40 | variant: INPROCESS 41 | variant_ctx: 42 | variant_type: INPROCESS 43 | 6: 44 | name: COCOSourcePipe 45 | variant: INPROCESS 46 | variant_ctx: 47 | variant_type: INPROCESS 48 | 7: 49 | name: PrefetcherPipe 50 | variant: INPROCESS 51 | variant_ctx: 52 | variant_type: INPROCESS 53 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/cedar_local_plan.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '7' 4 | 1: '5' 5 | 2: '0' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '1' 10 | 7: '' 11 | n_local_workers: 8 12 | pipes: 13 | 0: 14 | name: MapperPipe_to_tensor 15 | variant: INPROCESS 16 | variant_ctx: 17 | variant_type: INPROCESS 18 | 1: 19 | name: MapperPipe_distort 20 | variant: INPROCESS 21 | variant_ctx: 22 | variant_type: INPROCESS 23 | 2: 24 | name: MapperPipe_RandomHorizontalFlip(p=1) 25 | variant: INPROCESS 26 | variant_ctx: 27 | variant_type: INPROCESS 28 | 3: 29 | name: MapperPipe_SanitizeBoundingBox(min_size=1.0, labels_getter=boxes) 30 | variant: INPROCESS 31 | variant_ctx: 32 | variant_type: INPROCESS 33 | 4: 34 | name: MapperPipe_crop 35 | variant: INPROCESS 36 | variant_ctx: 37 | variant_type: INPROCESS 38 | 5: 39 | name: MapperPipe_zoom_out 40 | variant: INPROCESS 41 | variant_ctx: 42 | variant_type: INPROCESS 43 | 6: 44 | name: COCOSourcePipe 45 | variant: INPROCESS 46 | variant_ctx: 47 | variant_type: INPROCESS 48 | 7: 49 | name: PrefetcherPipe 50 | variant: INPROCESS 51 | variant_ctx: 52 | variant_type: INPROCESS 53 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/download.py: -------------------------------------------------------------------------------- 1 | """ 
2 | Downloads the dataset for this pipeline 3 | """ 4 | 5 | import pathlib 6 | import tarfile 7 | import urllib.request 8 | 9 | DATASET_NAME = "imagenette2" 10 | DATASET_LOC = "datasets/imagenette2" 11 | DATASET_FILE = "imagenette2.tgz" 12 | DATASET_SOURCE = "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz" 13 | 14 | 15 | def download_dataset(): 16 | # Assume if tar file exists, dataset exists 17 | data_dir = ( 18 | pathlib.Path(__file__).resolve().parents[2].joinpath(DATASET_LOC) 19 | ) 20 | 21 | extract_dir = data_dir / "imagenette2" 22 | if extract_dir.exists(): 23 | print("Dataset already downloaded...") 24 | return 25 | 26 | dataset_file = data_dir / DATASET_FILE 27 | print(dataset_file) 28 | if dataset_file.is_file(): 29 | return 30 | 31 | if not data_dir.exists(): 32 | data_dir.mkdir(parents=True, exist_ok=True) 33 | 34 | print(f"Downloading dataset to {str(dataset_file)}...") 35 | urllib.request.urlretrieve(DATASET_SOURCE, str(dataset_file)) 36 | 37 | tar_path = dataset_file.parent 38 | 39 | print("Extracting dataset...") 40 | with tarfile.open(dataset_file, "r:gz") as tar: 41 | tar.extractall(path=tar_path) 42 | 43 | dataset_file.unlink() 44 | 45 | 46 | if __name__ == "__main__": 47 | download_dataset() 48 | -------------------------------------------------------------------------------- /evaluation/plots/ablation.csv: -------------------------------------------------------------------------------- 1 | Pipeline,Setup,Runtime 2 | CV-torch,Baseline,242.3816667 3 | CV-torch,plus parallelism,49.378 4 | CV-torch,plus reorder,28.204 5 | CV-torch,plus offload,16.98566667 6 | CV-torch,plus fusion,11.11533333 7 | CV-tf,Baseline,45.455 8 | CV-tf,plus parallelism,43.214 9 | CV-tf,plus reorder,38.269 10 | CV-tf,plus offload,38.2 11 | CV-tf,plus fusion,10.627 12 | SSD-torch,Baseline,960.229 13 | SSD-torch,plus parallelism,191.033 14 | SSD-torch,plus reorder,88.535 15 | SSD-torch,plus offload,64.593 16 | SSD-torch,plus fusion,52.788 17 | 
SSD-tf,Baseline,235.187 18 | SSD-tf,plus parallelism,234.49 19 | SSD-tf,plus reorder,40.469 20 | SSD-tf,plus offload,40.373 21 | SSD-tf,plus fusion,31.32 22 | NLP-torch,Baseline,53.295 23 | NLP-torch,plus parallelism,54.09 24 | NLP-torch,plus reorder,53.664 25 | NLP-torch,plus offload,25.692 26 | NLP-torch,plus fusion,20.98866667 27 | NLP-hf-tf,Baseline,248.723 28 | NLP-hf-tf,plus parallelism,250.318 29 | NLP-hf-tf,plus reorder,251.236 30 | NLP-hf-tf,plus offload,93.021 31 | NLP-hf-tf,plus fusion,41.52766667 32 | NLP-tf,Baseline,241.764 33 | NLP-tf,plus parallelism,238.533 34 | NLP-tf,plus reorder,239.402 35 | NLP-tf,plus offload,208.491 36 | NLP-tf,plus fusion,35.94933333 37 | ASR,Baseline,878.4 38 | ASR,plus parallelism,389.28 39 | ASR,plus reorder,376.75 40 | ASR,plus offload,225.05 41 | ASR,plus fusion,20.043 -------------------------------------------------------------------------------- /evaluation/pipelines/commonvoice/configs/eval_remote.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 7: '' 4 | n_local_workers: 1 5 | pipes: 6 | 0: 7 | name: MapperPipe_mel 8 | variant: INPROCESS 9 | 1: 10 | name: MapperPipe_frequency_mask 11 | variant: INPROCESS 12 | 2: 13 | name: MapperPipe_time_mask 14 | variant: INPROCESS 15 | 3: 16 | name: MapperPipe__stretch 17 | variant: INPROCESS 18 | 4: 19 | name: MapperPipe__spec 20 | variant: INPROCESS 21 | 5: 22 | name: MapperPipe__resample 23 | variant: INPROCESS 24 | 6: 25 | name: MapperPipe__read 26 | variant: INPROCESS 27 | 7: 28 | fused_pipes: 29 | - 6 30 | - 5 31 | - 4 32 | - 2 33 | - 1 34 | - 3 35 | - 0 36 | name: LocalFSListerPipe 37 | variant: RAY_DS 38 | variant_ctx: 39 | variant_type: RAY_DS 40 | 8: 41 | name: PrefetcherPipe 42 | variant: INPROCESS 43 | variant_ctx: 44 | variant_type: INPROCESS 45 | 9: 46 | fused_pipes: 47 | - 6 48 | - 5 49 | - 4 50 | - 2 51 | - 1 52 | - 3 53 | - 0 54 | name: FusedPipe 55 | variant: RAY 56 | variant_ctx: 57 | 
max_inflight: 100 58 | max_prefetch: 100 59 | n_actors: 4 60 | submit_batch_size: 3 61 | use_threads: true 62 | variant_type: RAY 63 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/cache_results/configs/wikitext_no_caching_plan.yml: -------------------------------------------------------------------------------- 1 | graph: 2 | 0: '9' 3 | 1: '0' 4 | 2: '1' 5 | 8: '10' 6 | 9: '' 7 | 10: '2' 8 | n_local_workers: 1 9 | pipes: 10 | 0: 11 | name: BatcherPipe(batch_size=1) 12 | variant: INPROCESS 13 | variant_ctx: 14 | variant_type: INPROCESS 15 | 1: 16 | name: MapperPipe_Embedding(50257, 764) 17 | variant: INPROCESS 18 | variant_ctx: 19 | variant_type: INPROCESS 20 | 2: 21 | name: MapperPipe_ToTensor() 22 | variant: INPROCESS 23 | variant_ctx: 24 | variant_type: INPROCESS 25 | 3: 26 | name: MapperPipe_AddToken() 27 | 4: 28 | name: MapperPipe_AddToken() 29 | 5: 30 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 31 | 6: 32 | name: MapperPipe_Truncate() 33 | 7: 34 | name: MapperPipe_GPT2BPETokenizer() 35 | 8: 36 | name: LocalLinePipe 37 | variant: INPROCESS 38 | variant_ctx: 39 | variant_type: INPROCESS 40 | 9: 41 | name: PrefetcherPipe 42 | variant: INPROCESS 43 | variant_ctx: 44 | variant_type: INPROCESS 45 | 10: 46 | fused_pipes: 47 | - 7 48 | - 6 49 | - 5 50 | - 4 51 | - 3 52 | name: FusedPipe 53 | variant: RAY 54 | variant_ctx: 55 | max_inflight: 48000 56 | max_prefetch: 48000 57 | n_actors: 32 58 | submit_batch_size: 500 59 | use_threads: true 60 | variant_type: RAY 61 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/ablation_tf_p.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '11' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '7' 12 | 9: '8' 13 | 10: '9' 14 | 11: '' 15 | n_local_workers: 8 16 | pipes: 
17 | 0: 18 | name: BatcherPipe(batch_size=1) 19 | variant: INPROCESS 20 | variant_ctx: 21 | variant_type: INPROCESS 22 | 1: 23 | name: MapperPipe_per_image_standardization 24 | variant: INPROCESS 25 | 2: 26 | name: MapperPipe_gaussian_blur 27 | variant: INPROCESS 28 | 3: 29 | name: MapperPipe_rgb_to_grayscale 30 | variant: INPROCESS 31 | 4: 32 | name: MapperPipe_color_jitter 33 | variant: INPROCESS 34 | 5: 35 | name: MapperPipe_random_flip 36 | variant: INPROCESS 37 | 6: 38 | name: MapperPipe_crop_and_resize 39 | variant: INPROCESS 40 | 7: 41 | name: MapperPipe_convert_to_float 42 | variant: INPROCESS 43 | 8: 44 | name: MapperPipe_decode_jpeg 45 | variant: INPROCESS 46 | 9: 47 | name: MapperPipe_read_file 48 | variant: INPROCESS 49 | 10: 50 | name: LocalFSListerPipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 11: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/ablation_tf_p_r.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '11' 4 | 1: '0' 5 | 2: '5' 6 | 3: '6' 7 | 4: '7' 8 | 5: '4' 9 | 6: '2' 10 | 7: '1' 11 | 8: '3' 12 | 9: '8' 13 | 10: '9' 14 | 11: '' 15 | n_local_workers: 8 16 | pipes: 17 | 0: 18 | name: BatcherPipe(batch_size=1) 19 | variant: INPROCESS 20 | variant_ctx: 21 | variant_type: INPROCESS 22 | 1: 23 | name: MapperPipe_per_image_standardization 24 | variant: INPROCESS 25 | 2: 26 | name: MapperPipe_gaussian_blur 27 | variant: INPROCESS 28 | 3: 29 | name: MapperPipe_rgb_to_grayscale 30 | variant: INPROCESS 31 | 4: 32 | name: MapperPipe_color_jitter 33 | variant: INPROCESS 34 | 5: 35 | name: MapperPipe_random_flip 36 | variant: INPROCESS 37 | 6: 38 | name: MapperPipe_crop_and_resize 39 | variant: INPROCESS 40 | 7: 41 | name: MapperPipe_convert_to_float 42 | variant: INPROCESS 43 
| 8: 44 | name: MapperPipe_decode_jpeg 45 | variant: INPROCESS 46 | 9: 47 | name: MapperPipe_read_file 48 | variant: INPROCESS 49 | 10: 50 | name: LocalFSListerPipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 11: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/ablation_tf_p_r_o.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '11' 4 | 1: '0' 5 | 2: '5' 6 | 3: '6' 7 | 4: '7' 8 | 5: '4' 9 | 6: '2' 10 | 7: '1' 11 | 8: '3' 12 | 9: '8' 13 | 10: '9' 14 | 11: '' 15 | n_local_workers: 8 16 | pipes: 17 | 0: 18 | name: BatcherPipe(batch_size=1) 19 | variant: INPROCESS 20 | variant_ctx: 21 | variant_type: INPROCESS 22 | 1: 23 | name: MapperPipe_per_image_standardization 24 | variant: INPROCESS 25 | 2: 26 | name: MapperPipe_gaussian_blur 27 | variant: INPROCESS 28 | 3: 29 | name: MapperPipe_rgb_to_grayscale 30 | variant: INPROCESS 31 | 4: 32 | name: MapperPipe_color_jitter 33 | variant: INPROCESS 34 | 5: 35 | name: MapperPipe_random_flip 36 | variant: INPROCESS 37 | 6: 38 | name: MapperPipe_crop_and_resize 39 | variant: INPROCESS 40 | 7: 41 | name: MapperPipe_convert_to_float 42 | variant: INPROCESS 43 | 8: 44 | name: MapperPipe_decode_jpeg 45 | variant: INPROCESS 46 | 9: 47 | name: MapperPipe_read_file 48 | variant: INPROCESS 49 | 10: 50 | name: LocalFSListerPipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 11: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_tf_p_r_o.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 
0: '5' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: MapperPipe__embedding 13 | variant: TF 14 | variant_ctx: 15 | num_parallel_calls: null 16 | variant_type: TF 17 | 1: 18 | name: MapperPipe__truncate 19 | variant: TF 20 | variant_ctx: 21 | num_parallel_calls: null 22 | variant_type: TF 23 | 2: 24 | name: MapperPipe__tokenize 25 | variant: TF_RAY 26 | variant_ctx: 27 | max_inflight: 3000 28 | max_prefetch: 3000 29 | n_actors: 2 30 | num_parallel_calls: null 31 | submit_batch_size: 500 32 | use_threads: true 33 | variant_type: TF_RAY 34 | 3: 35 | name: MapperPipe_convert_to_tensor_v2_with_dispatch 36 | variant: TF_RAY 37 | variant_ctx: 38 | max_inflight: 3000 39 | max_prefetch: 3000 40 | n_actors: 2 41 | num_parallel_calls: null 42 | submit_batch_size: 500 43 | use_threads: true 44 | variant_type: TF_RAY 45 | 4: 46 | name: LocalLinePipe 47 | variant: INPROCESS 48 | variant_ctx: 49 | variant_type: INPROCESS 50 | 5: 51 | name: PrefetcherPipe 52 | variant: INPROCESS 53 | variant_ctx: 54 | variant_type: INPROCESS 55 | -------------------------------------------------------------------------------- /evaluation/pipelines/commonvoice/configs/eval_local.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 7: '' 4 | n_local_workers: 1 5 | pipes: 6 | 0: 7 | name: MapperPipe_mel 8 | variant: INPROCESS 9 | variant_ctx: 10 | variant_type: INPROCESS 11 | 1: 12 | name: MapperPipe_frequency_mask 13 | variant: INPROCESS 14 | variant_ctx: 15 | variant_type: INPROCESS 16 | 2: 17 | name: MapperPipe_time_mask 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 3: 22 | name: MapperPipe__stretch 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 4: 27 | name: MapperPipe__spec 28 | variant: INPROCESS 29 | variant_ctx: 30 | variant_type: INPROCESS 31 | 5: 32 | name: MapperPipe__resample 33 | variant: INPROCESS 34 
| variant_ctx: 35 | variant_type: INPROCESS 36 | 6: 37 | name: MapperPipe__read 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 7: 42 | fused_pipes: 43 | - 6 44 | - 5 45 | - 4 46 | - 2 47 | - 1 48 | - 3 49 | - 0 50 | name: LocalFSListerPipe 51 | variant: RAY_DS 52 | variant_ctx: 53 | variant_type: RAY_DS 54 | 8: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/commonvoice/configs/ablation_p.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '8' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '' 12 | n_local_workers: 8 13 | pipes: 14 | 0: 15 | name: MapperPipe_mel 16 | variant: INPROCESS 17 | variant_ctx: 18 | variant_type: INPROCESS 19 | 1: 20 | name: MapperPipe_frequency_mask 21 | variant: INPROCESS 22 | variant_ctx: 23 | variant_type: INPROCESS 24 | 2: 25 | name: MapperPipe_time_mask 26 | variant: INPROCESS 27 | variant_ctx: 28 | variant_type: INPROCESS 29 | 3: 30 | name: MapperPipe__stretch 31 | variant: INPROCESS 32 | variant_ctx: 33 | variant_type: INPROCESS 34 | 4: 35 | name: MapperPipe__spec 36 | variant: INPROCESS 37 | variant_ctx: 38 | variant_type: INPROCESS 39 | 5: 40 | name: MapperPipe__resample 41 | variant: INPROCESS 42 | variant_ctx: 43 | variant_type: INPROCESS 44 | 6: 45 | name: MapperPipe__read 46 | variant: INPROCESS 47 | variant_ctx: 48 | variant_type: INPROCESS 49 | 7: 50 | name: LocalFSListerPipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 8: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/commonvoice/configs/ablation_p_r.yaml: 
-------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '8' 4 | 1: '3' 5 | 2: '1' 6 | 3: '0' 7 | 4: '2' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '' 12 | n_local_workers: 8 13 | pipes: 14 | 0: 15 | name: MapperPipe_mel 16 | variant: INPROCESS 17 | variant_ctx: 18 | variant_type: INPROCESS 19 | 1: 20 | name: MapperPipe_frequency_mask 21 | variant: INPROCESS 22 | variant_ctx: 23 | variant_type: INPROCESS 24 | 2: 25 | name: MapperPipe_time_mask 26 | variant: INPROCESS 27 | variant_ctx: 28 | variant_type: INPROCESS 29 | 3: 30 | name: MapperPipe__stretch 31 | variant: INPROCESS 32 | variant_ctx: 33 | variant_type: INPROCESS 34 | 4: 35 | name: MapperPipe__spec 36 | variant: INPROCESS 37 | variant_ctx: 38 | variant_type: INPROCESS 39 | 5: 40 | name: MapperPipe__resample 41 | variant: INPROCESS 42 | variant_ctx: 43 | variant_type: INPROCESS 44 | 6: 45 | name: MapperPipe__read 46 | variant: INPROCESS 47 | variant_ctx: 48 | variant_type: INPROCESS 49 | 7: 50 | name: LocalFSListerPipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 8: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/commonvoice/configs/ablation_baseline.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '8' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '' 12 | n_local_workers: 1 13 | pipes: 14 | 0: 15 | name: MapperPipe_mel 16 | variant: INPROCESS 17 | variant_ctx: 18 | variant_type: INPROCESS 19 | 1: 20 | name: MapperPipe_frequency_mask 21 | variant: INPROCESS 22 | variant_ctx: 23 | variant_type: INPROCESS 24 | 2: 25 | name: MapperPipe_time_mask 26 | variant: INPROCESS 27 | variant_ctx: 28 | variant_type: INPROCESS 29 | 3: 30 | name: MapperPipe__stretch 31 
| variant: INPROCESS 32 | variant_ctx: 33 | variant_type: INPROCESS 34 | 4: 35 | name: MapperPipe__spec 36 | variant: INPROCESS 37 | variant_ctx: 38 | variant_type: INPROCESS 39 | 5: 40 | name: MapperPipe__resample 41 | variant: INPROCESS 42 | variant_ctx: 43 | variant_type: INPROCESS 44 | 6: 45 | name: MapperPipe__read 46 | variant: INPROCESS 47 | variant_ctx: 48 | variant_type: INPROCESS 49 | 7: 50 | name: LocalFSListerPipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 8: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /cedar/client/logger.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import queue 3 | 4 | 5 | class LoggerThread(threading.Thread): 6 | def __init__(self, log_queue: queue.Queue[str], log_file: str) -> None: 7 | super().__init__(daemon=True) 8 | self.log_queue = log_queue 9 | self.log_file = log_file 10 | self.running = True 11 | 12 | def run(self) -> None: 13 | with open(self.log_file, "a") as f: 14 | while self.running: 15 | try: 16 | entry = self.log_queue.get(timeout=0.1) 17 | f.write(entry + "\n") 18 | f.flush() 19 | except queue.Empty: 20 | continue 21 | 22 | def stop(self) -> None: 23 | self.running = False 24 | 25 | 26 | class DataSetLogger: 27 | """ 28 | Encapsulates a thread which logs to a file. 
29 | 30 | Args: 31 | log_file (str): File to log data to 32 | """ 33 | 34 | def __init__(self, log_file: str) -> None: 35 | self.log_file = log_file 36 | self.log_queue = queue.Queue() 37 | self.logger = LoggerThread(self.log_queue, self.log_file) 38 | self.logger.start() 39 | 40 | def log(self, entry: str) -> None: 41 | self.log_queue.put(entry) 42 | 43 | def close(self) -> None: 44 | self.logger.stop() 45 | self.logger.join() 46 | 47 | def __del__(self) -> None: 48 | if self.logger.is_alive(): 49 | self.close() 50 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/ablation_tf_p.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '7' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '' 11 | n_local_workers: 1 12 | pipes: 13 | 0: 14 | name: MapperPipe_normalize 15 | variant: TF 16 | variant_ctx: 17 | num_parallel_calls: null 18 | variant_type: TF 19 | 1: 20 | name: MapperPipe_distort 21 | variant: TF 22 | variant_ctx: 23 | num_parallel_calls: null 24 | variant_type: TF 25 | 2: 26 | name: MapperPipe_random_flip 27 | variant: TF 28 | variant_ctx: 29 | num_parallel_calls: null 30 | variant_type: TF 31 | 3: 32 | name: MapperPipe_resize_image 33 | variant: TF 34 | variant_ctx: 35 | num_parallel_calls: null 36 | variant_type: TF 37 | 4: 38 | name: MapperPipe_distorted_bounding_box_crop 39 | variant: TF 40 | variant_ctx: 41 | num_parallel_calls: null 42 | variant_type: TF 43 | 5: 44 | name: MapperPipe_read_image 45 | variant: TF 46 | variant_ctx: 47 | num_parallel_calls: null 48 | variant_type: TF 49 | 6: 50 | name: COCOFileSourcePipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 7: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- 
/evaluation/pipelines/coco/configs/ablation_tf_pr.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '7' 4 | 1: '3' 5 | 2: '0' 6 | 3: '2' 7 | 4: '1' 8 | 5: '4' 9 | 6: '5' 10 | 7: '' 11 | n_local_workers: 1 12 | pipes: 13 | 0: 14 | name: MapperPipe_normalize 15 | variant: TF 16 | variant_ctx: 17 | num_parallel_calls: null 18 | variant_type: TF 19 | 1: 20 | name: MapperPipe_distort 21 | variant: TF 22 | variant_ctx: 23 | num_parallel_calls: null 24 | variant_type: TF 25 | 2: 26 | name: MapperPipe_random_flip 27 | variant: TF 28 | variant_ctx: 29 | num_parallel_calls: null 30 | variant_type: TF 31 | 3: 32 | name: MapperPipe_resize_image 33 | variant: TF 34 | variant_ctx: 35 | num_parallel_calls: null 36 | variant_type: TF 37 | 4: 38 | name: MapperPipe_distorted_bounding_box_crop 39 | variant: TF 40 | variant_ctx: 41 | num_parallel_calls: null 42 | variant_type: TF 43 | 5: 44 | name: MapperPipe_read_image 45 | variant: TF 46 | variant_ctx: 47 | num_parallel_calls: null 48 | variant_type: TF 49 | 6: 50 | name: COCOFileSourcePipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 7: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/ablation_tf_pro.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '7' 4 | 1: '3' 5 | 2: '0' 6 | 3: '2' 7 | 4: '1' 8 | 5: '4' 9 | 6: '5' 10 | 7: '' 11 | n_local_workers: 1 12 | pipes: 13 | 0: 14 | name: MapperPipe_normalize 15 | variant: TF 16 | variant_ctx: 17 | num_parallel_calls: null 18 | variant_type: TF 19 | 1: 20 | name: MapperPipe_distort 21 | variant: TF 22 | variant_ctx: 23 | num_parallel_calls: null 24 | variant_type: TF 25 | 2: 26 | name: MapperPipe_random_flip 27 | variant: TF 28 | variant_ctx: 29 | 
num_parallel_calls: null 30 | variant_type: TF 31 | 3: 32 | name: MapperPipe_resize_image 33 | variant: TF 34 | variant_ctx: 35 | num_parallel_calls: null 36 | variant_type: TF 37 | 4: 38 | name: MapperPipe_distorted_bounding_box_crop 39 | variant: TF 40 | variant_ctx: 41 | num_parallel_calls: null 42 | variant_type: TF 43 | 5: 44 | name: MapperPipe_read_image 45 | variant: TF 46 | variant_ctx: 47 | num_parallel_calls: null 48 | variant_type: TF 49 | 6: 50 | name: COCOFileSourcePipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 7: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/ablation_tf_baseline.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '7' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '' 11 | n_local_workers: 1 12 | pipes: 13 | 0: 14 | name: MapperPipe_normalize 15 | variant: TF 16 | variant_ctx: 17 | num_parallel_calls: null 18 | variant_type: TF 19 | 1: 20 | name: MapperPipe_distort 21 | variant: TF 22 | variant_ctx: 23 | num_parallel_calls: null 24 | variant_type: TF 25 | 2: 26 | name: MapperPipe_random_flip 27 | variant: TF 28 | variant_ctx: 29 | num_parallel_calls: null 30 | variant_type: TF 31 | 3: 32 | name: MapperPipe_resize_image 33 | variant: TF 34 | variant_ctx: 35 | num_parallel_calls: null 36 | variant_type: TF 37 | 4: 38 | name: MapperPipe_distorted_bounding_box_crop 39 | variant: TF 40 | variant_ctx: 41 | num_parallel_calls: null 42 | variant_type: TF 43 | 5: 44 | name: MapperPipe_read_image 45 | variant: TF 46 | variant_ctx: 47 | num_parallel_calls: null 48 | variant_type: TF 49 | 6: 50 | name: COCOFileSourcePipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 7: 55 | name: PrefetcherPipe 56 | variant: INPROCESS 57 
| variant_ctx: 58 | variant_type: INPROCESS 59 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/cache_results/configs/wikitext_cache_after_truncate.yml: -------------------------------------------------------------------------------- 1 | graph: 2 | 0: '10' 3 | 1: '0' 4 | 2: '9' 5 | 8: '11' 6 | 9: '1' 7 | 10: '' 8 | 11: '2' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: BatcherPipe(batch_size=1) 13 | variant: INPROCESS 14 | variant_ctx: 15 | variant_type: INPROCESS 16 | 1: 17 | name: MapperPipe_Embedding(50257, 764) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 2: 22 | name: MapperPipe_ToTensor() 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 3: 27 | name: MapperPipe_AddToken() 28 | 4: 29 | name: MapperPipe_AddToken() 30 | 5: 31 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 32 | 6: 33 | name: MapperPipe_Truncate() 34 | 7: 35 | name: MapperPipe_GPT2BPETokenizer() 36 | 8: 37 | name: LocalLinePipe 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 9: 42 | name: ObjectDiskCachePipe 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 10: 47 | name: PrefetcherPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 11: 52 | fused_pipes: 53 | - 7 54 | - 6 55 | - 5 56 | - 4 57 | - 3 58 | name: FusedPipe 59 | variant: RAY 60 | variant_ctx: 61 | max_inflight: 48000 62 | max_prefetch: 48000 63 | n_actors: 32 64 | submit_batch_size: 500 65 | use_threads: true 66 | variant_type: RAY 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/cache_results/configs/new_wikitext_optimal_cache_plan.yml: -------------------------------------------------------------------------------- 1 | graph: 2 | 0: '10' 3 | 1: '0' 4 | 2: '9' 5 | 8: '11' 6 | 9: '1' 7 | 10: '' 8 | 11: '2' 9 | n_local_workers: 1 10 | pipes: 
11 | 0: 12 | name: BatcherPipe(batch_size=1) 13 | variant: INPROCESS 14 | variant_ctx: 15 | variant_type: INPROCESS 16 | 1: 17 | name: MapperPipe_Embedding(50257, 764) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 2: 22 | name: MapperPipe_ToTensor() 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 3: 27 | name: MapperPipe_AddToken() 28 | 4: 29 | name: MapperPipe_AddToken() 30 | 5: 31 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 32 | 6: 33 | name: MapperPipe_Truncate() 34 | 7: 35 | name: MapperPipe_GPT2BPETokenizer() 36 | 8: 37 | name: LocalLinePipe 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 9: 42 | name: ObjectDiskCachePipe 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 10: 47 | name: PrefetcherPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 11: 52 | fused_pipes: 53 | - 7 54 | - 6 55 | - 5 56 | - 4 57 | - 3 58 | name: FusedPipe 59 | variant: RAY 60 | variant_ctx: 61 | max_inflight: 48000 62 | max_prefetch: 48000 63 | n_actors: 32 64 | submit_batch_size: 500 65 | use_threads: true 66 | variant_type: RAY 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/cache_results/configs/wikitext_cache_after_tensor_conversion.yml: -------------------------------------------------------------------------------- 1 | graph: 2 | 0: '10' 3 | 1: '0' 4 | 2: '9' 5 | 8: '11' 6 | 9: '1' 7 | 10: '' 8 | 11: '2' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: BatcherPipe(batch_size=1) 13 | variant: INPROCESS 14 | variant_ctx: 15 | variant_type: INPROCESS 16 | 1: 17 | name: MapperPipe_Embedding(50257, 764) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 2: 22 | name: MapperPipe_ToTensor() 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 3: 27 | name: MapperPipe_AddToken() 28 | 4: 29 | name: 
MapperPipe_AddToken() 30 | 5: 31 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 32 | 6: 33 | name: MapperPipe_Truncate() 34 | 7: 35 | name: MapperPipe_GPT2BPETokenizer() 36 | 8: 37 | name: LocalLinePipe 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 9: 42 | name: ObjectDiskCachePipe 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 10: 47 | name: PrefetcherPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 11: 52 | fused_pipes: 53 | - 7 54 | - 6 55 | - 5 56 | - 4 57 | - 3 58 | name: FusedPipe 59 | variant: RAY 60 | variant_ctx: 61 | max_inflight: 48000 62 | max_prefetch: 48000 63 | n_actors: 32 64 | submit_batch_size: 500 65 | use_threads: true 66 | variant_type: RAY 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/ablation_pro.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '8' 4 | 1: '5' 5 | 2: '0' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '1' 10 | 7: '6' 11 | 8: '' 12 | n_local_workers: 8 13 | pipes: 14 | 0: 15 | name: MapperPipe_to_tensor 16 | variant: INPROCESS 17 | variant_ctx: 18 | variant_type: INPROCESS 19 | 1: 20 | name: MapperPipe_distort 21 | variant: RAY 22 | variant_ctx: 23 | max_inflight: 100 24 | max_prefetch: 100 25 | n_actors: 4 26 | submit_batch_size: 1 27 | use_threads: true 28 | variant_type: RAY 29 | 2: 30 | name: MapperPipe_RandomHorizontalFlip(p=1) 31 | variant: INPROCESS 32 | variant_ctx: 33 | variant_type: INPROCESS 34 | 3: 35 | name: MapperPipe_SanitizeBoundingBox(min_size=1.0, labels_getter=boxes) 36 | variant: INPROCESS 37 | variant_ctx: 38 | variant_type: INPROCESS 39 | 4: 40 | name: MapperPipe_crop 41 | variant: INPROCESS 42 | variant_ctx: 43 | variant_type: INPROCESS 44 | 5: 45 | name: MapperPipe_zoom_out 46 | variant: INPROCESS 47 | variant_ctx: 48 | variant_type: INPROCESS 49 | 6: 50 | name: 
MapperPipe_read_image 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 7: 55 | name: COCOFileSourcePipe 56 | variant: INPROCESS 57 | variant_ctx: 58 | variant_type: INPROCESS 59 | 8: 60 | name: PrefetcherPipe 61 | variant: INPROCESS 62 | variant_ctx: 63 | variant_type: INPROCESS 64 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/cedar_tf_local_plan.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 6: '8' 4 | 7: '' 5 | 8: '7' 6 | n_local_workers: 1 7 | pipes: 8 | 0: 9 | name: MapperPipe_normalize 10 | variant: TF 11 | variant_ctx: 12 | num_parallel_calls: null 13 | variant_type: TF 14 | 1: 15 | name: MapperPipe_distort 16 | variant: TF 17 | variant_ctx: 18 | num_parallel_calls: null 19 | variant_type: TF 20 | 2: 21 | name: MapperPipe_random_flip 22 | variant: TF 23 | variant_ctx: 24 | num_parallel_calls: null 25 | variant_type: TF 26 | 3: 27 | name: MapperPipe_resize_image 28 | variant: TF 29 | variant_ctx: 30 | num_parallel_calls: null 31 | variant_type: TF 32 | 4: 33 | name: MapperPipe_distorted_bounding_box_crop 34 | variant: TF 35 | variant_ctx: 36 | num_parallel_calls: null 37 | variant_type: TF 38 | 5: 39 | name: MapperPipe_read_image 40 | variant: TF 41 | variant_ctx: 42 | num_parallel_calls: null 43 | variant_type: TF 44 | 6: 45 | name: COCOFileSourcePipe 46 | variant: INPROCESS 47 | variant_ctx: 48 | variant_type: INPROCESS 49 | 7: 50 | name: PrefetcherPipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 8: 55 | fused_pipes: 56 | - 5 57 | - 4 58 | - 1 59 | - 3 60 | - 2 61 | - 0 62 | name: FusedPipe 63 | variant: TF 64 | variant_ctx: 65 | num_parallel_calls: -1 66 | variant_type: TF 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/cedar_tf_remote_plan.yml: 
class TestModel(ff.FastFlowModel):
    """Identity model used to exercise the FastFlow training loop."""

    def __init__(self):
        super().__init__()

    def call(self, inputs):
        """Return ``inputs`` unchanged (the model does no computation)."""
        return inputs

    def __deepcopy__(self, memo=None):
        """Return a fresh ``TestModel`` instead of copying this instance.

        ``copy.deepcopy`` invokes this hook with the memo dictionary as a
        positional argument, so the parameter must be accepted even though
        it is not used here. It defaults to ``None`` so that direct
        ``model.__deepcopy__()`` calls keep working.
        """
        return TestModel()


class TestApp(App):
    """Minimal ``App`` that feeds random integer rows to ``TestModel``."""

    def __init__(self, args, config):
        super().__init__(args, config)

        # Single-component dataset built from a [100000, 32] int32 tensor
        # drawn uniformly with maxval=100.
        self.ds = tf.data.Dataset.from_tensor_slices(
            (tf.random.uniform([100000, 32], maxval=100, dtype=tf.int32),)
        )

    def dummy_loss(self, y_true, y_pred):
        """Return a constant zero loss; both arguments are ignored."""
        return tf.constant(0.0)

    def create_model(self):
        """Build a ``TestModel`` compiled with Adam and the dummy loss."""
        model = TestModel()

        model.compile(optimizer="adam", loss=self.dummy_loss)
        return model

    def create_dataset(self, num_parallel):
        """Return the training dataset.

        Each element ``x`` is mapped to the pair ``(x + 1, x)`` using
        ``num_parallel`` parallel calls, then batched by 32 and prefetched
        with ``tf.data.AUTOTUNE``.
        """
        dataset = self.ds.map(
            lambda x: (x + 1, x),
            num_parallel_calls=num_parallel,
            name="prep_begin",
        )
        dataset = dataset.batch(32)
        dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

        return dataset

    def create_valid_dataset(self, num_parallel):
        """Return ``None``: this app defines no validation dataset."""
        return None


def main():
    # NOTE(review): `dataloader` and `dummy_loss` are not defined anywhere in
    # this module (the loss only exists as `TestApp.dummy_loss`), so running
    # this function as-is raises NameError. It looks like a leftover from an
    # earlier version of the script -- confirm whether it should be removed
    # or rewired through `TestApp`.
    ds = dataloader()
    valid_ds = dataloader()

    model = TestModel()
    model.compile(optimizer="adam", loss=dummy_loss)

    # NOTE(review): hard-coded, machine-specific config path.
    config = ff.FastFlowConfig.from_yaml("/home/myzhao/FastFlow/eval/test/config.yaml")

    model.fit(x=ds, auto_offload_conf=config, epochs=10)


if __name__ == "__main__":
    main()
5 | # Replace the stats and optimizer-produced config in the following commands 6 | 7 | # cv-torch 8 | # with caching 9 | python eval_cedar.py --dataset_file pipelines/simclrv2/cedar_cache_dataset.py --master_feature_config pipelines/simclrv2/cache_results/configs/new_simclrv2_optimized_plan.yml --use_ray --ray_ip 10.138.0.8 10 | # without caching 11 | python eval_cedar.py --dataset_file pipelines/simclrv2/cedar_cache_dataset.py --master_feature_config pipelines/simclrv2/cache_results/configs/no_cache_plan.yml --use_ray --ray_ip 10.138.0.8 12 | 13 | # nlp-torch 14 | # with caching 15 | python eval_cedar.py --dataset_file pipelines/wikitext103/cedar_cache_dataset.py --use_ray --ray_ip 10.138.0.8 --num_total_samples 100000 --master_feature_config pipelines/wikitext103/cache_results/configs/new_wikitext_optimal_cache_plan.yml 16 | # without caching 17 | python eval_cedar.py --dataset_file pipelines/wikitext103/cedar_cache_dataset.py --use_ray --ray_ip 10.138.0.8 --num_total_samples 100000 --master_feature_config pipelines/wikitext103/cache_results/configs/wikitext_no_caching_plan.yml 18 | 19 | # asr 20 | # without caching note that the optimizer generates the optimal plan, which does not cache 21 | python eval_cedar.py --dataset_file pipelines/commonvoice/cedar_cache_dataset.py --master_feature_config pipelines/commonvoice/cache_results/configs/no_caching_eval_remote.yml --num_total_samples 10000 -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_p.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '9' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '7' 12 | 9: '' 13 | n_local_workers: 1 14 | pipes: 15 | 0: 16 | name: BatcherPipe(batch_size=1) 17 | variant: INPROCESS 18 | variant_ctx: 19 | variant_type: INPROCESS 20 | 1: 21 | name: MapperPipe_Embedding(50257, 764) 22 | variant: 
INPROCESS 23 | variant_ctx: 24 | variant_type: INPROCESS 25 | 2: 26 | name: MapperPipe_ToTensor() 27 | variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 3: 31 | name: MapperPipe_AddToken() 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 4: 36 | name: MapperPipe_AddToken() 37 | variant: INPROCESS 38 | variant_ctx: 39 | variant_type: INPROCESS 40 | 5: 41 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 42 | variant: INPROCESS 43 | variant_ctx: 44 | variant_type: INPROCESS 45 | 6: 46 | name: MapperPipe_Truncate() 47 | variant: INPROCESS 48 | variant_ctx: 49 | variant_type: INPROCESS 50 | 7: 51 | name: MapperPipe_GPT2BPETokenizer() 52 | variant: INPROCESS 53 | variant_ctx: 54 | variant_type: INPROCESS 55 | 8: 56 | name: LocalLinePipe 57 | variant: INPROCESS 58 | variant_ctx: 59 | variant_type: INPROCESS 60 | 9: 61 | name: PrefetcherPipe 62 | variant: INPROCESS 63 | variant_ctx: 64 | variant_type: INPROCESS 65 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_p_r.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '9' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '7' 12 | 9: '' 13 | n_local_workers: 1 14 | pipes: 15 | 0: 16 | name: BatcherPipe(batch_size=1) 17 | variant: INPROCESS 18 | variant_ctx: 19 | variant_type: INPROCESS 20 | 1: 21 | name: MapperPipe_Embedding(50257, 764) 22 | variant: INPROCESS 23 | variant_ctx: 24 | variant_type: INPROCESS 25 | 2: 26 | name: MapperPipe_ToTensor() 27 | variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 3: 31 | name: MapperPipe_AddToken() 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 4: 36 | name: MapperPipe_AddToken() 37 | variant: INPROCESS 38 | variant_ctx: 39 | variant_type: INPROCESS 40 | 5: 41 | name: 
"MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 42 | variant: INPROCESS 43 | variant_ctx: 44 | variant_type: INPROCESS 45 | 6: 46 | name: MapperPipe_Truncate() 47 | variant: INPROCESS 48 | variant_ctx: 49 | variant_type: INPROCESS 50 | 7: 51 | name: MapperPipe_GPT2BPETokenizer() 52 | variant: INPROCESS 53 | variant_ctx: 54 | variant_type: INPROCESS 55 | 8: 56 | name: LocalLinePipe 57 | variant: INPROCESS 58 | variant_ctx: 59 | variant_type: INPROCESS 60 | 9: 61 | name: PrefetcherPipe 62 | variant: INPROCESS 63 | variant_ctx: 64 | variant_type: INPROCESS 65 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/ablation_baseline.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '9' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '7' 12 | 9: '' 13 | n_local_workers: 1 14 | pipes: 15 | 0: 16 | name: BatcherPipe(batch_size=1) 17 | variant: INPROCESS 18 | variant_ctx: 19 | variant_type: INPROCESS 20 | 1: 21 | name: MapperPipe_Embedding(50257, 764) 22 | variant: INPROCESS 23 | variant_ctx: 24 | variant_type: INPROCESS 25 | 2: 26 | name: MapperPipe_ToTensor() 27 | variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 3: 31 | name: MapperPipe_AddToken() 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 4: 36 | name: MapperPipe_AddToken() 37 | variant: INPROCESS 38 | variant_ctx: 39 | variant_type: INPROCESS 40 | 5: 41 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 42 | variant: INPROCESS 43 | variant_ctx: 44 | variant_type: INPROCESS 45 | 6: 46 | name: MapperPipe_Truncate() 47 | variant: INPROCESS 48 | variant_ctx: 49 | variant_type: INPROCESS 50 | 7: 51 | name: MapperPipe_GPT2BPETokenizer() 52 | variant: INPROCESS 53 | variant_ctx: 54 | variant_type: INPROCESS 55 | 8: 56 | name: LocalLinePipe 57 | variant: INPROCESS 58 | variant_ctx: 59 
| variant_type: INPROCESS 60 | 9: 61 | name: PrefetcherPipe 62 | variant: INPROCESS 63 | variant_ctx: 64 | variant_type: INPROCESS 65 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/eval_remote.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '9' 4 | 1: '0' 5 | 2: '1' 6 | 8: '10' 7 | 9: '' 8 | 10: '2' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: BatcherPipe(batch_size=1) 13 | variant: INPROCESS 14 | variant_ctx: 15 | variant_type: INPROCESS 16 | 1: 17 | name: MapperPipe_Embedding(50257, 764) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 2: 22 | name: MapperPipe_ToTensor() 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 3: 27 | name: MapperPipe_AddToken() 28 | variant: INPROCESS 29 | 4: 30 | name: MapperPipe_AddToken() 31 | variant: INPROCESS 32 | 5: 33 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 34 | variant: INPROCESS 35 | 6: 36 | name: MapperPipe_Truncate() 37 | variant: INPROCESS 38 | 7: 39 | name: MapperPipe_GPT2BPETokenizer() 40 | variant: INPROCESS 41 | 8: 42 | name: LocalLinePipe 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 9: 47 | name: PrefetcherPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 10: 52 | fused_pipes: 53 | - 7 54 | - 6 55 | - 5 56 | - 4 57 | - 3 58 | name: FusedPipe 59 | variant: RAY 60 | variant_ctx: 61 | max_inflight: 1500 62 | max_prefetch: 1500 63 | n_actors: 32 64 | submit_batch_size: 500 65 | use_threads: true 66 | variant_type: RAY 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/configs/eval_local.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '9' 4 | 1: '0' 5 | 2: '1' 6 | 8: '10' 7 | 9: '' 8 | 10: 
'2' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: BatcherPipe(batch_size=1) 13 | variant: INPROCESS 14 | variant_ctx: 15 | variant_type: INPROCESS 16 | 1: 17 | name: MapperPipe_Embedding(50257, 764) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 2: 22 | name: MapperPipe_ToTensor() 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 3: 27 | name: MapperPipe_AddToken() 28 | variant: INPROCESS 29 | 4: 30 | name: MapperPipe_AddToken() 31 | variant: INPROCESS 32 | 5: 33 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 34 | variant: INPROCESS 35 | 6: 36 | name: MapperPipe_Truncate() 37 | variant: INPROCESS 38 | 7: 39 | name: MapperPipe_GPT2BPETokenizer() 40 | variant: INPROCESS 41 | 8: 42 | name: LocalLinePipe 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 9: 47 | name: PrefetcherPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 10: 52 | fused_pipes: 53 | - 7 54 | - 6 55 | - 5 56 | - 4 57 | - 3 58 | name: FusedPipe 59 | variant: SMP 60 | variant_ctx: 61 | disable_torch_parallelism: true 62 | max_inflight: 50 63 | max_prefetch: 50 64 | n_procs: 8 65 | use_threads: true 66 | variant_type: SMP 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/coco/configs/cedar_remote_plan.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '8' 4 | 2: '0' 5 | 3: '2' 6 | 4: '3' 7 | 5: '4' 8 | 7: '9' 9 | 8: '' 10 | 9: '5' 11 | n_local_workers: 8 12 | pipes: 13 | 0: 14 | name: MapperPipe_to_tensor 15 | variant: INPROCESS 16 | variant_ctx: 17 | variant_type: INPROCESS 18 | 1: 19 | name: MapperPipe_distort 20 | variant: INPROCESS 21 | 2: 22 | name: MapperPipe_RandomHorizontalFlip(p=1) 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 3: 27 | name: MapperPipe_SanitizeBoundingBox(min_size=1.0, labels_getter=boxes) 28 
| variant: INPROCESS 29 | variant_ctx: 30 | variant_type: INPROCESS 31 | 4: 32 | name: MapperPipe_crop 33 | variant: INPROCESS 34 | variant_ctx: 35 | variant_type: INPROCESS 36 | 5: 37 | name: MapperPipe_zoom_out 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 6: 42 | name: MapperPipe_read_image 43 | variant: INPROCESS 44 | 7: 45 | name: COCOFileSourcePipe 46 | variant: INPROCESS 47 | variant_ctx: 48 | variant_type: INPROCESS 49 | 8: 50 | name: PrefetcherPipe 51 | variant: INPROCESS 52 | variant_ctx: 53 | variant_type: INPROCESS 54 | 9: 55 | fused_pipes: 56 | - 6 57 | - 1 58 | name: FusedPipe 59 | variant: RAY 60 | variant_ctx: 61 | max_inflight: 100 62 | max_prefetch: 100 63 | n_actors: 4 64 | submit_batch_size: 2 65 | use_threads: true 66 | variant_type: RAY 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/commonvoice/download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download the CommonVoice dataset to local filesystem.
def download_if_not_exists(path: pathlib.Path):
    """Fetch the dataset archive into ``path`` unless a file is already there.

    The blob ``SOURCE_BLOB_NAME`` is read from the bucket ``BUCKET_NAME``
    through a ``google.cloud.storage`` client. A line reporting which branch
    was taken is printed either way.
    """
    if path.is_file():
        print("Path already exists: {}".format(str(path)))
        return

    blob = storage.Client().bucket(BUCKET_NAME).blob(SOURCE_BLOB_NAME)
    blob.download_to_filename(str(path))
    print("Downloaded {}".format(str(path)))


def download_dataset() -> None:
    """Download and unpack the archive under ``DATASET_LOC`` if needed.

    The target directory is resolved relative to ``parents[2]`` of this
    file's resolved path. Nothing is downloaded when the ``DATASET_NAME``
    directory already exists next to the archive location.
    """
    logger.info("Downloading Commonvoice Dataset")
    base_dir = pathlib.Path(__file__).resolve().parents[2]
    data_dir = base_dir.joinpath(DATASET_LOC)
    if not data_dir.exists():
        data_dir.mkdir(parents=True, exist_ok=True)

    dataset_file = data_dir / pathlib.Path(DATASET_FILE)
    extract_root = dataset_file.parent

    # Already extracted on a previous run: nothing to do.
    if (extract_root / DATASET_NAME).exists():
        return

    print(f"Downloading dataset to {str(dataset_file)}...")
    download_if_not_exists(dataset_file)

    with tarfile.open(dataset_file, "r") as archive:
        archive.extractall(path=str(extract_root))

    # The tarball is no longer needed once its contents are unpacked.
    dataset_file.unlink()
CV-tf,ray-local,157.9693704 10 | CV-tf,plumber,338.0819766 11 | CV-tf,ember-remote,965.1411681 12 | CV-tf,ray-remote,548.0062504 13 | CV-tf,tfdata-service,947.6581265 14 | CV-tf,fastflow,772.5337355 15 | NLP-torch,torch,1563.607555 16 | NLP-torch,ember-local,2960.273135 17 | NLP-torch,ray-local,1412.855098 18 | NLP-torch,ember-remote,4764.476066 19 | NLP-torch,ray-remote,1465.342987 20 | NLP-hf-tf,tf,634.4627475 21 | NLP-hf-tf,ember-local,1205.012853 22 | NLP-hf-tf,ray-local,1413.387607 23 | NLP-hf-tf,ember-remote,2408.033199 24 | NLP-hf-tf,ray-remote,798.9287172 25 | NLP-tf,tf,6230.335504 26 | NLP-tf,ember-local,5695.571693 27 | NLP-tf,ray-local,1257.46357 28 | NLP-tf,ember-remote,5563.385506 29 | NLP-tf,ray-remote,2053.085996 30 | NLP-tf,tfdata-service,2045.847441 31 | NLP-tf,fastflow,1610.742536 32 | ASR,torch,24.65157059 33 | ASR,tf,17.22063537 34 | ASR,ember-local,105.7026584 35 | ASR,ray-local,105.724439 36 | ASR,ember-remote,498.9273063 37 | ASR,ray-remote,505.7816422 38 | SSD-torch,ray-local,21.36905199 39 | SSD-torch,ember-local,58.36089788 40 | SSD-torch,torch,19.92272835 41 | SSD-torch,ember-remote,92.27214569 42 | SSD-torch,ray-remote,56.42511784 43 | SSD-tf,ember-local,162.1409238 44 | SSD-tf,tf,72.19816589 45 | SSD-tf,ray-local,15.21952602 46 | SSD-tf,plumber,86.74691139 47 | SSD-tf,ember-remote,158.623046 48 | SSD-tf,tfdata-service,32.12837509 49 | SSD-tf,ray-remote,31.81599492 50 | SSD-tf,fastflow,56.7238234 -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Download the WikiText103 dataset to local filesystem. 
def download_if_not_exists(url: str, path: pathlib.Path):
    """Retrieve ``url`` into ``path`` unless a file already exists there.

    A line reporting which branch was taken is printed either way.
    """
    if path.is_file():
        print("Path already exists: {}".format(str(path)))
        return

    urllib.request.urlretrieve(url, str(path))
    print("Downloaded {}".format(str(path)))


def download_dataset() -> None:
    """Download and unzip the WikiText103 archive under ``DATASET_LOC``.

    The target directory is resolved relative to ``parents[2]`` of this
    file's resolved path. When the ``wikitext-103`` directory is already
    present next to the archive location, no download happens.
    """
    logger.info("Downloading Wikitext103 Dataset")
    base_dir = pathlib.Path(__file__).resolve().parents[2]
    data_dir = base_dir.joinpath(DATASET_LOC)
    if not data_dir.exists():
        data_dir.mkdir(parents=True, exist_ok=True)

    dataset_file = data_dir / pathlib.Path(DATASET_FILE)
    extract_root = dataset_file.parent

    print(extract_root)
    # Already extracted on a previous run: nothing to do.
    if (extract_root / "wikitext-103").exists():
        return

    print(f"Downloading dataset to {str(dataset_file)}...")
    # The archive is always fetched here; the helper above is not used.
    urllib.request.urlretrieve(DATASET_SOURCE, str(dataset_file))

    logger.info("Extracting Wikitext103 data from zip file.")
    with zipfile.ZipFile(dataset_file, "r") as archive:
        archive.extractall(path=extract_root)
    logger.info("Done extracting Wikitext103 data from zip file.")

    # The zip is no longer needed once its contents are unpacked.
    dataset_file.unlink()
@register_optimizer_pipe("NoopOptimizerPipe")
class NoopOptimizerPipe(Pipe):
    """
    Pass-through pipe: yields exactly what its single input pipe yields.

    Meant to be inserted by an optimizer rather than declared directly in a
    feature. Primarily intended for testing.
    """

    def __init__(
        self, input_pipe: Optional[Pipe] = None, is_random: bool = False
    ):
        # With no (or a falsy) input pipe, the pipe starts with no inputs.
        upstream = [input_pipe] if input_pipe else []
        super().__init__("NoopOptimizerPipe", upstream, is_random=is_random)

    def _to_inprocess(
        self, variant_ctx: InProcessPipeVariantContext
    ) -> InProcessPipeVariant:
        # The variant context is not consulted; the variant only needs the
        # upstream pipe's variant to iterate over.
        return InProcessNoopOptimizerPipeVariant(
            self.input_pipes[0].pipe_variant
        )

    def _check_mutation(self) -> None:
        super()._check_mutation()

        n_inputs = len(self.input_pipes)
        if n_inputs != 1:
            raise RuntimeError("NoopOptimizerPipe only accepts one input.")


class InProcessNoopOptimizerPipeVariant(InProcessPipeVariant):
    """In-process variant that forwards every sample from its input variant."""

    def __init__(self, input_pipe_variant: Optional[PipeVariant]):
        super().__init__(input_pipe_variant)

    def _iter_impl(self):
        yield from self.input_pipe_variant
variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 3: 31 | name: MapperPipe_AddToken() 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 4: 36 | name: MapperPipe_AddToken() 37 | variant: INPROCESS 38 | variant_ctx: 39 | variant_type: INPROCESS 40 | 5: 41 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 42 | variant: INPROCESS 43 | variant_ctx: 44 | variant_type: INPROCESS 45 | 6: 46 | name: MapperPipe_Truncate() 47 | variant: INPROCESS 48 | variant_ctx: 49 | variant_type: INPROCESS 50 | 7: 51 | name: MapperPipe_GPT2BPETokenizer() 52 | variant: RAY 53 | variant_ctx: 54 | max_inflight: 35040 55 | max_prefetch: 35040 56 | n_actors: 32 57 | submit_batch_size: 365 58 | use_threads: true 59 | variant_type: RAY 60 | 8: 61 | name: LocalLinePipe 62 | variant: INPROCESS 63 | variant_ctx: 64 | variant_type: INPROCESS 65 | 9: 66 | name: PrefetcherPipe 67 | variant: INPROCESS 68 | variant_ctx: 69 | variant_type: INPROCESS 70 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/eval_controller_local.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '10' 4 | 1: '0' 5 | 7: '1' 6 | 9: '11' 7 | 10: '' 8 | 11: '7' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: BatcherPipe(batch_size=1) 13 | variant: INPROCESS 14 | variant_ctx: 15 | variant_type: INPROCESS 16 | 1: 17 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 2: 22 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 23 | variant: INPROCESS 24 | 3: 25 | name: MapperPipe_Grayscale(num_output_channels=1) 26 | variant: INPROCESS 27 | 4: 28 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 29 | 1.1), hue=(-0.1, 0.1)) 30 | variant: INPROCESS 31 | 5: 32 | name: 
MapperPipe_RandomHorizontalFlip(p=0.5) 33 | variant: INPROCESS 34 | 6: 35 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 36 | 1.3333), interpolation=bilinear, antialias=warn) 37 | variant: INPROCESS 38 | 7: 39 | name: MapperPipe_to_float 40 | variant: INPROCESS 41 | variant_ctx: 42 | variant_type: INPROCESS 43 | 8: 44 | name: MapperPipe_read_image_pytorch 45 | variant: INPROCESS 46 | 9: 47 | name: LocalFSListerPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 10: 52 | name: PrefetcherPipe 53 | variant: INPROCESS 54 | variant_ctx: 55 | variant_type: INPROCESS 56 | 11: 57 | fused_pipes: 58 | - 8 59 | - 3 60 | - 6 61 | - 2 62 | - 5 63 | - 4 64 | name: FusedPipe 65 | variant: INPROCESS 66 | -------------------------------------------------------------------------------- /evaluation/pipelines/commonvoice/configs/ablation_p_r_o.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '8' 4 | 1: '3' 5 | 2: '1' 6 | 3: '0' 7 | 4: '2' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '' 12 | n_local_workers: 8 13 | pipes: 14 | 0: 15 | name: MapperPipe_mel 16 | variant: RAY 17 | variant_ctx: 18 | max_inflight: 100 19 | max_prefetch: 100 20 | n_actors: 2 21 | submit_batch_size: 1 22 | use_threads: true 23 | variant_type: RAY 24 | 1: 25 | name: MapperPipe_frequency_mask 26 | variant: INPROCESS 27 | variant_ctx: 28 | variant_type: INPROCESS 29 | 2: 30 | name: MapperPipe_time_mask 31 | variant: INPROCESS 32 | variant_ctx: 33 | variant_type: INPROCESS 34 | 3: 35 | name: MapperPipe__stretch 36 | variant: RAY 37 | variant_ctx: 38 | max_inflight: 100 39 | max_prefetch: 100 40 | n_actors: 2 41 | submit_batch_size: 2 42 | use_threads: true 43 | variant_type: RAY 44 | 4: 45 | name: MapperPipe__spec 46 | variant: INPROCESS 47 | variant_ctx: 48 | variant_type: INPROCESS 49 | 5: 50 | name: MapperPipe__resample 51 | variant: INPROCESS 52 | variant_ctx: 53 | 
variant_type: INPROCESS 54 | 6: 55 | name: MapperPipe__read 56 | variant: RAY 57 | variant_ctx: 58 | max_inflight: 100 59 | max_prefetch: 100 60 | n_actors: 2 61 | submit_batch_size: 4 62 | use_threads: true 63 | variant_type: RAY 64 | 7: 65 | name: LocalFSListerPipe 66 | variant: INPROCESS 67 | variant_ctx: 68 | variant_type: INPROCESS 69 | 8: 70 | name: PrefetcherPipe 71 | variant: INPROCESS 72 | variant_ctx: 73 | variant_type: INPROCESS 74 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/ablation_p.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '7' 12 | 9: '8' 13 | n_local_workers: 8 14 | pipes: 15 | 0: 16 | name: BatcherPipe(batch_size=1) 17 | variant: INPROCESS 18 | variant_ctx: 19 | variant_type: INPROCESS 20 | 1: 21 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 22 | variant: INPROCESS 23 | variant_ctx: 24 | variant_type: INPROCESS 25 | 2: 26 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 27 | variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 3: 31 | name: MapperPipe_Grayscale(num_output_channels=1) 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 4: 36 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 37 | 1.1), hue=(-0.1, 0.1)) 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 5: 42 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 6: 47 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 48 | 1.3333), interpolation=bilinear, antialias=warn) 49 | variant: INPROCESS 50 | variant_ctx: 51 | variant_type: INPROCESS 52 | 7: 53 | name: MapperPipe_to_float 54 | variant: 
INPROCESS 55 | variant_ctx: 56 | variant_type: INPROCESS 57 | 8: 58 | name: ImageReaderPipe 59 | variant: INPROCESS 60 | variant_ctx: 61 | variant_type: INPROCESS 62 | 9: 63 | name: LocalFSListerPipe 64 | variant: INPROCESS 65 | variant_ctx: 66 | variant_type: INPROCESS 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/ablation_p_r.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '' 4 | 1: '0' 5 | 2: '5' 6 | 3: '6' 7 | 4: '7' 8 | 5: '4' 9 | 6: '2' 10 | 7: '1' 11 | 8: '3' 12 | 9: '8' 13 | n_local_workers: 8 14 | pipes: 15 | 0: 16 | name: BatcherPipe(batch_size=1) 17 | variant: INPROCESS 18 | variant_ctx: 19 | variant_type: INPROCESS 20 | 1: 21 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 22 | variant: INPROCESS 23 | variant_ctx: 24 | variant_type: INPROCESS 25 | 2: 26 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 27 | variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 3: 31 | name: MapperPipe_Grayscale(num_output_channels=1) 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 4: 36 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 37 | 1.1), hue=(-0.1, 0.1)) 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 5: 42 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 6: 47 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 48 | 1.3333), interpolation=bilinear, antialias=warn) 49 | variant: INPROCESS 50 | variant_ctx: 51 | variant_type: INPROCESS 52 | 7: 53 | name: MapperPipe_to_float 54 | variant: INPROCESS 55 | variant_ctx: 56 | variant_type: INPROCESS 57 | 8: 58 | name: ImageReaderPipe 59 | variant: INPROCESS 60 | variant_ctx: 61 | variant_type: INPROCESS 62 | 9: 63 
class SMPActor(mp.Process):
    """
    Worker process that serves samples from a request queue.

    Once started, the actor repeatedly takes a sample from ``req_q``,
    transforms it with :meth:`process` and puts the result on ``resp_q``.
    Subclasses provide the transformation by overriding :meth:`process`.

    Args:
        name: Name of the actor; stored as the process name and used in
            log messages.
        disable_torch_parallelism: If True, ``run`` restricts torch to a
            single intra-op and a single inter-op thread before serving.
    """

    def __init__(self, name: str, disable_torch_parallelism: bool = True):
        super().__init__()
        # The queues are attached later via register().
        self.req_q = None
        self.resp_q = None
        self.name = name
        # Set by stop() to make the serving loop in run() exit.
        self.shutdown_event = mp.Event()
        self.disable_torch_parallelism = disable_torch_parallelism

    def register(self, req_q: mp.Queue, resp_q: mp.Queue):
        """
        Attach the queue requests are read from (``req_q``) and the queue
        results are written to (``resp_q``).
        """
        logger.info(f"Registered SMPActor for {self.name}.")
        self.req_q = req_q
        self.resp_q = resp_q

    def run(self):
        """
        Serve requests until the shutdown event is set.

        Raises:
            AssertionError: If ``register`` has not been called.
        """
        # Need to set this to reduce contention in torch threads...
        if self.disable_torch_parallelism:
            torch.set_num_threads(1)
            torch.set_num_interop_threads(1)
        logger.info(f"Running SMPActor for {self.name}.")
        if self.req_q is None or self.resp_q is None:
            logger.error("SMPActor not registered!")
            raise AssertionError("SMPActor not registered.")

        while not self.shutdown_event.is_set():
            try:
                # Bounded wait so the shutdown event is re-checked at least
                # once per second while the request queue is idle.
                sample = self.req_q.get(block=True, timeout=1)
            except queue.Empty:
                continue
            if hasattr(sample, "data"):
                # Wrapper object: transform its payload in place so the
                # wrapper itself is what gets sent back.
                sample.data = self.process(sample.data)
            else:
                sample = self.process(sample)
            self.resp_q.put(sample, block=True)

    @abc.abstractmethod
    def process(self, data: Any) -> Any:
        """
        Transform one payload and return the result.

        ``run`` forwards the returned value to the response queue, so
        overrides must return the processed data.

        NOTE: this class has no ABC metaclass, so ``abstractmethod`` is not
        enforced at instantiation; this default implementation returns None.
        """

    def stop(self) -> None:
        """
        Gracefully shuts down this process
        """
        logger.info(f"Stopping SMPActor for {self.name}.")
        self.shutdown_event.set()
INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 5: 42 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 6: 47 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 48 | 1.3333), interpolation=bilinear, antialias=warn) 49 | variant: INPROCESS 50 | variant_ctx: 51 | variant_type: INPROCESS 52 | 7: 53 | name: MapperPipe_to_float 54 | variant: INPROCESS 55 | variant_ctx: 56 | variant_type: INPROCESS 57 | 8: 58 | name: ImageReaderPipe 59 | variant: INPROCESS 60 | variant_ctx: 61 | variant_type: INPROCESS 62 | 9: 63 | name: LocalFSListerPipe 64 | variant: INPROCESS 65 | variant_ctx: 66 | variant_type: INPROCESS 67 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/cache_results/configs/no_cache_plan.yml: -------------------------------------------------------------------------------- 1 | graph: 2 | 0: '10' 3 | 1: '0' 4 | 3: '6' 5 | 6: '11' 6 | 7: '1' 7 | 8: '3' 8 | 9: '8' 9 | 10: '' 10 | 11: '7' 11 | n_local_workers: 8 12 | pipes: 13 | 0: 14 | name: BatcherPipe(batch_size=8) 15 | variant: INPROCESS 16 | variant_ctx: 17 | variant_type: INPROCESS 18 | 1: 19 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 20 | variant: INPROCESS 21 | variant_ctx: 22 | variant_type: INPROCESS 23 | 2: 24 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 25 | 3: 26 | name: MapperPipe_Grayscale(num_output_channels=1) 27 | variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 4: 31 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 32 | 1.1), hue=(-0.1, 0.1)) 33 | 5: 34 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 35 | 6: 36 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 37 | 1.3333), interpolation=bilinear, antialias=warn) 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: 
INPROCESS 41 | 7: 42 | name: MapperPipe_to_float 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 8: 47 | name: ImageReaderPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 9: 52 | name: LocalFSListerPipe 53 | variant: INPROCESS 54 | variant_ctx: 55 | variant_type: INPROCESS 56 | 10: 57 | name: PrefetcherPipe 58 | variant: INPROCESS 59 | variant_ctx: 60 | variant_type: INPROCESS 61 | 11: 62 | fused_pipes: 63 | - 2 64 | - 5 65 | - 4 66 | name: FusedPipe 67 | variant: RAY 68 | variant_ctx: 69 | max_inflight: 100 70 | max_prefetch: 100 71 | n_actors: 4 72 | submit_batch_size: 16 73 | use_threads: true 74 | variant_type: RAY 75 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/cache_results/configs/wikitext_cache_after_tokenizer_one_offload.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '10' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '9' 11 | 8: '7' 12 | 9: '6' 13 | 10: '' 14 | n_local_workers: 1 15 | pipes: 16 | 0: 17 | name: BatcherPipe(batch_size=1) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 1: 22 | name: MapperPipe_Embedding(50257, 764) 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 2: 27 | name: MapperPipe_ToTensor() 28 | variant: INPROCESS 29 | variant_ctx: 30 | variant_type: INPROCESS 31 | 3: 32 | name: MapperPipe_AddToken() 33 | variant: INPROCESS 34 | variant_ctx: 35 | variant_type: INPROCESS 36 | 4: 37 | name: MapperPipe_AddToken() 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 5: 42 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 6: 47 | name: MapperPipe_Truncate() 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 7: 52 | name: 
class MultithreadedService:
    """
    A multithread service that executes preprocessing tasks using
    a thread pool.

    Compared to the MultiprocessService, using MultithreadedService
    is lighter weight, as tasks are executed in the same process.
    However, threads are subject to the GIL, so CPU-bound workloads
    may be better executed in the MultiprocessService

    Args:
        num_threads: Number of threads in the pool

    Raises:
        ValueError: If ``num_threads`` is smaller than 1.
    """

    def __init__(self, num_threads: int):
        if num_threads < 1:
            raise ValueError(
                "Cannot create a mutlithreaded "
                "service with {} threads.".format(num_threads)
            )

        # Lock for resizing executor
        self._lock = threading.Lock()

        self.executor = ThreadPoolExecutor(max_workers=num_threads)
        self.n_threads = num_threads
        logger.info(f"Started MUltithread Service with {num_threads} threads.")

    def shutdown(self) -> None:
        """
        Shut down the current executor.

        Does nothing when no executor exists, which happens when
        ``__del__`` runs on an instance whose ``__init__`` raised.
        """
        executor = getattr(self, "executor", None)
        if executor is not None:
            executor.shutdown()

    def resize(self, num_threads: int) -> None:
        """
        Replace the thread pool with one of ``num_threads`` workers.

        Already-submitted tasks are allowed to finish before the old pool
        is discarded.

        Raises:
            ValueError: If ``num_threads`` is smaller than 1. The existing
                pool is left untouched in that case.
        """
        # Validate before touching the live executor: ThreadPoolExecutor
        # rejects max_workers < 1 with ValueError, and failing only after
        # the old pool was shut down would leave the service unusable.
        if num_threads < 1:
            raise ValueError(
                "Cannot resize a multithreaded "
                "service to {} threads.".format(num_threads)
            )

        with self._lock:
            prev_num_threads = self.n_threads
            self.executor.shutdown(wait=True, cancel_futures=False)
            self.executor = ThreadPoolExecutor(max_workers=num_threads)
            self.n_threads = num_threads
        logger.info(
            f"Resized Multithreaded Pool from {prev_num_threads}"
            f" to {num_threads} threads"
        )

    def submit(self, task: "MultithreadedTask") -> Future:
        """
        Schedule ``task.process`` on the pool and return its future.
        """
        # Hold the lock so the executor cannot be swapped by resize()
        # between reading it and submitting to it.
        with self._lock:
            future = self.executor.submit(task.process)
        return future

    def __del__(self):
        self.shutdown()
sigma=(0.1, 2.0)) 23 | variant: INPROCESS 24 | 3: 25 | name: MapperPipe_Grayscale(num_output_channels=1) 26 | variant: INPROCESS 27 | 4: 28 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 29 | 1.1), hue=(-0.1, 0.1)) 30 | variant: INPROCESS 31 | 5: 32 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 33 | variant: INPROCESS 34 | 6: 35 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 36 | 1.3333), interpolation=bilinear, antialias=warn) 37 | variant: INPROCESS 38 | 7: 39 | name: MapperPipe_to_float 40 | variant: INPROCESS 41 | variant_ctx: 42 | variant_type: INPROCESS 43 | 8: 44 | name: MapperPipe_read_image_pytorch 45 | variant: INPROCESS 46 | 9: 47 | name: LocalFSListerPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 10: 52 | name: PrefetcherPipe 53 | variant: INPROCESS 54 | variant_ctx: 55 | variant_type: INPROCESS 56 | 11: 57 | fused_pipes: 58 | - 8 59 | - 3 60 | - 6 61 | - 2 62 | - 5 63 | - 4 64 | name: FusedPipe 65 | variant: RAY 66 | variant_ctx: 67 | max_inflight: 100 68 | max_prefetch: 100 69 | n_actors: 4 70 | submit_batch_size: 33 71 | use_threads: true 72 | variant_type: RAY 73 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/eval_controller_remote.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '10' 4 | 1: '0' 5 | 7: '1' 6 | 9: '11' 7 | 10: '' 8 | 11: '7' 9 | n_local_workers: 1 10 | pipes: 11 | 0: 12 | name: BatcherPipe(batch_size=1) 13 | variant: INPROCESS 14 | variant_ctx: 15 | variant_type: INPROCESS 16 | 1: 17 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 2: 22 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 23 | variant: INPROCESS 24 | 3: 25 | name: 
MapperPipe_Grayscale(num_output_channels=1) 26 | variant: INPROCESS 27 | 4: 28 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 29 | 1.1), hue=(-0.1, 0.1)) 30 | variant: INPROCESS 31 | 5: 32 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 33 | variant: INPROCESS 34 | 6: 35 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 36 | 1.3333), interpolation=bilinear, antialias=warn) 37 | variant: INPROCESS 38 | 7: 39 | name: MapperPipe_to_float 40 | variant: INPROCESS 41 | variant_ctx: 42 | variant_type: INPROCESS 43 | 8: 44 | name: MapperPipe_read_image_pytorch 45 | variant: INPROCESS 46 | 9: 47 | name: LocalFSListerPipe 48 | variant: INPROCESS 49 | variant_ctx: 50 | variant_type: INPROCESS 51 | 10: 52 | name: PrefetcherPipe 53 | variant: INPROCESS 54 | variant_ctx: 55 | variant_type: INPROCESS 56 | 11: 57 | fused_pipes: 58 | - 8 59 | - 3 60 | - 6 61 | - 2 62 | - 5 63 | - 4 64 | name: FusedPipe 65 | variant: RAY 66 | variant_ctx: 67 | max_inflight: 1000 68 | max_prefetch: 1000 69 | n_actors: 1 70 | submit_batch_size: 33 71 | use_threads: true 72 | variant_type: RAY 73 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/eval_ember_local.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '10' 4 | 1: '0' 5 | 2: '5' 6 | 3: '6' 7 | 4: '7' 8 | 5: '4' 9 | 6: '2' 10 | 7: '1' 11 | 8: '3' 12 | 9: '8' 13 | 10: '' 14 | n_local_workers: 8 15 | pipes: 16 | 0: 17 | name: BatcherPipe(batch_size=1) 18 | variant: INPROCESS 19 | variant_ctx: 20 | variant_type: INPROCESS 21 | 1: 22 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 23 | variant: INPROCESS 24 | variant_ctx: 25 | variant_type: INPROCESS 26 | 2: 27 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 28 | variant: INPROCESS 29 | variant_ctx: 30 | variant_type: INPROCESS 31 | 3: 32 | 
name: MapperPipe_Grayscale(num_output_channels=1) 33 | variant: INPROCESS 34 | variant_ctx: 35 | variant_type: INPROCESS 36 | 4: 37 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 38 | 1.1), hue=(-0.1, 0.1)) 39 | variant: INPROCESS 40 | variant_ctx: 41 | variant_type: INPROCESS 42 | 5: 43 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 44 | variant: INPROCESS 45 | variant_ctx: 46 | variant_type: INPROCESS 47 | 6: 48 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 49 | 1.3333), interpolation=bilinear, antialias=warn) 50 | variant: INPROCESS 51 | variant_ctx: 52 | variant_type: INPROCESS 53 | 7: 54 | name: MapperPipe_to_float 55 | variant: INPROCESS 56 | variant_ctx: 57 | variant_type: INPROCESS 58 | 8: 59 | name: ImageReaderPipe 60 | variant: INPROCESS 61 | variant_ctx: 62 | variant_type: INPROCESS 63 | 9: 64 | name: LocalFSListerPipe 65 | variant: INPROCESS 66 | variant_ctx: 67 | variant_type: INPROCESS 68 | 10: 69 | name: PrefetcherPipe 70 | variant: INPROCESS 71 | variant_ctx: 72 | variant_type: INPROCESS 73 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/cache_results/configs/cache_after_grayscale.yml: -------------------------------------------------------------------------------- 1 | graph: 2 | 0: '11' 3 | 1: '0' 4 | 3: '10' 5 | 6: '12' 6 | 7: '1' 7 | 8: '3' 8 | 9: '8' 9 | 10: '6' 10 | 11: '' 11 | 12: '7' 12 | n_local_workers: 8 13 | pipes: 14 | 0: 15 | name: BatcherPipe(batch_size=8) 16 | variant: INPROCESS 17 | variant_ctx: 18 | variant_type: INPROCESS 19 | 1: 20 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 21 | variant: INPROCESS 22 | variant_ctx: 23 | variant_type: INPROCESS 24 | 2: 25 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 26 | 3: 27 | name: MapperPipe_Grayscale(num_output_channels=1) 28 | variant: INPROCESS 29 | variant_ctx: 30 | variant_type: INPROCESS 31 | 4: 
32 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 33 | 1.1), hue=(-0.1, 0.1)) 34 | 5: 35 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 36 | 6: 37 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 38 | 1.3333), interpolation=bilinear, antialias=warn) 39 | variant: INPROCESS 40 | variant_ctx: 41 | variant_type: INPROCESS 42 | 7: 43 | name: MapperPipe_to_float 44 | variant: INPROCESS 45 | variant_ctx: 46 | variant_type: INPROCESS 47 | 8: 48 | name: ImageReaderPipe 49 | variant: INPROCESS 50 | variant_ctx: 51 | variant_type: INPROCESS 52 | 9: 53 | name: LocalFSListerPipe 54 | variant: INPROCESS 55 | variant_ctx: 56 | variant_type: INPROCESS 57 | 10: 58 | name: ObjectDiskCachePipe 59 | variant: INPROCESS 60 | variant_ctx: 61 | variant_type: INPROCESS 62 | 11: 63 | name: PrefetcherPipe 64 | variant: INPROCESS 65 | variant_ctx: 66 | variant_type: INPROCESS 67 | 12: 68 | fused_pipes: 69 | - 2 70 | - 5 71 | - 4 72 | name: FusedPipe 73 | variant: RAY 74 | variant_ctx: 75 | max_inflight: 100 76 | max_prefetch: 100 77 | n_actors: 4 78 | submit_batch_size: 16 79 | use_threads: true 80 | variant_type: RAY 81 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/cache_results/configs/new_simclrv2_optimized_plan.yml: -------------------------------------------------------------------------------- 1 | graph: 2 | 0: '11' 3 | 1: '0' 4 | 3: '10' 5 | 6: '12' 6 | 7: '1' 7 | 8: '3' 8 | 9: '8' 9 | 10: '6' 10 | 11: '' 11 | 12: '7' 12 | n_local_workers: 8 13 | pipes: 14 | 0: 15 | name: BatcherPipe(batch_size=8) 16 | variant: INPROCESS 17 | variant_ctx: 18 | variant_type: INPROCESS 19 | 1: 20 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 21 | variant: INPROCESS 22 | variant_ctx: 23 | variant_type: INPROCESS 24 | 2: 25 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 26 | 3: 27 | name: 
MapperPipe_Grayscale(num_output_channels=1) 28 | variant: INPROCESS 29 | variant_ctx: 30 | variant_type: INPROCESS 31 | 4: 32 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 33 | 1.1), hue=(-0.1, 0.1)) 34 | 5: 35 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 36 | 6: 37 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 38 | 1.3333), interpolation=bilinear, antialias=warn) 39 | variant: INPROCESS 40 | variant_ctx: 41 | variant_type: INPROCESS 42 | 7: 43 | name: MapperPipe_to_float 44 | variant: INPROCESS 45 | variant_ctx: 46 | variant_type: INPROCESS 47 | 8: 48 | name: ImageReaderPipe 49 | variant: INPROCESS 50 | variant_ctx: 51 | variant_type: INPROCESS 52 | 9: 53 | name: LocalFSListerPipe 54 | variant: INPROCESS 55 | variant_ctx: 56 | variant_type: INPROCESS 57 | 10: 58 | name: ObjectDiskCachePipe 59 | variant: INPROCESS 60 | variant_ctx: 61 | variant_type: INPROCESS 62 | 11: 63 | name: PrefetcherPipe 64 | variant: INPROCESS 65 | variant_ctx: 66 | variant_type: INPROCESS 67 | 12: 68 | fused_pipes: 69 | - 2 70 | - 5 71 | - 4 72 | name: FusedPipe 73 | variant: RAY 74 | variant_ctx: 75 | max_inflight: 192 76 | max_prefetch: 192 77 | n_actors: 4 78 | submit_batch_size: 16 79 | use_threads: true 80 | variant_type: RAY 81 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/eval_ember_remote_tf.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '11' 4 | 10: '12' 5 | 11: '' 6 | 12: '13' 7 | 13: '0' 8 | n_local_workers: 8 9 | pipes: 10 | 0: 11 | name: BatcherPipe(batch_size=1) 12 | variant: INPROCESS 13 | variant_ctx: 14 | variant_type: INPROCESS 15 | 1: 16 | name: MapperPipe_per_image_standardization 17 | variant: TF 18 | variant_ctx: 19 | num_parallel_calls: null 20 | variant_type: TF 21 | 2: 22 | name: MapperPipe_gaussian_blur 23 | variant: INPROCESS 
class CedarEvalSpec:
    """
    Plain container for the options of a cedar evaluation run.

    Every constructor argument is stored unchanged as an attribute of the
    same name; no validation or conversion is performed.
    """

    def __init__(
        self,
        batch_size: int,
        num_total_samples: Optional[int],
        num_epochs: int,
        config: Optional[Union[str, Dict[str, str]]] = None,
        kwargs: Optional[Dict[str, str]] = None,
        use_ray: bool = False,
        ray_ip: str = "",
        iteration_time: Optional[float] = None,
        profiled_stats: str = "",
        run_profiling: bool = False,
        disable_optimizer: bool = False,
        disable_controller: bool = False,
        disable_prefetch: bool = False,
        disable_offload: bool = False,
        disable_parallelism: bool = False,
        disable_reorder: bool = False,
        disable_fusion: bool = False,
        disable_caching: bool = False,
        generate_plan: bool = False,
    ):
        # Workload size
        self.batch_size = batch_size
        self.num_total_samples = num_total_samples
        self.num_epochs = num_epochs

        # Configuration inputs
        self.config = config
        self.kwargs = kwargs

        # Ray settings, consumed by to_ray_config()
        self.use_ray = use_ray
        self.ray_ip = ray_ip

        # Profiling inputs
        self.iteration_time = iteration_time
        self.profiled_stats = profiled_stats
        self.run_profiling = run_profiling

        # Feature switches
        self.disable_optimizer = disable_optimizer
        self.disable_controller = disable_controller
        self.disable_prefetch = disable_prefetch
        self.disable_offload = disable_offload
        self.disable_parallelism = disable_parallelism
        self.disable_reorder = disable_reorder
        self.disable_fusion = disable_fusion
        self.disable_caching = disable_caching
        self.generate_plan = generate_plan

    def to_ray_config(self) -> Optional["RayConfig"]:
        """
        Returns a Ray config for the CedarContext built from ``ray_ip``,
        or None if this eval spec does not enable Ray (``use_ray`` is
        false).
        """
        if not self.use_ray:
            return None

        return RayConfig(self.ray_ip)
variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 3: 31 | name: MapperPipe_AddToken() 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 4: 36 | name: MapperPipe_AddToken() 37 | variant: INPROCESS 38 | variant_ctx: 39 | variant_type: INPROCESS 40 | 5: 41 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 42 | variant: INPROCESS 43 | variant_ctx: 44 | variant_type: INPROCESS 45 | 6: 46 | name: MapperPipe_Truncate() 47 | variant: INPROCESS 48 | variant_ctx: 49 | variant_type: INPROCESS 50 | 7: 51 | name: MapperPipe_GPT2BPETokenizer() 52 | variant: INPROCESS 53 | variant_ctx: 54 | variant_type: INPROCESS 55 | 8: 56 | name: LocalLinePipe 57 | variant: INPROCESS 58 | variant_ctx: 59 | variant_type: INPROCESS 60 | 9: 61 | name: ObjectDiskCachePipe 62 | variant: INPROCESS 63 | variant_ctx: 64 | variant_type: INPROCESS 65 | 10: 66 | name: PrefetcherPipe 67 | variant: INPROCESS 68 | variant_ctx: 69 | variant_type: INPROCESS 70 | 11: 71 | fused_pipes: 72 | - 7 73 | - 6 74 | name: FusedPipe 75 | variant: RAY 76 | variant_ctx: 77 | max_inflight: 10000 78 | max_prefetch: 10000 79 | n_actors: 16 80 | submit_batch_size: 500 81 | use_threads: true 82 | variant_type: RAY 83 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/configs/ablation_tf_baseline.yaml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '11' 4 | 1: '0' 5 | 2: '1' 6 | 3: '2' 7 | 4: '3' 8 | 5: '4' 9 | 6: '5' 10 | 7: '6' 11 | 8: '7' 12 | 9: '8' 13 | 10: '9' 14 | 11: '' 15 | n_local_workers: 1 16 | pipes: 17 | 0: 18 | name: BatcherPipe(batch_size=1) 19 | variant: INPROCESS 20 | variant_ctx: 21 | variant_type: INPROCESS 22 | 1: 23 | name: MapperPipe_per_image_standardization 24 | variant: TF 25 | variant_ctx: 26 | num_parallel_calls: null 27 | variant_type: TF 28 | 2: 29 | name: MapperPipe_gaussian_blur 30 | variant: TF 31 | 
def to_float(x):
    """Cast ``x`` to ``torch.float32``."""
    return x.to(torch.float32)


def _read_rgb(path):
    """
    Read the image at ``path`` in RGB mode.

    Kept as a module-level function instead of a lambda so that the
    datapipe stays picklable when DataLoader worker processes are used.
    """
    return read_image(path, mode=ImageReadMode.RGB)


def build_datapipe(root, spec: TorchEvalSpec):
    """
    Build the image preprocessing datapipe over the files under ``root``.

    Files are listed recursively, sharded, read as RGB images and passed
    through the mapping stages below in order.

    Args:
        root: Directory that is listed recursively for input files.
        spec: Evaluation spec; currently not used by this function.

    Returns:
        The datapipe with all mapping stages applied.
    """
    datapipe = dp.iter.FileLister(root=root, recursive=True)
    # TODO: Evaluate where is a fair place to put this...
    datapipe = datapipe.sharding_filter()

    # Applied one after another, in this order.
    stages = (
        _read_rgb,
        to_float,
        transforms.RandomResizedCrop((IMG_HEIGHT, IMG_WIDTH)),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(0.1, 0.1, 0.1, 0.1),
        transforms.Grayscale(num_output_channels=1),
        transforms.GaussianBlur(GAUSSIAN_BLUR_KERNEL_SIZE),
        transforms.Normalize((0.1307,), (0.3081,)),
    )
    for stage in stages:
        datapipe = dp.iter.Mapper(datapipe, stage)
    return datapipe


def get_dataset(spec: TorchEvalSpec):
    """
    Create a DataLoader over the training images.

    The dataset directory is resolved relative to this file:
    ``DATASET_LOC`` under the directory two levels above this file's
    directory, then ``imagenette2/train`` inside it.

    Args:
        spec: Supplies ``batch_size`` and ``num_workers`` for the loader.

    Returns:
        A ``DataLoader`` wrapping the preprocessing datapipe.
    """
    data_dir = (
        pathlib.Path(__file__).resolve().parents[2].joinpath(DATASET_LOC)
    )
    train_filepath = data_dir / "imagenette2" / "train"

    datapipe = build_datapipe(str(train_filepath), spec)

    dataloader = DataLoader(
        datapipe, batch_size=spec.batch_size, num_workers=spec.num_workers
    )

    return dataloader
MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 22 | variant: INPROCESS 23 | variant_ctx: 24 | variant_type: INPROCESS 25 | 2: 26 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 27 | variant: RAY 28 | variant_ctx: 29 | max_inflight: 100 30 | max_prefetch: 100 31 | n_actors: 2 32 | submit_batch_size: 16 33 | use_threads: true 34 | variant_type: RAY 35 | 3: 36 | name: MapperPipe_Grayscale(num_output_channels=1) 37 | variant: INPROCESS 38 | variant_ctx: 39 | variant_type: INPROCESS 40 | 4: 41 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 42 | 1.1), hue=(-0.1, 0.1)) 43 | variant: RAY 44 | variant_ctx: 45 | max_inflight: 100 46 | max_prefetch: 100 47 | n_actors: 2 48 | submit_batch_size: 16 49 | use_threads: true 50 | variant_type: RAY 51 | 5: 52 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 53 | variant: INPROCESS 54 | variant_ctx: 55 | variant_type: INPROCESS 56 | 6: 57 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 58 | 1.3333), interpolation=bilinear, antialias=warn) 59 | variant: INPROCESS 60 | variant_ctx: 61 | variant_type: INPROCESS 62 | 7: 63 | name: MapperPipe_to_float 64 | variant: INPROCESS 65 | variant_ctx: 66 | variant_type: INPROCESS 67 | 8: 68 | name: ImageReaderPipe 69 | variant: INPROCESS 70 | variant_ctx: 71 | variant_type: INPROCESS 72 | 9: 73 | name: LocalFSListerPipe 74 | variant: INPROCESS 75 | variant_ctx: 76 | variant_type: INPROCESS 77 | -------------------------------------------------------------------------------- /evaluation/run_cedar_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note, we provide the exact stats and config files produced by the optimizer on our setup in order to enable reproducibility. 
4 | # To generate new profiling stats and re-run the optimizer, use the --run_profiling and --generate_plan flags in eval_cedar.py 5 | # Replace the stats and optimizer-produced config in the following commands 6 | 7 | # cv-torch 8 | python eval_cedar.py --dataset_file pipelines/simclrv2/cedar_dataset.py --profiled_stats pipelines/simclrv2/stats/cedar_stats.yaml --master_feature_config pipelines/simclrv2/configs/eval_cedar_local.yaml 9 | 10 | # cv-tf 11 | python eval_cedar.py --dataset_file pipelines/simclrv2/cedar_tf_dataset.py --profiled_stats pipelines/simclrv2/stats/cedar_tf_stats.yaml --master_feature_config pipelines/simclrv2/configs/eval_cedar_local_tf.yaml 12 | 13 | # nlp-torch 14 | python eval_cedar.py --dataset_file pipelines/wikitext103/cedar_dataset.py --profiled_stats pipelines/wikitext103/stats/cedar.yaml --master_feature_config pipelines/wikitext103/configs/eval_local.yaml --num_total_samples 100000 15 | 16 | # nlp-hf-tf 17 | python eval_cedar.py --dataset_file pipelines/wikitext103/cedar_tf_dataset.py --profiled_stats pipelines/wikitext103/stats/tf.yaml --master_feature_config pipelines/wikitext103/configs/eval_local_tf.yaml --num_total_samples 100000 18 | 19 | # nlp-tf 20 | python eval_cedar.py --dataset_file pipelines/wikitext103/cedar_tf_service_dataset.py --profiled_stats pipelines/wikitext103/stats/tf_service.yaml --master_feature_config pipelines/wikitext103/configs/eval_local_tf_service.yaml --num_total_samples 200000 21 | 22 | # asr 23 | python eval_cedar.py --dataset_file pipelines/commonvoice/cedar_dataset.py --profiled_stats pipelines/commonvoice/stats/cedar.yaml --master_feature_config pipelines/commonvoice/configs/eval_local.yaml --num_total_samples 10000 24 | 25 | # ssd 26 | python eval_cedar.py --dataset_file pipelines/coco/cedar_dataset.py --profiled_stats pipelines/coco/stats/coco_local_stats.yaml --master_feature_config pipelines/coco/configs/cedar_local_plan.yml 27 | 28 | python eval_cedar.py --dataset_file 
pipelines/coco/cedar_tf_dataset.py --profiled_stats pipelines/coco/stats/coco_tf_local_stats.yaml --master_feature_config pipelines/coco/configs/cedar_tf_local_plan.yml -------------------------------------------------------------------------------- /cedar/pipes/__init__.py: -------------------------------------------------------------------------------- 1 | from cedar.pipes.batch import BatcherPipe 2 | from cedar.pipes.common import ( 3 | DataSample, 4 | Partition, 5 | MutationError, 6 | CedarPipeSpec, 7 | cedar_pipe, 8 | ) 9 | from cedar.pipes.io import ( 10 | FileOpenerPipe, 11 | LineReaderPipe, 12 | ImageReaderPipe, 13 | WebReaderPipe, 14 | ) 15 | from cedar.pipes.map import MapperPipe 16 | from cedar.pipes.noop import NoopPipe 17 | from cedar.pipes.context import ( 18 | PipeVariantType, 19 | PipeVariantContext, 20 | InProcessPipeVariantContext, 21 | MultiprocessPipeVariantContext, 22 | MultithreadedPipeVariantContext, 23 | RayPipeVariantContext, 24 | SMPPipeVariantContext, 25 | PipeVariantContextFactory, 26 | TFPipeVariantContext, 27 | TFRayPipeVariantContext, 28 | RayDSPipeVariantContext, 29 | ) 30 | from cedar.pipes.pipe import ( 31 | Pipe, 32 | ) 33 | from cedar.pipes.variant import ( 34 | PipeVariant, 35 | InProcessPipeVariant, 36 | MultiprocessPipeVariant, 37 | MultithreadedPipeVariant, 38 | SMPPipeVariant, 39 | TFPipeVariant, 40 | RayDSPipeVariant, 41 | ) 42 | from cedar.pipes.ray_variant import RayPipeVariant 43 | from cedar.pipes.tf import TFTensorDontCare, TFOutputHint 44 | 45 | __all__ = [ 46 | "BatcherPipe", 47 | "CedarPipeSpec", 48 | "DataSample", 49 | "FileOpenerPipe", 50 | "ImageReaderPipe", 51 | "InProcessPipeVariant", 52 | "InProcessPipeVariantContext", 53 | "LineReaderPipe", 54 | "MapperPipe", 55 | "MultiprocessPipeVariant", 56 | "MultiprocessPipeVariantContext", 57 | "MultithreadedPipeVariant", 58 | "MultithreadedPipeVariantContext", 59 | "MutationError", 60 | "NoopPipe", 61 | "Partition", 62 | "Pipe", 63 | "PipeVariant", 64 | 
"PipeVariantContext", 65 | "PipeVariantContextFactory", 66 | "PipeVariantType", 67 | "RayDSPipeVariant", 68 | "RayDSPipeVariantContext", 69 | "RayPipeVariant", 70 | "RayPipeVariantContext", 71 | "SMPPipeVariant", 72 | "SMPPipeVariantContext", 73 | "TFOutputHint", 74 | "TFPipeVariant", 75 | "TFPipeVariantContext", 76 | "TFRayPipeVariantContext", 77 | "TFTensorDontCare", 78 | "WebReaderPipe", 79 | "cedar_pipe", 80 | ] 81 | 82 | assert __all__ == sorted(__all__) 83 | -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/cache_results/configs/wikitext_cache_after_tokenizer_two_offloads.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '10' 4 | 1: '0' 5 | 2: '1' 6 | 7: '9' 7 | 8: '7' 8 | 9: '11' 9 | 10: '' 10 | 11: '2' 11 | n_local_workers: 1 12 | pipes: 13 | 0: 14 | name: BatcherPipe(batch_size=1) 15 | variant: INPROCESS 16 | variant_ctx: 17 | variant_type: INPROCESS 18 | 1: 19 | name: MapperPipe_Embedding(50257, 764) 20 | variant: INPROCESS 21 | variant_ctx: 22 | variant_type: INPROCESS 23 | 2: 24 | name: MapperPipe_ToTensor() 25 | variant: INPROCESS 26 | variant_ctx: 27 | variant_type: INPROCESS 28 | 3: 29 | name: MapperPipe_AddToken() 30 | variant: INPROCESS 31 | variant_ctx: 32 | variant_type: INPROCESS 33 | 4: 34 | name: MapperPipe_AddToken() 35 | variant: INPROCESS 36 | variant_ctx: 37 | variant_type: INPROCESS 38 | 5: 39 | name: "MapperPipe_VocabTransform(\n (vocab): Vocab()\n)" 40 | variant: INPROCESS 41 | variant_ctx: 42 | variant_type: INPROCESS 43 | 6: 44 | name: MapperPipe_Truncate() 45 | variant: INPROCESS 46 | variant_ctx: 47 | variant_type: INPROCESS 48 | 7: 49 | name: MapperPipe_GPT2BPETokenizer() 50 | variant: RAY 51 | variant_ctx: 52 | max_inflight: 10000 53 | max_prefetch: 10000 54 | n_actors: 16 55 | submit_batch_size: 500 56 | use_threads: true 57 | variant_type: RAY 58 | 8: 59 | name: LocalLinePipe 60 | variant: 
def build_dataset(path, spec):
    """Assemble the wikitext tf.data pipeline for the text file at ``path``.

    Stages, in order: read lines, tokenize, truncate, embedding lookup,
    prefetch. Every ``map`` stage uses ``spec.num_parallel_calls``. When
    ``spec.service_addr`` is set, the finished pipeline is additionally
    distributed through tf.data.service in ``distributed_epoch`` mode.

    Args:
        path: Path of the text file to read line by line.
        spec: Object exposing ``num_parallel_calls`` and ``service_addr``.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    parallelism = spec.num_parallel_calls

    lines = tf.data.TextLineDataset(path)
    # The tokenizer is wrapped in a lambda, exactly as the map stage expects.
    tokens = lines.map(
        lambda line: _tokenize(line), num_parallel_calls=parallelism
    )
    truncated = tokens.map(_truncate, num_parallel_calls=parallelism)
    embedded = truncated.map(_embedding, num_parallel_calls=parallelism)
    pipeline = embedded.prefetch(buffer_size=tf.data.AUTOTUNE)

    service_addr = spec.service_addr
    if not service_addr:
        # Purely local pipeline: nothing more to attach.
        return pipeline

    print("Using tf.data.service with address {}".format(service_addr))
    distribute = tf.data.experimental.service.distribute(
        processing_mode="distributed_epoch", service=service_addr
    )
    return pipeline.apply(distribute)
44 | 6: 45 | name: MapperPipe_crop_and_resize 46 | variant: TF 47 | variant_ctx: 48 | num_parallel_calls: null 49 | variant_type: TF 50 | 7: 51 | name: MapperPipe_convert_to_float 52 | variant: TF 53 | variant_ctx: 54 | num_parallel_calls: null 55 | variant_type: TF 56 | 8: 57 | name: MapperPipe_decode_jpeg 58 | variant: TF 59 | variant_ctx: 60 | num_parallel_calls: null 61 | variant_type: TF 62 | 9: 63 | name: MapperPipe_read_file 64 | variant: TF 65 | variant_ctx: 66 | num_parallel_calls: null 67 | variant_type: TF 68 | 10: 69 | name: LocalFSListerPipe 70 | variant: INPROCESS 71 | variant_ctx: 72 | variant_type: INPROCESS 73 | 11: 74 | name: PrefetcherPipe 75 | variant: INPROCESS 76 | variant_ctx: 77 | variant_type: INPROCESS 78 | 12: 79 | fused_pipes: 80 | - 9 81 | - 8 82 | - 3 83 | - 6 84 | - 2 85 | - 5 86 | - 4 87 | - 7 88 | - 1 89 | name: FusedPipe 90 | variant: TF 91 | variant_ctx: 92 | num_parallel_calls: -1 93 | variant_type: TF 94 | -------------------------------------------------------------------------------- /evaluation/run_cedar_remote.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note, we provide the exact stats and config files produced by the optimizer on our setup in order to enable reproducibility. 
4 | # To generate new profiling stats and re-run the optimizer, use the --run_profiling and --generate_plan flags in eval_cedar.py 5 | # Replace the stats and optimizer-produced config in the following commands 6 | 7 | # cv-torch 8 | python eval_cedar.py --dataset_file pipelines/simclrv2/cedar_remote_dataset.py --profiled_stats pipelines/simclrv2/stats/cedar_remote.yaml --master_feature_config pipelines/simclrv2/configs/eval_cedar_remote.yaml --use_ray --ray_ip 10.138.0.8 9 | # cv-tf 10 | python eval_cedar.py --dataset_file pipelines/simclrv2/cedar_tf_dataset.py --profiled_stats pipelines/simclrv2/stats/cedar_tf_stats.yaml --master_feature_config pipelines/simclrv2/configs/eval_cedar_remote_tf.yaml --use_ray --ray_ip 10.138.0.8 11 | # nlp-torch 12 | python eval_cedar.py --dataset_file pipelines/wikitext103/cedar_dataset.py --profiled_stats pipelines/wikitext103/stats/cedar.yaml --use_ray --ray_ip 10.138.0.8 --num_total_samples 100000 --master_feature_config pipelines/wikitext103/configs/eval_remote.yaml 13 | # nlp-hf-tf 14 | python eval_cedar.py --dataset_file pipelines/wikitext103/cedar_tf_dataset.py --profiled_stats pipelines/wikitext103/stats/tf.yaml --master_feature_config pipelines/wikitext103/configs/eval_remote_tf.yaml --use_ray --ray_ip 10.138.0.8 --num_total_samples 100000 15 | # nlp-tf 16 | python eval_cedar.py --dataset_file pipelines/wikitext103/cedar_tf_service_dataset.py --profiled_stats pipelines/wikitext103/stats/tf_service.yaml --master_feature_config pipelines/wikitext103/configs/eval_remote_tf_service.yaml --use_ray --ray_ip 10.138.0.8 --num_total_samples 200000 17 | # asr 18 | python eval_cedar.py --dataset_file pipelines/commonvoice/cedar_dataset.py --profiled_stats pipelines/commonvoice/stats/cedar.yaml --master_feature_config pipelines/commonvoice/configs/eval_remote.yaml --num_total_samples 10000 19 | 20 | python eval_cedar.py --dataset_file pipelines/coco/cedar_remote_dataset.py --master_feature_config 
pipelines/coco/configs/cedar_remote_plan.yml --use_ray --ray_ip 10.138.0.45 21 | 22 | python eval_cedar.py --dataset_file pipelines/coco/cedar_tf_dataset.py --master_feature_config pipelines/coco/configs/cedar_tf_remote_plan.yml --profiled_stats pipelines/coco/stats/coco_tf_remote_stats.yaml --use_ray --ray_ip 10.138.0.26 -------------------------------------------------------------------------------- /evaluation/pipelines/wikitext103/torch_dataset.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import torch.nn as nn 3 | import torchdata.datapipes as dp 4 | from torch.hub import load_state_dict_from_url 5 | 6 | import torchtext.transforms as T 7 | from evaluation.torch_utils import TorchEvalSpec 8 | from torch.utils.data import DataLoader 9 | 10 | DATASET_LOC = "datasets/wikitext103" 11 | 12 | 13 | def build_datapipe(root, spec: TorchEvalSpec): 14 | encoder_json_path = ( 15 | "https://download.pytorch.org/models/text/gpt2_bpe_encoder.json" 16 | ) 17 | vocab_bpe_path = ( 18 | "https://download.pytorch.org/models/text/gpt2_bpe_vocab.bpe" 19 | ) 20 | tokenizer = T.GPT2BPETokenizer(encoder_json_path, vocab_bpe_path) 21 | vocab_path = "https://download.pytorch.org/models/text/roberta.vocab.pt" 22 | vocab = T.VocabTransform(load_state_dict_from_url(vocab_path)) 23 | add_bos = T.AddToken(token=0, begin=True) 24 | add_eos = T.AddToken(token=2, begin=False) 25 | 26 | embedding = nn.Embedding(50257, 764, _freeze=True) 27 | 28 | datapipe = dp.iter.FileLister(root=root, recursive=True) 29 | datapipe = dp.iter.FileOpener(datapipe) 30 | datapipe = dp.iter.LineReader(datapipe, return_path=False) 31 | datapipe = datapipe.sharding_filter() 32 | 33 | datapipe = dp.iter.Mapper(datapipe, tokenizer) 34 | datapipe = dp.iter.Mapper(datapipe, T.Truncate(max_seq_len=254)) 35 | datapipe = dp.iter.Mapper(datapipe, vocab) 36 | datapipe = dp.iter.Mapper(datapipe, add_bos) 37 | datapipe = dp.iter.Mapper(datapipe, add_eos) 38 | datapipe 
class CedarContext:
    """
    Context holding necessary state for cedar services.

    Args:
        ray_config: Ray settings for this context, or ``None`` when Ray is
            not used.
    """

    def __init__(self, ray_config: Optional[RayConfig] = None):
        self.ray_config = ray_config

    def init_ray(self):
        """
        Initialize the Ray runtime.
        NOTE: If calling this from a child process, ensure that the parent
        process does not call init_ray().

        An already-initialized Ray runtime is reused as-is. Otherwise, a
        non-empty ``ray_config.ip`` connects to ``ray://<ip>:10001``; an
        empty one starts a local instance, limited to ``ray_config.n_cpus``
        CPUs when that value is set.

        Raises:
            RuntimeError: If this context was created without a Ray config.
        """
        if self.ray_config is None:
            raise RuntimeError("Ray config not specified.")

        if ray.is_initialized():
            logger.warning("Ray already initialized. Defaulting to it.")
            return

        # From here on Ray is known to be uninitialized, so no shutdown is
        # needed before calling ray.init().
        if self.ray_config.ip != "":
            logger.info(f"Connecting to ray cluster at {self.ray_config.ip}")
            ray.init(f"ray://{self.ray_config.ip}:10001")
            return

        logger.info("Launching to local ray instance")
        if self.ray_config.n_cpus is not None:
            logger.info(
                "Using {} CPUs for local ray instance".format(
                    self.ray_config.n_cpus
                )
            )
            ray.init(num_cpus=self.ray_config.n_cpus)
        else:
            ray.init()

    @classmethod
    def from_yaml(cls: Type[T], config_file: str) -> T:
        """Build a context from a YAML config file (not implemented yet).

        Raises:
            NotImplementedError: Always.
        """
        # TODO (myzhao)
        raise NotImplementedError

    def __del__(self):
        # NOTE: this shuts down whatever Ray runtime is initialized in the
        # process when a Ray-enabled context is collected, including one
        # that init_ray() only reused rather than started.
        if self.ray_config and ray.is_initialized():
            ray.shutdown()

    def use_ray(self) -> bool:
        """
        Returns if the context should use Ray.
        """
        return self.ray_config is not None
def build_dataset(path, spec):
    """Assemble the wikitext tf.data pipeline for the text file at ``path``.

    Stages, in order: read lines, SentencePiece tokenization, truncate,
    embedding lookup, prefetch. Every ``map`` stage uses
    ``spec.num_parallel_calls``. When ``spec.service_addr`` is set, the
    finished pipeline is additionally distributed through tf.data.service in
    ``distributed_epoch`` mode.

    Args:
        path: Path of the text file to read line by line.
        spec: Object exposing ``num_parallel_calls`` and ``service_addr``.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    parallelism = spec.num_parallel_calls

    lines = tf.data.TextLineDataset(path)
    tokens = lines.map(
        bpe_tokernizer.tokenize, num_parallel_calls=parallelism
    )
    truncated = tokens.map(_truncate, num_parallel_calls=parallelism)
    embedded = truncated.map(_embedding, num_parallel_calls=parallelism)
    pipeline = embedded.prefetch(buffer_size=tf.data.AUTOTUNE)

    service_addr = spec.service_addr
    if not service_addr:
        # Purely local pipeline: nothing more to attach.
        return pipeline

    print("Using tf.data.service with address {}".format(service_addr))
    distribute = tf.data.experimental.service.distribute(
        processing_mode="distributed_epoch", service=service_addr
    )
    return pipeline.apply(distribute)
MapperPipe_RandomHorizontalFlip(p=0.5) 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 6: 47 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 48 | 1.3333), interpolation=bilinear, antialias=warn) 49 | variant: INPROCESS 50 | variant_ctx: 51 | variant_type: INPROCESS 52 | 7: 53 | name: MapperPipe_to_float 54 | variant: INPROCESS 55 | variant_ctx: 56 | variant_type: INPROCESS 57 | 8: 58 | name: ImageReaderPipe 59 | variant: INPROCESS 60 | variant_ctx: 61 | variant_type: INPROCESS 62 | 9: 63 | name: LocalFSListerPipe 64 | variant: INPROCESS 65 | variant_ctx: 66 | variant_type: INPROCESS 67 | 10: 68 | name: ObjectDiskCachePipe 69 | variant: INPROCESS 70 | variant_ctx: 71 | variant_type: INPROCESS 72 | 11: 73 | name: PrefetcherPipe 74 | variant: INPROCESS 75 | variant_ctx: 76 | variant_type: INPROCESS 77 | 12: 78 | fused_pipes: 79 | - 2 80 | - 5 81 | - 4 82 | name: FusedPipe 83 | variant: RAY 84 | variant_ctx: 85 | max_inflight: 100 86 | max_prefetch: 100 87 | n_actors: 4 88 | submit_batch_size: 16 89 | use_threads: true 90 | variant_type: RAY 91 | -------------------------------------------------------------------------------- /evaluation/pipelines/simclrv2/cache_results/configs/cache_after_read.yml: -------------------------------------------------------------------------------- 1 | physical_plan: 2 | graph: 3 | 0: '11' 4 | 1: '0' 5 | 3: '6' 6 | 6: '12' 7 | 7: '1' 8 | 8: '10' 9 | 9: '8' 10 | 10: '3' 11 | 11: '' 12 | 12: '7' 13 | n_local_workers: 8 14 | pipes: 15 | 0: 16 | name: BatcherPipe(batch_size=8) 17 | variant: INPROCESS 18 | variant_ctx: 19 | variant_type: INPROCESS 20 | 1: 21 | name: MapperPipe_Normalize(mean=(0.1307,), std=(0.3081,)) 22 | variant: INPROCESS 23 | variant_ctx: 24 | variant_type: INPROCESS 25 | 2: 26 | name: MapperPipe_GaussianBlur(kernel_size=(11, 11), sigma=(0.1, 2.0)) 27 | variant: INPROCESS 28 | variant_ctx: 29 | variant_type: INPROCESS 30 | 3: 31 | name: 
MapperPipe_Grayscale(num_output_channels=1) 32 | variant: INPROCESS 33 | variant_ctx: 34 | variant_type: INPROCESS 35 | 4: 36 | name: MapperPipe_ColorJitter(brightness=(0.9, 1.1), contrast=(0.9, 1.1), saturation=(0.9, 37 | 1.1), hue=(-0.1, 0.1)) 38 | variant: INPROCESS 39 | variant_ctx: 40 | variant_type: INPROCESS 41 | 5: 42 | name: MapperPipe_RandomHorizontalFlip(p=0.5) 43 | variant: INPROCESS 44 | variant_ctx: 45 | variant_type: INPROCESS 46 | 6: 47 | name: MapperPipe_RandomResizedCrop(size=(244, 244), scale=(0.08, 1.0), ratio=(0.75, 48 | 1.3333), interpolation=bilinear, antialias=warn) 49 | variant: INPROCESS 50 | variant_ctx: 51 | variant_type: INPROCESS 52 | 7: 53 | name: MapperPipe_to_float 54 | variant: INPROCESS 55 | variant_ctx: 56 | variant_type: INPROCESS 57 | 8: 58 | name: ImageReaderPipe 59 | variant: INPROCESS 60 | variant_ctx: 61 | variant_type: INPROCESS 62 | 9: 63 | name: LocalFSListerPipe 64 | variant: INPROCESS 65 | variant_ctx: 66 | variant_type: INPROCESS 67 | 10: 68 | name: ObjectDiskCachePipe 69 | variant: INPROCESS 70 | variant_ctx: 71 | variant_type: INPROCESS 72 | 11: 73 | name: PrefetcherPipe 74 | variant: INPROCESS 75 | variant_ctx: 76 | variant_type: INPROCESS 77 | 12: 78 | fused_pipes: 79 | - 2 80 | - 5 81 | - 4 82 | name: FusedPipe 83 | variant: RAY 84 | variant_ctx: 85 | max_inflight: 100 86 | max_prefetch: 100 87 | n_actors: 4 88 | submit_batch_size: 16 89 | use_threads: true 90 | variant_type: RAY 91 | -------------------------------------------------------------------------------- /cedar/pipes/custom/commonvoice.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import librosa 4 | 5 | SAMPLE_FREQ = 8000 6 | N_FFT = 400 7 | FREQ_MASK_PARAM = 80 8 | TIME_MASK_PARAM = 80 9 | N_MELS = 256 10 | 11 | 12 | def time_mask(x): 13 | if isinstance(x, dict): 14 | x = x["item"].copy() 15 | ray_ds = True 16 | else: 17 | x = x.copy() 18 | ray_ds = False 19 | t = 
def frequency_mask(x):
    """Zero out a randomly placed band of rows (axis 0) of a 2-D array.

    The band height is drawn uniformly from ``[0, FREQ_MASK_PARAM)`` and
    truncated to an int; its start row is drawn uniformly so that the band
    fits inside the array whenever the array is tall enough. The input is
    copied, never modified in place.

    Args:
        x: Either a 2-D array, or a dict whose ``"item"`` entry is that
            array (presumably the Ray Datasets row format - confirm against
            the caller).

    Returns:
        The masked copy, wrapped as ``{"item": array}`` when the input was a
        dict and returned bare otherwise.
    """
    if isinstance(x, dict):
        x = x["item"].copy()
        ray_ds = True
    else:
        x = x.copy()
        ray_ds = False
    f = np.random.uniform(low=0.0, high=FREQ_MASK_PARAM)
    f = int(f)
    v = x.shape[0]
    # Clamp the upper bound like time_mask does: with fewer than f rows,
    # randint(0, v - f) would get an empty range and raise ValueError.
    f0 = random.randint(0, max(0, v - f))
    x[f0 : f0 + f, :] = 0  # noqa: E203
    if ray_ds:
        return {"item": x}
    else:
        return x
class WikiTextModel(ff.FastFlowModel):
    """Pass-through model used only to drive the input pipeline."""

    def __init__(self):
        super().__init__()

    def call(self, inputs):
        """Return ``inputs`` unchanged; the model does no computation."""
        return inputs

    def __deepcopy__(self, memo=None):
        """Return a fresh, independent model instead of copying state.

        ``memo`` is accepted (and ignored) so that ``copy.deepcopy(model)``,
        which calls ``__deepcopy__(memo)``, works; it defaults to ``None`` so
        direct zero-argument calls keep working too.
        """
        return WikiTextModel()
class WikiTextModel(ff.FastFlowModel):
    """Pass-through model used only to drive the input pipeline."""

    def __init__(self):
        super().__init__()

    def call(self, inputs):
        """Return ``inputs`` unchanged; the model does no computation."""
        return inputs

    def __deepcopy__(self, memo=None):
        """Return a fresh, independent model instead of copying state.

        ``memo`` is accepted (and ignored) so that ``copy.deepcopy(model)``,
        which calls ``__deepcopy__(memo)``, works; it defaults to ``None`` so
        direct zero-argument calls keep working too.
        """
        return WikiTextModel()
class SimCLRApp(App):
    """Evaluation app wiring the SimCLR image pipeline to a pass-through model."""

    def __init__(self, args, config):
        super().__init__(args, config)

    def process_path(self, img):
        """Decode JPEG bytes and run the augmentation chain on the image.

        The steps are, in order: random crop box + resize to
        ``IMG_HEIGHT`` x ``IMG_WIDTH``, random horizontal flip, brightness,
        contrast, saturation and hue jitter, grayscale conversion, Gaussian
        blur, and per-image standardization.
        """
        # One random box; it is applied to batch index 0 below.
        crop_boxes = tf.random.uniform(shape=(1, 4))

        decoded = tf.image.decode_jpeg(img, channels=3)
        as_float = tf.image.convert_image_dtype(decoded, tf.float32)
        # crop_and_resize needs a leading batch dimension.
        batched = tf.expand_dims(as_float, axis=0)
        out = tf.image.crop_and_resize(
            batched, crop_boxes, [0], [IMG_HEIGHT, IMG_WIDTH]
        )

        out = tf.image.random_flip_left_right(out)
        out = tf.image.random_brightness(out, max_delta=0.1)
        out = tf.image.random_contrast(out, lower=0.9, upper=1.1)
        out = tf.image.random_saturation(out, lower=0.9, upper=1.1)
        out = tf.image.random_hue(out, max_delta=0.1)

        gray = tf.image.rgb_to_grayscale(out)
        kernel = [GAUSSIAN_BLUR_KERNEL_SIZE, GAUSSIAN_BLUR_KERNEL_SIZE]
        blurred = tfa.image.gaussian_filter2d(gray, filter_shape=kernel)
        return tf.image.per_image_standardization(blurred)

    def dummy_loss(self, y_true, y_pred):
        """Return a constant zero loss, ignoring both arguments."""
        return tf.constant(0.0)

    def create_model(self):
        """Build a ``SimCLRModel`` compiled with Adam and the dummy loss."""
        net = SimCLRModel()
        net.compile(optimizer="adam", loss=self.dummy_loss)
        return net

    def create_dataset(self, num_parallel):
        """Build the dataset: list files, read, augment, batch(1), prefetch.

        Each element is an ``(image, 0.0)`` pair; the second item is a
        constant placeholder label.
        """

        def _augment_with_placeholder_label(raw):
            return (self.process_path(raw), tf.constant(0.0))

        paths = tf.data.Dataset.list_files(DATASET_LOC, shuffle=True)
        contents = paths.map(tf.io.read_file, num_parallel_calls=num_parallel)
        samples = contents.map(
            _augment_with_placeholder_label,
            num_parallel_calls=num_parallel,
            name="prep_begin",
        )
        batches = samples.batch(1)
        return batches.prefetch(buffer_size=tf.data.AUTOTUNE)

    def create_valid_dataset(self, num_parallel):
        """No validation dataset is provided for this app."""
        return None
def time_mask(x, max_width=None):
    """Zero a random contiguous band of columns (axis 1) of ``x`` in place.

    The band width is drawn uniformly from ``[0, max_width)`` and truncated
    to an integer. It is then clamped to the number of columns, so an input
    narrower than the drawn width is masked entirely instead of making
    ``random.randint`` raise ``ValueError`` on an empty range.

    Args:
        x: Array with at least two dimensions that supports ``shape`` and
            slice assignment. It is modified in place.
        max_width: Exclusive upper bound on the band width. Defaults to the
            module-level ``TIME_MASK_PARAM``.

    Returns:
        The same object ``x``, after masking.
    """
    if max_width is None:
        max_width = TIME_MASK_PARAM
    tau = x.shape[1]
    # Clamp so that ``tau - width`` is never negative for short inputs.
    width = min(int(np.random.uniform(low=0.0, high=max_width)), tau)
    t0 = random.randint(0, tau - width)
    x[:, t0 : t0 + width] = 0
    return x
class Timer:
    """Context manager that measures elapsed time with ``time.perf_counter``.

    Usage::

        with Timer() as timer:
            work()
        elapsed = timer.delta()
    """

    def __init__(self):
        self._start = None
        self._end = None

    def __enter__(self):
        self._start = time.perf_counter()
        # Return the timer so ``with Timer() as t:`` binds a usable object
        # (the original returned None here).
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Record the end time; exceptions are not suppressed.
        self._end = time.perf_counter()

    def reset(self):
        """Restart the measurement by overwriting the start timestamp."""
        self._start = time.perf_counter()

    def delta(self):
        """Return ``end - start`` in seconds.

        Raises:
            RuntimeError: If the timer has not been both started and stopped.
        """
        if self._start is None or self._end is None:
            raise RuntimeError(
                "Timer.delta() called before the timer was started and stopped"
            )
        return self._end - self._start
times: {}".format(epoch_times)) 100 | -------------------------------------------------------------------------------- /evaluation/plots/plot_ablation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | # Load the data 6 | file_path = "~/cedar/evaluation/plots/ablation.csv" 7 | data = pd.read_csv(file_path) 8 | 9 | rename_dict = { 10 | "Baseline": "Baseline", 11 | "plus parallelism": "+P", 12 | "plus reorder": "+PR", 13 | "plus offload": "+PRO", 14 | "plus fusion": "+PROF", 15 | } 16 | data["Setup"] = data["Setup"].map(rename_dict) 17 | 18 | # Convert execution time to throughput 19 | data["Runtime"] = 1 / data["Runtime"] 20 | 21 | # Normalize the 'Average' for 'cedar-remote' in each 'Pipeline' group 22 | normalization_factors = data[data["Setup"] == "Baseline"].set_index( 23 | "Pipeline" 24 | )["Runtime"] 25 | data["Normalized Runtime"] = data.apply( 26 | lambda row: row["Runtime"] / normalization_factors.get(row["Pipeline"], 1), 27 | axis=1, 28 | ) 29 | print(data) 30 | 31 | # Create the plot with normalized values 32 | f = plt.figure(figsize=(3.33, 1.8), dpi=600) 33 | # sns.set_style("whitegrid") 34 | ax = sns.barplot( 35 | x="Pipeline", 36 | y="Normalized Runtime", 37 | hue="Setup", 38 | data=data, 39 | linewidth=0, 40 | hue_order=["Baseline", "+P", "+PR", "+PRO", "+PROF"], 41 | ) 42 | 43 | # Add hatches 44 | for i, bar in enumerate(ax.patches): 45 | if i in range(0, 8): 46 | bar.set_hatch("//") 47 | if i in range(8, 16): 48 | bar.set_hatch("\\\\") 49 | if i in range(16, 24): 50 | bar.set_hatch("--") 51 | if i in range(24, 32): 52 | bar.set_hatch("..") 53 | if i in range(32, 40): 54 | bar.set_hatch("oo") 55 | ax.patches[40].set_hatch("////") 56 | ax.patches[41].set_hatch("\\\\\\") 57 | ax.patches[42].set_hatch("----") 58 | ax.patches[43].set_hatch("..") 59 | ax.patches[44].set_hatch("oo") 60 | 61 | # Adding vertical lines and red "X" 
def time_mask(x, max_width=None):
    """Mask (set to zero) a random run of columns along axis 1 of ``x``.

    A width is sampled uniformly from ``[0, max_width)`` and truncated to an
    integer. The width is capped at the number of columns in ``x``; without
    the cap, a spectrogram with fewer columns than the sampled width makes
    ``random.randint`` fail with ``ValueError`` (empty range).

    Args:
        x: Array with at least two dimensions; mutated in place.
        max_width: Exclusive upper bound for the sampled width. When omitted,
            the module-level ``TIME_MASK_PARAM`` is used.

    Returns:
        ``x`` itself, with the selected columns zeroed.
    """
    if max_width is None:
        max_width = TIME_MASK_PARAM
    tau = x.shape[1]
    t = int(np.random.uniform(low=0.0, high=max_width))
    # Never mask more columns than exist.
    t = min(t, tau)
    t0 = random.randint(0, tau - t)
    x[:, t0 : t0 + t] = 0
    return x
def build_dataset(data_dir, spec):
    """Build the CommonVoice ``tf.data`` pipeline for files under ``data_dir``.

    Files matching ``{data_dir}/*`` are listed without shuffling, mapped
    through ``process_path`` with ``spec.num_parallel_calls`` parallel calls,
    and prefetched. When ``spec.service_addr`` is truthy, the pipeline is
    additionally distributed through the tf.data service at that address in
    ``"distributed_epoch"`` mode.
    """
    clips = tf.data.Dataset.list_files(f"{data_dir}/*", shuffle=False)
    features = clips.map(
        lambda x: process_path(x), num_parallel_calls=spec.num_parallel_calls
    )
    pipeline = features.prefetch(buffer_size=tf.data.AUTOTUNE)

    # Local execution: nothing more to attach.
    if not spec.service_addr:
        return pipeline

    print("Using tf.data.service with address {}".format(spec.service_addr))
    distribute = tf.data.experimental.service.distribute(
        processing_mode="distributed_epoch", service=spec.service_addr
    )
    return pipeline.apply(distribute)
def process_path(img):
    """Decode JPEG bytes and apply the SimCLR augmentation chain.

    Steps, in order: decode to 3 channels, convert to float32, random
    crop box + resize to ``IMG_HEIGHT`` x ``IMG_WIDTH``, random horizontal
    flip, brightness / contrast / saturation / hue jitter, grayscale,
    Gaussian blur, per-image standardization.
    """
    # A single random box, applied to batch index 0 by crop_and_resize.
    crop_boxes = tf.random.uniform(shape=(1, 4))

    decoded = tf.image.decode_jpeg(img, channels=3)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    # crop_and_resize operates on a batch, so add a leading dimension.
    batched = tf.expand_dims(as_float, axis=0)
    out = tf.image.crop_and_resize(
        batched, crop_boxes, [0], [IMG_HEIGHT, IMG_WIDTH]
    )

    out = tf.image.random_flip_left_right(out)
    out = tf.image.random_brightness(out, max_delta=0.1)
    out = tf.image.random_contrast(out, lower=0.9, upper=1.1)
    out = tf.image.random_saturation(out, lower=0.9, upper=1.1)
    out = tf.image.random_hue(out, max_delta=0.1)

    gray = tf.image.rgb_to_grayscale(out)
    kernel = [GAUSSIAN_BLUR_KERNEL_SIZE, GAUSSIAN_BLUR_KERNEL_SIZE]
    blurred = tfa.image.gaussian_filter2d(gray, filter_shape=kernel)
    return tf.image.per_image_standardization(blurred)
def get_dataset(spec: TFEvalSpec):
    """Return the SimCLR training dataset built by ``build_dataset``.

    The training directory is ``DATASET_LOC`` + ``imagenette2/train``,
    resolved relative to the directory two levels above this file's own
    directory.
    """
    base_dir = pathlib.Path(__file__).resolve().parents[2]
    train_dir = pathlib.Path(base_dir.joinpath(DATASET_LOC)) / "imagenette2/train"
    return build_dataset(train_dir, spec)
dpi=600) 47 | 48 | # Line plot for the set data 49 | ax = sns.lineplot( 50 | x="set_procs", 51 | y="observed_tput", 52 | data=pd.DataFrame(set_data), 53 | color="blue", 54 | label="observed throughput", 55 | linewidth=1, 56 | ) 57 | ax.set_ylabel("Throughput (samples/s)", fontsize=6, labelpad=2) 58 | ax.set_xlabel("Distributed Processes", fontsize=6, labelpad=2) 59 | ax.tick_params(axis="x", direction="out", length=2, color="black") 60 | ax.tick_params(axis="y", direction="out", length=2, color="black") 61 | # set tick labels to small 62 | plt.xticks(fontsize=6) 63 | plt.yticks(fontsize=6) 64 | 65 | # Add x and y axis grid lines 66 | ax.yaxis.grid(color="lightgray", linestyle="--", linewidth=0.5) 67 | ax.xaxis.grid(color="lightgray", linestyle="--", linewidth=0.5) 68 | 69 | ax.set_xlim(-1, 16) 70 | ax.set_ylim(-50, 900) 71 | 72 | # Draw horizontal line for target throughput 73 | for i in range(len(target_data["target_tput"])): 74 | # Make the line go from the left to where the corresponding observed_procs is 75 | ax.axhline( 76 | target_data["target_tput"][i], 77 | xmin=0, 78 | xmax=(target_data["observed_procs"][i] + 1) / 17, 79 | color="red", 80 | linewidth=0.4, 81 | linestyle="-", 82 | label="target throughput", 83 | ) 84 | # Draw the vertial line for the observed_procs 85 | ax.axvline( 86 | target_data["observed_procs"][i], 87 | ymin=0, 88 | ymax=(target_data["target_tput"][i] + 50) / 950, 89 | color="red", 90 | linewidth=0.4, 91 | linestyle="-", 92 | ) 93 | 94 | 95 | # Change legend to say "target throughput" 96 | handles, labels = ax.get_legend_handles_labels() 97 | # Don't show the legend for the set data 98 | ax.legend( 99 | handles=handles, 100 | labels=["Observed Throughput", "Target Throughput and Tuned Scale"], 101 | fontsize=6, 102 | title_fontsize="6", 103 | ) 104 | 105 | # Reduce pad between axis and labels 106 | ax.tick_params(axis="both", which="major", pad=2) 107 | 108 | 109 | 110 | 111 | plt.tight_layout() 112 | # ax.legend(fontsize=6, 
title_fontsize='6') 113 | f.savefig("scaling.png", bbox_inches="tight") 114 | --------------------------------------------------------------------------------