├── .gitignore
├── .gitmodules
├── README.md
├── data
├── TF1.15
│ ├── xla_candidates_bert.txt
│ ├── xla_candidates_inceptionv3.txt
│ ├── xla_candidates_resnet.txt
│ ├── xla_candidates_vgg.txt
│ └── xla_candidates_vgg19.txt
├── TF2.4
│ └── xla_candidates_resnet50.txt
├── mx_20200824_resnet50.py
├── tf_20200720.py
├── tf_20200731_resnet50.py
└── tf_20200811_pred_error.py
├── docker
└── tensorflow.Dockerfile
├── docs
├── backup.md
├── dependency.md
├── format.md
├── nvprof.md
├── profile.md
├── sample_config.yaml
└── usage.md
├── dpro
├── __init__.py
├── analyze.py
├── arg_utils.py
├── base.py
├── bps_helper
│ ├── __init__.py
│ ├── graph.py
│ └── preprocess.py
├── collect.py
├── cost_model
│ ├── __init__.py
│ ├── _gpu_predict
│ │ ├── __init__.py
│ │ ├── dataloader.py
│ │ ├── dim_reduce.py
│ │ ├── gpu_cost_model.py
│ │ ├── gpu_pred.py
│ │ └── grouper.py
│ ├── _mixed_precision
│ │ ├── .cost_model
│ │ │ ├── CastToFp16.txt
│ │ │ ├── CastToFp32.txt
│ │ │ ├── Conv2D.txt
│ │ │ └── MatMul.txt
│ │ ├── __init__.py
│ │ ├── amp_cost_model.py
│ │ ├── amp_pred.py
│ │ ├── dataloader.py
│ │ ├── dim_reduce.py
│ │ ├── grouper.py
│ │ └── test_rst.py
│ ├── _tsfs
│ │ ├── __init__.py
│ │ └── cost_model.py
│ ├── _xla
│ │ ├── __init__.py
│ │ ├── execute_graph.py
│ │ ├── gen_dataset_utils.py
│ │ ├── gen_samples.py
│ │ ├── p_dispersion.py
│ │ ├── pk_graph.py
│ │ ├── process_trace.py
│ │ ├── utils.py
│ │ ├── xla_module_cost_model.py
│ │ ├── xla_run_generate_kernel_dataset.sh
│ │ ├── xla_run_test_module_cm.sh
│ │ ├── xla_run_train_module_cm.sh
│ │ └── xlatools.py
│ ├── base.py
│ ├── gpu_models_info.py
│ ├── mixed_precision.py
│ ├── op_fusion.py
│ ├── tensor_fusion.py
│ ├── trace_clct.sh
│ └── trace_filter.py
├── dag_utils.py
├── debug_utils.py
├── helper
│ ├── combine_json.py
│ ├── compare_graph.py
│ ├── get_iter_time_from_trace.py
│ ├── tf_flops_profile.py
│ ├── tf_helper.py
│ └── visualize.py
├── hvd
│ ├── __init__.py
│ └── graph.py
├── logger_utils.py
├── memory
│ ├── .gitignore
│ ├── README.md
│ ├── __init__.py
│ ├── cost_model.py
│ ├── estimator.py
│ ├── gradient_accumulation.py
│ ├── node.py
│ ├── recomputation.py
│ ├── schedule.py
│ └── utils.py
├── mg_generate_dataset.py
├── ml_platform
│ ├── __init__.py
│ ├── mxnet
│ │ ├── __init__.py
│ │ └── metadata.py
│ └── tensorflow
│ │ ├── __init__.py
│ │ ├── amp_lists.py
│ │ ├── memory_lists.py
│ │ ├── metadata.py
│ │ └── util.py
├── nvprof
│ └── analyze.py
├── optimizer
│ ├── __init__.py
│ ├── base.py
│ ├── dp.py
│ ├── mcmc.py
│ └── mcts.py
├── parameter.py
├── replay.py
├── trace_utils.py
├── xla_cm_entry.py
└── xla_test_generate_cluster_spec.py
├── dpro_cli
├── requirements.txt
├── setup.py
├── setup.sh
└── sleep.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .DS_Store
3 | *.pyc
4 | build/
5 | .vscode/
6 | .env
7 | data/data_*
8 | dist/dist/byteprofile_analysis-0.1-py3.8.egg
9 | byteprofile_analysis.egg-info
10 | dist/
11 | */cost_model/_xla/.cost_model/
12 | */cost_model/_gpu_predict/.cost_model/
13 | .idea/
14 | *egg-info
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "3rdparty/mixbench"]
2 | path = 3rdparty/mixbench
3 | url = https://github.com/ekondis/mixbench.git
4 | [submodule "3rdparty/nvprof2json"]
5 | path = 3rdparty/nvprof2json
6 | url = https://github.com/joapolarbear/nvprof2json.git
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | This project is used to analyze the trace results profiled via [byteprofile](https://github.com/joapolarbear/byteps), an extended version of [BytePS](https://github.com/bytedance/byteps).
4 |
5 | # Usage
6 | By choosing different `--option` values, this project supports the functionalities shown below.
7 |
8 | ## Statistic
9 | Set arg `--option statistic` to show the statistics; arg `--path` must be set to the exact trace file path (ending with `.json`).
10 |
11 | ## Visualize the DAG
12 | Set arg `--option graph` to visualize the dependency DAG; arg `--path` must be set to the exact DAG path (ending with `.gml`).
13 |
14 | ## Combine Trace Files
15 | Set arg `--option combine` to combine several trace files into one file. For example, one worker may have two GPUs, each of which generates a trace file; you can use this option and list the paths of these two files via `--path`.
16 |
17 | There are two options to define the trace paths.
18 |
19 | 1. Use file paths. In this case, `--path` should be a list of file paths, each of which denotes a trace file. The combined trace file will be stored under the same directory as the first trace file.
20 | 2. Use directory paths. In this case, `--path` is a list of directory paths, each of which denotes one worker and contains trace directories of GPUs on this worker. By default, the combined trace file will be stored under the first directory.
21 |
22 | **Note: please ensure that all paths are file paths or all of them are directory paths.**
23 |
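For example (the paths below are illustrative):

```bash
# Combine two trace files directly
python3 analyze.py --option combine --path worker0/gpu0/trace.json,worker0/gpu1/trace.json
# Or combine the traces of two workers, given their trace directories
python3 analyze.py --option combine --path worker0/,worker1/
```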
24 |
25 | If you do not want to combine all the traces, you can use `--filter` to give a list of communication operations separated by commas; then only these communication operations will appear in the combined trace file. For now, the filter only supports communication nodes. An example is shown below.
26 |
27 | ```bash
28 | python3 analyze.py --option combine --path ... --filter Comm.gradient_1,Comm.gradient_2
29 | ```
30 |
31 |
32 | An example of the combined timeline of 2 GPUs visualized by the [chrome trace tool](chrome://tracing/) is shown below, which uses MNIST as the dataset, running on 2 workers, each with 2 V100 GPUs. In the prefix `Process 0`, `0` denotes the local rank of this GPU.
33 |
34 |
35 |
36 | ## Compare two trace files
37 | Set arg `--option compare`. Similar to option `combine`, the argument `--path` could be a list of worker trace directories or a list of trace files. When a list of directories is given, traces on one worker will automatically be merged.
38 |
39 | Besides, you can
40 | * set `--xlsx` to export the comparison results to an XLSX file.
41 | * set `--sort` to sort the comparison results.
42 | * set `--head <N>` to display only the first `<N>` comparison results. A sample command is shown below.
43 |
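For example (the trace directories are illustrative):

```bash
python3 analyze.py --option compare --path worker0/,worker1/ --xlsx --sort --head 10
```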
44 |
45 | ## Calculate the Critical Path of the DAG
46 | Set arg `--option critical`; here `--path` should be the root trace directory (by default, `BYTEPS_TRACE_DIR`).
47 |
48 | **Note that you must use the latest version of byteprofile to run this option.**
49 |
50 | ## Replay based on the traces
51 | Set arg `--option replay` to replay the traces for one worker.
52 | * Use `--path` to specify the path where the worker traces are stored.
53 | * Set `--del_queue` to include each partition and QueueType for communication traces.
54 | * Use `--step_num` to give the number of steps to replay.
55 | * Set `--pretty` to output only the necessary info. A sample command is shown below.
56 |
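For example (the trace path is illustrative):

```bash
python3 analyze.py --option replay --path path/to/worker/traces --step_num 10 --pretty
```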
57 | ## Update final traces
58 | Set arg `--option collect` to update the final traces. Meanwhile, the average iteration time will be output. `--path` should be the root directory of a worker or a GPU.
59 | * `--sub_option iter_time`: only calculate the iteration time and FW+BW time
60 | * `--sub_option operator`: update operator traces based on the source files
61 | * others: re-combine all traces based on the source files
62 |
63 | ## `--option 3dcompare`
64 | Ignores partition ids.
65 |
66 | # Requirements
67 | pip3 packages: `intervaltree`, `networkx`, `ujson`, `xlsxwriter`, `scapy`
68 |
--------------------------------------------------------------------------------
/data/tf_20200811_pred_error.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from matplotlib.ticker import (AutoMinorLocator, MultipleLocator)
4 |
5 | ax = plt.subplot(111)
6 |
7 | # all_data[label_id][prediction error (%) over 100 repeated tests]
8 | all_data = [
9 | [13.956024, 14.433242, 36.365207, 10.380671, 13.353506, 50.151516, 11.076908, 12.229296, 13.197254, 44.824227, 35.116051, 64.394852, 10.480341, 13.720852, 88.557334, 37.497621, 11.776002, 11.423172, 55.909996, 16.600878, 14.260358, 13.863314, 48.258129, 15.427159, 9.433073, 11.831115, 10.212262, 12.354979, 43.799844, 12.839599, 56.183875, 11.828298, 12.073554, 85.360527, 49.737677, 11.457030, 9.551553, 12.788189, 14.407476, 36.279821, 37.727431, 53.597884, 10.865854, 49.453092, 11.744117, 16.065677, 49.642372, 60.080919, 10.576669, 56.671448, 11.937124, 59.463291, 12.169872, 9.903194, 68.125121, 9.870735, 12.528896, 12.528278, 13.134355, 92.268916, 10.767118, 10.133862, 12.381869, 11.146730, 12.500672, 11.041856, 46.560807, 11.221893, 15.538326, 11.204281, 53.683452, 11.305852, 13.934167, 44.761666, 11.418703, 39.687794, 39.263057, 14.782713, 56.810139, 57.797041, 13.545492, 63.703707, 13.209352, 12.424941, 14.129963, 56.139439, 14.960313, 58.006742, 52.780320, 10.053853, 11.335702, 13.900809, 13.569368, 8.642230, 12.184775, 44.865501, 49.213126, 11.585204, 13.603950, 47.714876],
10 | [57.166523, 6.982943, 7.753159, 6.544062, 6.370150, 58.934236, 5.821519, 7.581446, 31.132802, 7.275806, 7.342372, 6.154925, 4.845664, 5.375899, 5.250347, 5.409458, 47.462595, 7.730537, 8.586070, 5.298019, 42.381557, 9.279349, 67.919465, 3.912002, 5.851241, 7.560528, 5.731278, 6.970734, 5.871561, 60.336510, 6.118150, 7.916642, 6.013825, 7.946192, 5.281653, 6.026306, 4.679249, 5.957760, 4.761488, 6.515127, 88.822437, 6.525346, 6.549746, 74.405183, 51.872324, 68.615514, 61.264099, 6.917063, 5.913056, 6.204503, 7.289733, 6.271309, 78.464292, 6.500042, 7.200059, 58.205104, 6.920534, 5.753805, 5.271359, 46.357106, 68.670515, 83.747194, 58.410471, 5.499968, 54.382253, 62.618250, 9.220569, 37.116831, 6.494756, 7.962453, 7.138617, 90.676043, 7.098806, 82.908148, 8.753192, 53.015609, 6.137569, 61.078094, 6.131989, 55.992310, 54.107132, 5.951071, 41.104941, 89.356646, 51.611572, 6.954993, 56.234897, 7.386588, 48.450425, 5.957338, 49.784691, 70.344788, 6.639423, 7.197308, 6.826844, 70.026041, 6.429870, 6.276725, 46.879217, 5.949097],
11 |
12 | [10.515195, 45.434328, 51.414588, 15.402890, 9.034116, 55.602559, 69.823075, 9.650470, 10.743006, 9.906061, 13.349366, 12.244130, 10.966927, 10.924287, 13.885874, 12.222774, 65.770478, 10.316878, 39.384284, 14.644657, 11.249421, 41.947391, 15.741479, 8.898329, 12.156129, 14.583042, 9.001215, 11.812949, 9.541513, 12.339056, 11.040557, 11.452073, 12.472323, 9.783736, 57.167140, 40.489246, 43.608511, 61.649401, 12.363932, 94.788833, 8.643454, 44.381189, 9.933129, 60.882352, 14.526221, 6.390452, 13.461222, 11.654321, 52.283489, 11.649394, 11.015001, 7.754916, 10.947251, 11.494571, 9.674303, 12.610179, 42.955850, 9.670831, 13.405886, 9.058590, 11.744207, 13.501221, 43.761783, 12.304239, 10.588361, 12.464460, 69.354573, 9.949217, 43.991506, 13.569916, 11.333769, 9.802779, 10.881983, 11.404215, 62.910407, 12.273455, 8.903062, 10.182866, 12.851778, 10.193168, 42.471023, 37.978502, 74.606463, 12.347102, 10.987125, 13.633108, 15.169739, 12.006797, 13.912672, 12.773112, 17.140734, 12.890699, 9.391158, 54.936686, 12.315391, 9.843667, 12.330516, 11.492315, 17.803154, 13.282663],
13 | [6.957377, 64.313606, 56.916500, 45.012399, 5.190539, 6.840786, 57.126490, 53.727270, 7.356256, 6.667304, 6.476066, 5.256859, 6.906330, 70.462762, 7.660466, 7.012304, 7.090306, 6.105001, 6.178831, 66.952585, 61.379096, 7.649414, 7.092564, 5.626575, 5.648277, 64.937245, 7.898103, 5.554414, 7.115182, 6.907596, 9.395951, 5.095690, 43.717707, 6.670211, 39.997334, 5.986490, 3.661167, 69.984995, 4.068519, 5.391135, 5.804065, 66.306257, 34.389040, 5.428332, 6.674271, 58.584086, 5.627205, 36.384092, 88.157494, 6.184002, 5.053334, 6.944832, 6.302980, 7.521219, 7.880967, 53.725881, 44.218278, 6.332340, 7.813679, 6.771026, 6.560170, 7.177000, 4.961530, 9.494012, 66.770047, 4.043301, 4.791953, 6.563170, 5.435724, 5.899271, 5.110108, 7.188095, 5.768912, 65.809342, 7.354027, 6.562866, 4.582166, 5.480870, 6.541070, 36.262630, 6.259378, 6.305899, 51.754131, 6.250982, 6.641401, 35.641656, 9.512910, 4.657524, 7.552413, 61.726023, 8.057250, 53.622812, 7.819604, 5.348360, 5.155510, 75.192289, 59.524181, 69.975642, 5.468333, 5.369140],
14 |
15 | [15.341629, 14.678412, 14.665693, 13.633613, 13.982519, 11.178406, 13.697694, 10.461185, 7.996832, 58.611549, 7.431059, 11.076401, 13.597231, 13.139524, 13.495966, 10.273490, 11.452367, 10.167614, 11.316935, 12.724643, 11.802032, 12.646366, 18.280768, 12.672976, 63.965102, 10.045398, 14.369920, 9.424832, 9.887030, 15.149035, 50.458433, 11.725453, 68.129204, 11.349208, 8.817826, 12.674762, 11.424591, 9.955086, 15.008888, 13.119296, 53.527772, 12.351472, 11.709062, 10.334177, 14.370902, 11.150850, 49.790847, 11.347105, 12.541186, 11.364660, 8.964185, 13.050379, 13.894001, 12.638281, 12.705963, 10.696781, 12.950993, 12.468188, 10.024792, 11.244612, 8.306938, 14.698039, 52.269719, 11.344778, 10.537424, 12.616367, 11.841470, 16.306481, 9.616387, 6.608985, 31.705475, 51.275058, 51.768775, 6.900935, 5.397402, 35.518681, 10.463089, 50.515297, 10.941296, 15.312960, 9.595655, 10.641053, 12.715098, 51.306539, 11.187088, 12.041390, 12.267792, 9.327741, 13.728392, 9.170088, 15.267901, 46.450641, 9.717752, 9.947808, 11.213984, 10.757838, 13.362415, 11.037630, 12.577730, 12.897338],
16 | [7.313773, 7.248264, 5.067444, 8.842977, 7.608721, 8.653894, 62.134383, 5.102347, 5.519440, 81.937181, 5.452010, 8.626065, 8.645136, 7.088186, 5.837918, 54.474982, 5.189909, 6.546465, 4.629266, 70.000192, 9.395889, 4.155231, 5.089076, 4.218533, 74.919280, 6.717917, 53.014032, 3.969286, 8.605599, 4.849044, 60.630324, 65.410434, 6.596810, 62.020646, 71.857941, 5.895819, 68.392247, 65.461754, 3.318386, 5.016175, 8.190272, 57.106412, 4.494780, 7.278212, 6.401423, 7.194343, 6.375702, 4.365551, 64.258294, 7.937419, 6.915316, 7.233481, 4.928482, 6.972867, 5.505200, 6.332903, 6.686514, 8.309472, 4.569678, 4.482371, 6.715820, 57.047777, 7.832438, 6.657865, 4.348972, 70.160085, 41.008560, 3.262324, 7.509626, 4.000181, 5.597471, 6.570225, 9.739884, 5.706723, 7.667952, 6.558006, 65.735783, 9.167421, 55.175922, 7.146803, 5.051127, 7.391683, 3.779323, 97.722961, 51.246519, 3.141432, 4.814731, 5.119214, 7.471215, 69.127114, 56.380045, 6.883611, 6.874813, 4.792079, 3.996554, 7.456756, 7.623711, 34.324562, 6.483892, 6.905238],
17 |
18 | [7.588937, 9.964588, 5.913335, 14.336912, 13.438318, 64.998050, 14.631312, 11.699449, 12.018943, 17.035852, 33.823215, 14.634302, 10.590887, 6.548417, 6.542256, 52.161803, 11.575169, 4.272291, 10.748003, 13.584780, 10.553002, 7.793081, 8.634919, 10.588455, 10.426559, 25.592627, 6.528304, 6.643721, 12.551807, 6.212011, 12.035708, 11.079625, 11.048117, 6.524898, 9.812931, 11.548364, 13.711194, 17.094585, 10.433940, 9.024216, 13.412596, 14.670297, 52.645683, 18.038336, 16.261581, 18.772533, 9.714485, 8.775804, 10.928841, 12.796661, 14.090557, 6.190334, 11.486405, 12.780759, 6.899154, 14.491948, 13.350100, 13.927349, 10.942886, 13.221856, 55.177635, 8.480024, 13.393132, 5.715504, 40.622648, 11.076411, 13.566425, 9.729145, 17.015641, 10.556499, 16.986986, 7.519732, 5.639839, 15.001418, 7.281235, 10.256605, 12.889487, 4.466114, 72.670157, 12.998940, 12.355639, 63.626501, 9.878510, 11.465503, 8.883998, 12.721210, 17.134859, 64.329050, 13.715173, 7.411055, 10.270259, 13.054703, 42.645570, 11.476296, 13.285420, 7.498828, 13.339854, 10.922121, 7.470640, 13.386343],
19 | [7.070935, 5.232053, 4.053272, 62.656732, 5.062426, 9.154989, 7.826296, 75.240219, 59.411407, 4.992232, 76.335673, 10.298458, 5.990009, 6.480765, 56.082957, 7.455960, 75.049727, 5.104337, 10.104645, 8.023306, 5.274468, 5.660575, 41.476173, 4.449734, 4.155806, 54.748757, 79.477174, 4.416540, 5.932159, 6.119858, 6.648595, 7.458493, 6.435249, 4.800734, 14.987605, 10.404192, 6.928261, 6.226757, 55.388482, 8.294353, 5.280141, 11.410110, 63.670893, 53.467722, 5.892295, 11.209773, 5.881369, 3.327046, 3.848187, 7.901931, 8.173404, 2.997336, 2.130338, 10.497846, 10.303912, 5.364018, 47.910314, 44.345466, 4.803757, 3.830554, 8.968367, 3.300144, 51.769722, 6.027482, 5.917627, 12.123815, 9.938891, 7.154330, 56.591873, 77.392856, 52.516691, 8.520153, 7.513137, 8.637736, 5.819892, 1.602784, 66.231463, 4.927559, 7.980261, 4.327471, 7.357223, 6.400759, 2.846838, 4.951876, 11.456527, 9.133238, 7.623498, 5.560661, 2.278189, 32.366328, 5.101368, 7.557821, 51.718406, 8.412207, 7.562392, 4.207905, 85.316843, 2.467279, 9.525035, 3.514291],
20 |
21 | [5.802008, 8.326228, 8.165158, 16.077765, 11.404562, 20.139854, 15.238530, 10.588925, 12.392165, 11.837725, 7.154748, 5.875821, 10.407793, 11.690149, 8.048689, 16.255622, 15.526473, 11.459678, 12.052782, 14.200299, 4.264560, 88.325601, 14.532483, 16.565511, 17.438027, 8.135357, 3.824198, 3.467835, 10.596049, 18.631508, 14.997960, 31.864815, 14.841692, 8.964419, 16.878293, 6.764996, 14.238110, 25.446969, 10.016874, 12.221086, 12.377311, 9.950374, 18.485064, 16.869409, 19.702920, 3.454878, 12.833186, 5.692901, 11.024399, 13.564892, 19.688321, 10.989238, 41.383812, 62.427356, 4.397385, 19.694828, 21.198388, 13.547000, 14.133338, 10.734672, 25.351968, 13.517704, 21.242162, 11.340485, 5.907095, 12.182404, 11.363229, 14.015253, 60.685415, 17.550834, 13.884081, 14.959678, 2.485946, 14.537648, 9.575146, 15.711994, 8.527022, 13.957632, 13.892153, 2.227685, 33.477207, 9.593510, 5.948852, 6.847351, 10.919294, 4.699008, 40.898343, 8.204610, 16.438709, 3.779228, 4.759139, 10.056058, 8.071077, 8.019649, 15.380003, 2.677506, 14.841307, 10.080636, 15.473311, 4.767401],
22 | [43.318922, 6.468859, 9.913257, 55.903135, 7.152742, 6.173400, 2.891720, 4.458493, 27.375968, 10.632708, 5.867643, 64.593774, 4.494313, 7.137831, 1.818833, 5.329690, 8.931935, 8.405413, 1.048821, 65.639816, 4.528360, 8.039385, 6.093105, 11.571390, 62.971126, 5.784095, 51.295974, 25.464415, 6.198689, 74.464807, 57.764465, 1.976959, 2.721981, 6.619558, 2.583216, 5.255425, 2.634106, 36.394218, 6.374182, 74.863759, 75.432708, 59.118688, 1.349001, 51.663599, 94.537953, 4.214529, 41.511808, 6.025560, 2.026080, 4.719750, 2.235685, 54.522440, 3.547660, 6.277774, 6.484477, 69.144590, 91.105996, 26.620359, 6.041705, 8.359846, 4.649439, 11.478467, 33.835395, 49.054471, 62.870999, 0.866062, 8.030661, 1.979249, 18.575960, 5.919261, 7.368157, 5.224024, 52.748799, 10.431567, 5.649330, 6.407153, 18.211424, 5.390051, 4.327642, 6.499954, 6.056569, 0.409378, 13.968299, 13.279833, 28.487376, 74.997135, 63.198839, 38.402265, 0.731686, 56.470007, 1.779763, 3.175988, 4.355366, 13.662878, 10.410555, 62.581137, 2.014510, 79.898078, 3.744049, 1.151870],
23 |
24 | # with threshold B > 4
25 | ]
26 |
27 | labels = [
28 | 'no threshold\nTrain:Test=6:4', 'B>4\nTrain:Test=6:4',
29 | 'no threshold\nTrain:Test=7:3', 'B>4\nTrain:Test=7:3',
30 | 'no threshold\nTrain:Test=8:2', 'B>4\nTrain:Test=8:2',
31 | 'no threshold\nTrain:Test=9:1', 'B>4\nTrain:Test=9:1',
32 | 'no threshold\nTrain:Test=95:5', 'B>4\nTrain:Test=95:5'
33 | ]
34 |
35 | bplot = ax.boxplot(all_data[:2*4], patch_artist=True, labels=labels[:2*4])
36 | plt.title('Evaluate the cost model for AMP with Conv2D')
37 |
38 | # colors = ['pink', 'lightblue', 'lightgreen']
39 | # for patch, color in zip(bplot['boxes'], colors):
40 | # patch.set_facecolor(color)
41 |
42 | ax.yaxis.set_major_locator(MultipleLocator(10))
43 | ax.yaxis.grid(True, which="both")
44 | # plt.xlabel('Three separate samples')
45 | plt.ylabel('Prediction Error (%)')
46 | plt.show()
--------------------------------------------------------------------------------
/docker/tensorflow.Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # =============================================================================
15 |
16 | FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
17 |
18 | # RUN rm -f /tmp/pip.conf &&\
19 | # echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf
20 |
21 | ENV USE_CUDA_PATH=/usr/local/cuda:/usr/local/cudnn/lib64 \
22 | PATH=/usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} \
23 | OLD_LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH \
24 | LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$OLD_LD_LIBRARY_PATH \
25 | LIBRARY_PATH=/usr/local/lib:/usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/nccl/lib/:$LIBRARY_PATH
26 |
27 | ENV BYTEPS_SERVER_MXNET_LINK=https://github.com/joapolarbear/incubator-mxnet.git \
28 | MXNET_BUILD_OPTS="USE_OPENCV=1 \
29 | USE_BLAS=openblas \
30 | USE_CUDNN=1 \
31 | USE_CUDA=1 \
32 | USE_CUDA_PATH=/usr/local/cuda \
33 | USE_MKLDNN=0 \
34 | USE_DIST_KVSTORE=1 \
35 | USE_NCCL=1 \
36 | USE_NCCL_PATH=/usr/local/nccl" \
37 | BYTEPS_BASE_PATH=/usr/local \
38 | BYTEPS_PATH=${BYTEPS_BASE_PATH}/byteps
39 |
40 | # ----------------------------- Install dependencies -----------------------------
41 | RUN apt-get update && \
42 | apt-get install -y software-properties-common && \
43 | add-apt-repository ppa:ubuntu-toolchain-r/test && \
44 | add-apt-repository ppa:deadsnakes/ppa && \
45 | apt-get update && \
46 | apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends --fix-missing \
47 | build-essential \
48 | ca-certificates \
49 | git \
50 | curl \
51 | wget \
52 | vim \
53 | libopenblas-dev \
54 | liblapack-dev \
55 | libopencv-dev \
56 | python \
57 | python-pip \
58 | python-dev \
59 | python-setuptools \
60 | libjemalloc-dev \
61 | graphviz \
62 | cmake \
63 | libjpeg-dev \
64 | libpng-dev \
65 | iftop \
66 | lsb-release \
67 | libnuma-dev \
68 | gcc-4.9 \
69 | g++-4.9 \
70 | gcc-4.9-base \
71 | gcc-7 \
72 | g++-7 \
73 | python3.7 \
74 | python3.7-dev \
75 | python3-pip \
76 | python3-setuptools \
77 | ssh \
78 | librdmacm-dev \
79 | zip unzip
80 |
81 | ### pin python3 version to 3.7
82 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 10
83 |
84 | RUN python -m pip install --upgrade pip && \
85 | pip --no-cache-dir install \
86 | matplotlib \
87 | numpy==1.15.2 \
88 | scipy \
89 | sklearn \
90 | pandas \
91 | graphviz==0.9.0 \
92 | mxboard \
93 | tensorboard==1.0.0a6 \
94 | networkx
95 |
96 | RUN python3 -m pip install --upgrade pip && \
97 | python3 -m pip install Cython && \
98 | python3 -m pip install --upgrade --force-reinstall setuptools && \
99 | python3 -m pip --no-cache-dir install \
100 | wheel \
101 | matplotlib \
102 | numpy==1.17.2 \
103 | pandas \
104 | mxboard \
105 | XlsxWriter \
106 | cvxopt \
107 | cvxpy \
108 | intervaltree \
109 | networkx==2.5 \
110 | protobuf \
111 | scapy \
112 | scipy \
113 | scikit-learn \
114 | tqdm \
115 | ujson \
116 | setuptools
117 |
118 | WORKDIR /root/
119 |
120 | RUN git clone https://github.com/NVIDIA/cuda-samples.git
121 |
122 | # ----------------------------- Install OpenMPI 4.0.3 -----------------------------
123 | RUN wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz && \
124 | tar -xvf openmpi-* && cd openmpi-* && \
125 | ./configure --prefix="/usr" && \
126 | make -j && make all install && \
127 | ln -sf /home/$USER/.openmpi/bin/* /usr/bin/
128 |
129 | # ----------------------------- Install NCCL -----------------------------
130 | RUN git clone --recurse-submodules -b byteprofile https://github.com/joapolarbear/nccl.git && \
131 | cd /root/nccl && make -j src.build && make pkg.txz.build && \
132 | mkdir -p /usr/local/nccl && \
133 | tar -Jxf ./build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \
134 | echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
135 | ldconfig && ln -sf /usr/local/nccl/include/* /usr/include/
136 |
137 | # ----------------------------- Install MXNet -----------------------------
138 |
139 | ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
140 |
141 | RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
142 |
143 | RUN git clone --single-branch --branch byteprofile --recurse-submodules $BYTEPS_SERVER_MXNET_LINK customized-mxnet && \
144 | cd /root/customized-mxnet && \
145 | make clean_all && make -j16 $MXNET_BUILD_OPTS
146 |
147 | #! python3 required
148 | RUN python3 -m pip --no-cache-dir install numpy==1.17.2 && \
149 | cd /root/customized-mxnet/python && \
150 | python3 setup.py build && \
151 | python3 setup.py install && \
152 | python3 setup.py bdist_wheel && \
153 | cd && MX_PATH=`python3 -c "import mxnet; path=str(mxnet.__path__); print(path.split(\"'\")[1])"` && \
154 | ln -sf /root/customized-mxnet/include $MX_PATH/include && echo $MX_PATH
155 |
156 | # ----------------------------- Install Tensorflow -----------------------------
157 | ### install bazel
158 | RUN python3 -m pip --no-cache-dir install keras_applications --no-deps \
159 | keras_preprocessing --no-deps \
160 | h5py --no-deps
161 |
162 | RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.7.4/bazelisk-linux-amd64 && \
163 | chmod +x bazelisk-linux-amd64 && \
164 | mv bazelisk-linux-amd64 /usr/local/bin/bazel
165 |
166 | # RUN ln -sf /usr/local/cuda/lib64/libcupti.so /usr/local/cuda/lib64/libcupti.so.10.0 && \
167 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudart.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudart.so.10.0 && \
168 | # ln -sf /usr/lib/x86_64-linux-gnu/libcublas.so /usr/lib/x86_64-linux-gnu/libcublas.so.10.0 && \
169 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcufft.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcufft.so.10.0 && \
170 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcurand.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcurand.so.10.0 && \
171 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcusolver.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcusolver.so.10.0 && \
172 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcusparse.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcusparse.so.10.0
173 |
174 | ### pin gcc and g++ version to 7
175 | # RUN update-alternatives --remove-all gcc && \
176 | # update-alternatives --remove-all g++ && \
177 | # update-alternatives --remove-all x86_64-linux-gnu-gcc && \
178 | # update-alternatives --remove-all x86_64-linux-gnu-g++
179 |
180 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 10 && \
181 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 20 && \
182 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 10 && \
183 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-7 20 && \
184 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 10 && \
185 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-7 20 && \
186 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 10 && \
187 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-7 20
188 |
189 | RUN update-alternatives --set gcc /usr/bin/gcc-7 && \
190 | update-alternatives --set g++ /usr/bin/g++-7 && \
191 | update-alternatives --set x86_64-linux-gnu-gcc /usr/bin/gcc-7 && \
192 | update-alternatives --set x86_64-linux-gnu-g++ /usr/bin/g++-7
193 |
194 | ENV BPF_TENSORFLOW_LINK=https://github.com/chenyu-jiang/tensorflow.git
195 |
196 | ENV PYTHON_BIN_PATH="/usr/bin/python3"
197 | ENV USE_DEFAULT_PYTHON_LIB_PATH=1
198 | ENV TF_ENABLE_XLA=1
199 | ENV TF_NEED_HDFS=0
200 |
201 | ### Download and build tensorflow
202 | RUN ln -sf /usr/bin/python3 /usr/bin/python && \
203 | git clone --single-branch --branch r1.15 --recurse-submodules ${BPF_TENSORFLOW_LINK} && \
204 | cd tensorflow && /usr/local/bin/bazel build -j auto --config=cuda //tensorflow/tools/pip_package:build_pip_package && \
205 | ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg && \
206 | pip3 --no-cache-dir install $(ls /tmp/tensorflow_pkg/tensorflow-1.15.4*) && \
207 | chmod +x build_bpf_tf_modules.sh && \
208 | ./build_bpf_tf_modules.sh
209 |
210 | ENV BPF_TF_PATH=/root/tensorflow
211 |
212 | # ----------------------------- Install BytePS -----------------------------
213 |
214 | RUN cd /usr/lib/python3/dist-packages && ln -s $(ls apt_pkg.cpython-*-linux-gnu.so) apt_pkg.so && \
215 | cd /root
216 |
217 | RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null \
218 | | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \
219 | apt-add-repository 'deb https://apt.kitware.com/ubuntu/ xenial main' && \
220 | apt-get update && \
221 | apt-get install cmake -y
222 |
223 | RUN git clone https://github.com/gabime/spdlog.git && \
224 | cd spdlog && mkdir build && cd build && \
225 | cmake .. && make -j && make install
226 |
227 | ENV BPF_BYTEPS_LINK=https://github.com/chenyu-jiang/byteps.git
228 |
229 | #! Install BytePS
230 | RUN cd /usr/local && \
231 | git clone --single-branch --branch byteprofile --recurse-submodules ${BPF_BYTEPS_LINK} && \
232 | cd byteps && \
233 | BYTEPS_WITHOUT_PYTORCH=1 python3 setup.py install
234 |
235 | # ----------------------------- Install Horovod -----------------------------
236 | RUN cd /usr/local && \
237 | git clone --recurse-submodules -b byteprofile https://github.com/joapolarbear/horovod && \
238 | cd /usr/local/horovod && python3 setup.py sdist && \
239 | HOROVOD_NCCL_HOME=/usr/local/nccl \
240 | HOROVOD_GPU_ALLREDUCE=NCCL \
241 | HOROVOD_GPU_BROADCAST=NCCL \
242 | HOROVOD_WITH_MPI=1 \
243 | HOROVOD_WITH_TENSORFLOW=1 \
244 | HOROVOD_WITHOUT_PYTORCH=1 \
245 | HOROVOD_WITH_MXNET=1 pip3 install --no-cache-dir dist/horovod* && \
246 | cp -r /usr/local/horovod/examples /root/horovod_examples
247 |
248 | # ----------------------------- Install gluon-nlp -----------------------------
249 | RUN git clone -b bert-byteprofile https://github.com/joapolarbear/gluon-nlp.git && \
250 | cd gluon-nlp && python3 setup.py install && \
251 | mkdir -p /root/.mxnet/models && \
252 | cd /root/.mxnet/models && \
253 | wget https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip && unzip -o *.zip
254 |
255 | ### Set the environment for developing.
256 | ENV LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH \
257 | BYTEPS_TRACE_ON=1 \
258 | BYTEPS_TRACE_END_STEP=30 \
259 | BYTEPS_TRACE_START_STEP=10 \
260 | BYTEPS_TRACE_DIR=/root/traces \
261 | MXNET_GPU_WORKER_NTHREADS=1 \
262 | MXNET_EXEC_BULK_EXEC_TRAIN=0
263 |
264 | # # -------- install byteprofile analysis
265 | # RUN git clone --recurse-submodules https://github.com/joapolarbear/byteprofile-analysis.git && \
266 | # cd byteprofile-analysis && python3 setup.py install
267 |
268 | ### Sample command to start the docker
--------------------------------------------------------------------------------
/docs/backup.md:
--------------------------------------------------------------------------------
1 | # Commands for dPRO
2 | We have `bash setup.sh` to install dPRO now, the following commands are for archive
3 |
4 | ## Install dPRO
5 | `
6 | cd ${HOME}/
7 | rm -rf dpro
8 | git clone https://github.com/joapolarbear/dpro.git
9 | cd dpro && sudo bash setup.sh
10 | `
11 | ## debug mode
12 | `
13 | pip3 install -e $HOME/ws/git/dpro
14 | `
15 | —--
16 |
17 | ## Reinstall customized TF
18 | ```
19 | wget https://github.com/joapolarbear/tensorflow/releases/download/v2.4.1-dev.2.0.2/tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl
20 | pip3 --no-cache-dir install --force-reinstall tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl
21 | ```
22 |
23 | ## RUN
24 | ```
25 | export HOROVOD_FUSION_THRESHOLD="${HOROVOD_FUSION_THRESHOLD:-67108864}"
26 | export HOROVOD_CYCLE_TIME="${HOROVOD_CYCLE_TIME:-0}"
27 | export HOROVOD_LOG_LEVEL="${HOROVOD_LOG_LEVEL:-warning}"
28 | export NCCL_DEBUG="${NCCL_DEBUG:-INFO}"
29 | export NCCL_DEBUG_SUBSYS="${NCCL_DEBUG_SUBSYS:-INIT}"
30 | export NCCL_ALGO="${NCCL_ALGO:-Ring}"
31 |
32 | export HOROVOD_FUSION_THRESHOLD=0
33 | export HOROVOD_CYCLE_TIME=5
34 |
35 | bash mpirun.sh python3 $HOME/horovod_examples/tensorflow/tensorflow_synthetic_benchmark.py --model VGG16 --num-iters 5
36 |
37 | bash mpirun.sh nsys profile -o 1ib_overlap_xlaoff_gpu%q{OMPI_COMM_WORLD_RANK}.qdrep python3 $HOME/horovod_examples/tensorflow/tensorflow_synthetic_benchmark.py --model VGG16 --num-iters 5
38 |
39 | TF_XLA_FLAGS=--tf_xla_auto_jit=2
40 |
41 | for (( id=0; id < 8; id++ )); do
42 | python3 $HOME/nvprof2json/nvprof2json.py --filename $HOME/global_traces/host0/simple.${id}.nvprof --filter CUPTI_ACTIVITY_KIND_MEMCPY,CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL > $HOME/global_traces/rank${id}.json
43 | done
44 | for (( id=8; id < 16; id++ )); do
45 | python3 $HOME/nvprof2json/nvprof2json.py --filename $HOME/global_traces/host1/simple.${id}.nvprof --filter CUPTI_ACTIVITY_KIND_MEMCPY,CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL > $HOME/global_traces/rank${id}.json
46 | done
47 | ```
48 |
49 | ---
50 | ## Train xla cost model
51 | ```
52 | cd ${HOME}/
53 | rm -rf dpro
54 | git clone https://github.com/joapolarbear/dpro.git
55 | cd dpro && sudo bash setup.sh
56 |
57 | cd ${HOME}/
58 | wget https://github.com/joapolarbear/tensorflow/releases/download/v2.4.1-dev.2.0.1/dpro_xla_tools.zip
59 | unzip dpro_xla_tools.zip
60 | export BPF_TF_PATH=${HOME}/dpro_xla_tools
61 | sudo ln -sf /usr/local/lib/python3.7/dist-packages/tensorflow/libtensorflow_framework.so.2 /usr/lib/
62 | ```
63 |
64 | ## The GPU id to run profiling on (specify one GPU only)
65 | ```
66 | export BPF_COST_MODEL_PROFILE_GPU="0"
67 | export CUDA_VISIBLE_DEVICES=0
68 |
69 | cd ${HOME}
70 | COMM_BACKEND_LAUNCHER="python3 /usr/local/byteps/launcher/launch.py python3 test.py --comm_backend bps"
71 | COMM_BACKEND_LAUNCHER="horovod -np 1 python3 test.py"
72 | ```
73 | ### RUN
74 | ```
75 | $COMM_BACKEND_LAUNCHER
76 | ALL_TRACE_DIR=${HOME}/trace_dirs_vgg16
77 | mv $HOME/traces $ALL_TRACE_DIR
78 |
79 | export XLA_DUMP_DIR=${HOME}/xla_dump
80 | mkdir -p $XLA_DUMP_DIR
81 | TF_DUMP_GRAPH_PREFIX=${XLA_DUMP_DIR} TF_XLA_FLAGS="--tf_xla_clustering_debug --tf_xla_auto_jit=2" $COMM_BACKEND_LAUNCHER
82 |
83 | export DPRO_GRAPHDEF_DFG_PATH=${XLA_DUMP_DIR}/graphdef_dag.gml
84 | export TRACE_DIR=$ALL_TRACE_DIR/0
85 | export OUTPUT_DIR="${HOME}/xla_vgg16"
86 | mkdir -p $OUTPUT_DIR
87 |
88 | NUM_RANDOM_SAMPLES=5000
89 | MAX_CLUSTER_SAMPLES=5
90 | MIN_CLUSTER_SIZE=4
91 | MAX_CLUSTER_SIZE=800
92 |
93 | cd ${HOME}/dpro
94 | python3 xla_cm_entry.py --mode 0 \
95 | --trace_dir ${TRACE_DIR} \
96 | --output_dir ${OUTPUT_DIR} \
97 | --num_samples ${NUM_RANDOM_SAMPLES} \
98 | --max_cluster_samples ${MAX_CLUSTER_SAMPLES} \
99 | --min_cluster_size ${MIN_CLUSTER_SIZE} \
100 | --max_cluster_size ${MAX_CLUSTER_SIZE} \
101 | --batch_size 256
102 |
103 | ```
104 | ## TEST the searched results
105 | ```
106 | hdfs dfs -rm -r /usr/hphu/search_rst && hdfs dfs -mkdir /usr/hphu/search_rst
107 |
108 | function put_spec_to_hdfs {
109 | hdfs dfs -put $1/spec /usr/hphu/search_rst/$1_spec
110 | }
111 |
112 | put_spec_to_hdfs 20210929_01_bps_tf_resnet50_tcp_2w8g2s_tsfs_tspart_optws
113 |
114 | hdfs dfs -ls /usr/hphu/search_rst
115 | ```
116 |
117 |
118 | # BytePS
119 |
120 | ## 重装byteps
121 | ```
122 | cd /usr/local/byteps && git pull && git submodule update
123 | cd /usr/local/byteps/3rdparty/ps-lite && make clean && make -j USE_RDMA=1 && \
124 | cd /usr/local/byteps/ && rm -rf build && \
125 | BYTEPS_USE_RDMA=1 BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_MXNET=1 python3 setup.py install
126 | ```
127 | ### Test byteps
128 | ```
129 | export DMLC_ROLE=scheduler
130 | export DMLC_ROLE=worker
131 | export DMLC_WORKER_ID=0
132 |
133 | export DMLC_NUM_WORKER=2
134 | export DMLC_NUM_SERVER=1
135 | export DMLC_PS_ROOT_URI=10.129.120.196
136 | export DMLC_PS_ROOT_PORT=8008
137 |
138 | unset NCCL_ALGO
139 | unset NCCL_DEBUG_SUBSYS
140 | unset NCCL_DEBUG
141 | unset NCCL_TRACE_START_STEP
142 | unset NCCL_TRACE_DIR
143 | unset NCCL_TRACE_END_STEP
144 | unset NCCL_ENABLE_TIMELINE
145 | export BYTEPS_LOG_LEVEL=INFO
146 |
147 | cd $HOME/bert && sudo git checkout b_tf2_4
148 | python3 /usr/local/byteps/launcher/launch.py python3 $HOME/bert/run_pretraining.py
149 | ```
150 |
151 | # NCCL Contention 测试
152 | ```
153 | rm -rf /usr/local/nccl
154 | pip3 uninstall -y horovod
155 |
156 | cd /usr/local && git clone https://github.com/NVIDIA/nccl.git
157 | cd /usr/local/nccl && git checkout v2.10.3-1
158 | rm -rf /usr/include/nccl.h
159 |
160 | make -j src.build && make pkg.txz.build
161 | tar -Jxf ./build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1
162 | echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
163 | ldconfig && ln -sf /usr/local/nccl/include/* /usr/include/
164 |
165 | HOROVOD_NCCL_HOME=/usr/local/nccl \
166 | HOROVOD_NCCL_HOME=/usr/local/nccl \
167 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL \
168 | HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 \
169 | pip3 install --no-cache-dir horovod==0.21.0
170 | ```
171 |
172 |
--------------------------------------------------------------------------------
/docs/dependency.md:
--------------------------------------------------------------------------------
1 |
2 | # 3rdparty version
3 |
4 | ## Frameworks
5 | * [MXNet](https://github.com/joapolarbear/incubator-mxnet/tree/mlsys2022)
6 | * [TensorFlow](https://github.com/joapolarbear/tensorflow/tree/mlsys2022)
7 | * [BytePS](https://github.com/joapolarbear/byteps/tree/mlsys2022)
8 | * [pslite](https://github.com/joapolarbear/ps-lite/tree/mlsys2022)
9 | * [ZMQ](https://github.com/chenyu-jiang/libzmq/commit/5ed25589f000dc613e1a8575ba193eb78eb9b86e)
10 | * [Horovod](https://github.com/joapolarbear/horovod/tree/mlsys2022)
11 | * [NCCL](https://github.com/joapolarbear/nccl/tree/mlsys2022)
12 |
13 |
14 | ## Benchmarks
15 | * [BERT]( https://github.com/joapolarbear/bert/tree/mlsys2022)
16 | * [gluon-nlp](https://github.com/joapolarbear/gluon-nlp/tree/mlsys2022)
17 |
18 | ## Tools
19 | * [spdlog](https://github.com/gabime/spdlog/commit/6aafa89d20eef25ec75462ffb7eedc328f135638)
20 | * [nvprof2json](https://github.com/joapolarbear/nvprof2json): convert nvprof results to JSON format
21 | * [catapult](https://github.com/joapolarbear/catapult): convert JSON files to a HTML in the format of chrome://tracing.
22 |
23 |
24 | # Installation
25 |
26 | ## TensorFlow
27 |
28 | You can installed our compiled version of TensorFlow if you are using python3.7
29 | ```
30 | wget https://github.com/joapolarbear/tensorflow/releases/download/v2.4.1-dev.2.0.2/tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl && \
31 | pip3 --no-cache-dir install --force-reinstall tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl && \
32 | rm tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl
33 | ```
34 |
35 | Or you can build our customized TensorFlow yourself using bazel. First, clone our customized TensorFlow
36 | ```
37 | git clone --recurse-submodules -b r2.4_dev https://github.com/joapolarbear/tensorflow.git
38 | cd tensorflow
39 | ```
40 | Then, you need to config the building process, if you are using python3.7 and cuda11, you can also use our configuration file
41 | ```
42 | cp tools/sample_config/cuda11.3_python3.7 .tf_configure.bazelrc
43 | ```
44 | Install dependencies
45 | ```
46 | pip3 install -U --user keras_applications --no-deps
47 | pip3 install -U --user keras_preprocessing --no-deps
48 | ```
49 | Pin default python to python3.7
50 | ```
51 | ln -sf /usr/bin/python3 /usr/bin/python
52 | ```
53 |
54 | Then, follow the commands below to build and install TensorFlow.
55 | ```
56 | cd /root/tensorflow && bazel build -j 32 --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
57 | ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
58 | ls -lh /tmp/tensorflow_pkg
59 | pip3 --no-cache-dir install --force-reinstall /tmp/tensorflow_pkg/tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl
60 | bazel clean && ln -sf /usr/bin/python2.7 /usr/bin/python && rm -rf /tmp/tensorflow_pkg/*
61 | rm -rf tensorflow && rm -rf /var/lib/apt/lists/*
62 | ```
63 |
64 | ## MXNet
65 | ```
66 | cd customized-mxnet
67 | make clean_all && make -j16 USE_OPENCV=1 \
68 | USE_BLAS=openblas \
69 | USE_CUDNN=1 \
70 | USE_CUDA=1 \
71 | USE_CUDA_PATH=/usr/local/cuda \
72 | USE_MKLDNN=0 \
73 | USE_DIST_KVSTORE=1 \
74 | USE_NCCL=1 \
75 | USE_NCCL_PATH=/usr/local/nccl
76 | cd python
77 | python3 setup.py build
78 | python3 setup.py install
79 | python3 setup.py bdist_wheel
80 | ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so.1
81 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
82 | ln -sf /root/customized-mxnet/include $MX_PATH/include && echo $MX_PATH
83 | ```
84 |
85 | ## BytePS + pslite + ZMQ
86 | ```
87 | cd $HOME && git clone https://github.com/gabime/spdlog.git
88 | cd $HOME/spdlog && mkdir build && cd build && cmake .. && make -j && make install
89 | cd $HOME && git clone --single-branch --branch byteprofile_rdma --recurse-submodules https://github.com/joapolarbear/byteps.git
90 | cd $HOME/byteps/3rdparty/ps-lite && make -j USE_RDMA=1
91 | cd $HOME/byteps/
92 | BYTEPS_USE_RDMA=1 BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_MXNET=1 python3 setup.py install
93 | BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_MXNET=1 python3 setup.py bdist_wheel
94 | ```
95 |
96 | ## Horovod + NCCL
97 | Install OpenMPI first
98 | ```
99 | cd $HOME
100 | wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz
101 | rm -rf /usr/lib/x86_64-linux-gnu/openmpi
102 | tar -xvf openmpi-4.0.3.tar.gz && cd openmpi-4.0.3
103 | ./configure --prefix="/usr"
104 | make -j && make all install
105 | ```
106 |
107 | Then install NCCL
108 | ```
109 | cd $HOME && git clone --recurse-submodules -b byteprofile https://github.com/joapolarbear/nccl.git
110 | rm -rf /usr/include/nccl.h
111 | cd $HOME/nccl && make -j src.build && make pkg.txz.build
112 | mkdir -p $HOME/nccl
113 | tar -Jxf ./build/pkg/txz/nccl*.txz -C $HOME/nccl/ --strip-components 1
114 | echo "$HOME/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf
115 | ldconfig && ln -sf $HOME/nccl/include/* /usr/include/
116 | ```
117 |
118 | And install Horovod
119 | ```
120 | cd $HOME && git clone --recurse-submodules -b b_v0.21.0 https://github.com/joapolarbear/horovod
121 | cd $HOME/horovod && python3 setup.py sdist
122 | pip3 install cloudpickle psutil pyyaml cffi==1.4.0 pycparser
123 | HOROVOD_NCCL_HOME=$HOME/nccl \
124 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL \
125 | HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 \
126 | pip3 install --no-cache-dir dist/horovod*
127 | cp -r $HOME/horovod/examples $HOME/horovod_examples
128 | ```
--------------------------------------------------------------------------------
/docs/format.md:
--------------------------------------------------------------------------------
1 | # Format Specification
2 |
3 | ## Trace Format
4 |
5 | ```python
6 | ### Uniform template
7 | {
8 | "name": op_cat.op_name.sub_op,
9 | "ts": time_in_us,
10 | "dur": time_in_us,
11 | "pid": process_id, # e.g., host0.rank0
12 | "args": {
13 | "name": op_type.op_name.sub_op~>suffix,
14 | ...
15 | "cnt": 2, # how many times the op occurs,
16 | "step": 5, # which step the op is in, may be larger than "cnt"
17 | }
18 | }
19 | ```
20 |
21 |
22 | - `op_cat` is one of `BW`, `FW`, `Comm`, or `UPDATE_`; special cases include `trace["name"] = UPDATE_...` (e.g., `UPDATE_CAL`, `UPDATE_0`, `UPDATE_1`, ...) and `trace["name"] = I/O_...`
23 | - `op_name` is the raw name profiled by the built-in profiler of each ML framework.
24 | - For `Comm`, `sub_op` could be `SEND` or `RECV`, and `suffix` could be `0_1_6_0` (for NCCL), denoting `loopId`, `channelId`, `chunkId`, and `sliceId`, respectively
25 |
26 | We call names that follow the format `op_cat.op_name.sub_op` *standard names*, which can be parsed as shown below.
27 |
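As a minimal illustration of this naming convention (the helper below is ours, not part of dPRO; real `op_name`s come from the framework profilers):

```python
def parse_standard_name(std_name):
    """Split 'op_cat.op_name.sub_op~>suffix' into its components."""
    suffix = None
    if "~>" in std_name:
        std_name, suffix = std_name.split("~>")
    op_cat, op_name, sub_op = std_name.split(".", 2)
    return op_cat, op_name, sub_op, suffix

# An NCCL SEND sub-op; the suffix encodes loopId_channelId_chunkId_sliceId
print(parse_standard_name("Comm.gradient_1.SEND~>0_1_6_0"))
# -> ('Comm', 'gradient_1', 'SEND', '0_1_6_0')
```
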
28 | ### For communication traces
29 | The name should be a tensor index `tensor_id` or `tensor_id_1+tensor_id_2+...+tensor_id_n`, and the corresponding `tensor_name` should be stored in the `gradient_name_list` field of `/metadata.json`.
30 |
31 | ### Detailed communication traces
32 | `"comm_detail"` in `trace["tid"]`
33 |
34 |
35 | ## Trace Statistic Format
36 | ``` python
37 | name2sta = {
38 |     op_long_name: {
39 |         "avg": ...,        # average duration (us)
40 |         "cnt": ...,        # number of occurrences
41 |         "time": ...,       # total accumulated duration (us)
42 |         "min_t": ...,      # minimum duration (us)
43 |         "max_t": ...,      # maximum duration (us)
44 |         "id": ...,         # the order in which the op is created in the dict
45 |         "step_ids": [...]  # indexes denoting where this operator appears in the traces
46 |     }
47 | }
48 |
49 | op_long_name = event["pid"]->event["name"]
50 |             or event["pid"]->event["name"]~>suffix
51 | ```
51 |
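A minimal sketch of how these statistics could be accumulated from trace events (our illustration; dPRO's actual implementation may differ):

```python
def accumulate(name2sta, event, suffix=None):
    op_long_name = event["pid"] + "->" + event["name"]
    if suffix is not None:
        op_long_name += "~>" + suffix
    sta = name2sta.setdefault(op_long_name, {
        "avg": 0, "cnt": 0, "time": 0,
        "min_t": float("inf"), "max_t": 0,
        "id": len(name2sta),  # creation order in the dict
        "step_ids": []})
    sta["cnt"] += 1
    sta["time"] += event["dur"]
    sta["avg"] = sta["time"] / sta["cnt"]
    sta["min_t"] = min(sta["min_t"], event["dur"])
    sta["max_t"] = max(sta["max_t"], event["dur"])
```
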
52 | ## Dependency Graph
53 | Nodes:
54 | ```python
55 | op_long_name: {
56 | "avg": time_in_us,
57 | gap_string:time_in_us
58 | }
59 | ```
60 | `gap_string` denotes different kinds of gaps
61 |
62 | A special node `END` denotes the end of the graph.
63 |
64 | ## NCCL Graph
65 | - During trace collection, the NCCL graph needs to parse at least one GPU's NCCL traces to get `chunkNum`, `sliceNum`, `channelNum`, `loopNum` for each `raw_name` (`op_cat.op_name`, without `sub_op`)
66 | - During trace collection, we need to parse `nccl_rank_graph.json` to get the connection information of this GPU.
67 |
68 | ## ParameterDict
69 | Manages the parameter info of a DNN model. We seek to implement a unified `ParameterDict`, but for now it only supports MXNet.
70 |
71 | ### MXNet
72 | Contains:
73 | - `gradient_name_list`, which maps `tensor_id` to `tensor_name`;
74 | - `tensor2update`, which maps `tensor_id` to `update_id`
75 |
76 |
77 | ## Rules of converting Framework traces
78 | ### Tensorflow
79 | #### UPDATE operators
80 | 1. Take all downstream operators of `Comm` as UPDATE operators
81 | 2. There may be dependencies between UPDATE operators
82 | #### FW and BW operators
83 | 1. **Assumption**: in TensorFlow, some operators may have multiple traces with the same name in one step, which we call sub-traces; we assume they are contiguous and combine them into one single operator.
84 |
85 | #### Counting the number of steps
86 | 1. If `pre_cat` is not in `[io, fw]` and `cur_cat` is in `[io, fw]`, increase the step count by 1, as in the sketch below.
87 |
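A sketch of this rule (the category extraction is illustrative; dPRO's actual category handling may differ):

```python
def category(name):
    cat = name.split(".")[0]  # e.g., "FW", "BW", "Comm", "I/O_..."
    return "I/O" if cat.startswith("I/O") else cat

def count_steps(events):
    """A new step starts whenever the category switches back to I/O or FW."""
    step_cnt, pre_cat = 0, None
    for ev in sorted(events, key=lambda e: e["ts"]):
        cur_cat = category(ev["name"])
        if pre_cat not in ("I/O", "FW") and cur_cat in ("I/O", "FW"):
            step_cnt += 1
        pre_cat = cur_cat
    return step_cnt
```
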
88 | ### MXNET
89 | #### UPDATE operators
90 | 1. We assume there is no dependency between UPDATE operators, except for `UPDATE_CAL`->`UPDATE_ID`
91 |
--------------------------------------------------------------------------------
/docs/nvprof.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | This tutorial introduces how to compare ByteProfile traces and NvProf traces.
4 |
5 | # How to use
6 |
7 | ## Traces Collection
8 |
9 | Use `nvprof` to collect NvProf traces.
10 | ```nvprof -o foo.nvvp your_program```
11 |
12 | In the meanwhile, you should customize the source code and enable the BPF-related environment variables correspondingly to collect ByteProfile traces.
13 |
14 | Then you can run `python3 nvprof2json.py --filename <trace.nvvp> > <trace.json>` to convert NvProf traces from `.nvvp` to JSON format.
15 |
16 | ## Comparison
17 |
18 | For comparison, you can simply run
19 | `python3 analyze.py --option mapping --path <byteprofile_trace_path>,<nvprof_trace_path>`
20 | Then the statistical results will be stored in `mapfrom_op2kernels.xlsx` under the same folder as `<byteprofile_trace_path>`.
21 |
22 | The rationale is:
23 | 1. for the first iteration, check the bias: look for relatively large kernel-level traces that overlap with an op-level trace but are not covered by it
24 | 2. for the second iteration, generate the mapping
--------------------------------------------------------------------------------
/docs/profile.md:
--------------------------------------------------------------------------------
1 | # Profiler
2 | ### Horovod + Tensorflow
3 | Please follow the instructions in [docs/dependency.md](./dependency.md) to install our customized Horovod and TensorFlow.
4 | To enable profiling, add the following code to your script for a training job using Horovod.
5 |
6 | ```
7 | recorder = hvd.Recorder()
8 |
9 | @hvd.profile(recorder)
10 | @tf.function
11 | def benchmark_step(first_batch):
12 | ...
13 | with tf.GradientTape() as tape:
14 | ...
15 | tape = hvd.DistributedGradientTape(tape)
16 | ...
17 | ```
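
For reference, below is a minimal end-to-end sketch (the model, optimizer, and synthetic data are illustrative; `hvd.Recorder` and `hvd.profile` come from our customized Horovod):

```
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
model = tf.keras.applications.ResNet50(weights=None)
opt = tf.optimizers.SGD(0.01)
recorder = hvd.Recorder()

@hvd.profile(recorder)
@tf.function
def benchmark_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss = tf.losses.sparse_categorical_crossentropy(labels, probs)
    # Wrap the tape so gradient synchronization is profiled as well
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    if first_batch:  # sync initial states across workers
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

images = tf.random.uniform([32, 224, 224, 3])
labels = tf.random.uniform([32], maxval=1000, dtype=tf.int64)
for step in range(40):  # cover BYTEPS_TRACE_START_STEP..END_STEP
    benchmark_step(images, labels, step == 0)
```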
18 |
19 | Besides, the following environment variables need to be set:
20 | ```
21 | export BYTEPS_TRACE_ON=1
22 | export BYTEPS_TRACE_DIR=path/to/store/traces
23 | export BYTEPS_TRACE_START_STEP=<start_step>
24 | export BYTEPS_TRACE_END_STEP=<end_step>
25 | ```
26 | Then, launch the distributed training job on a cluster.
27 |
28 |
29 | Before analyzing traces using the dPRO toolkit, you need to collect traces from different workers to one device and organize them in the following manner.
30 | ```
31 | global_traces/
32 | |
33 | - host0/ # traces of device 0
34 | |
35 | - 0/ # traces of GPU 0 on device 0
36 | |
37 | - 1/ # traces of GPU 1 on device 0
38 | ...
39 | |
40 | - host1/
41 | |
42 | ...
43 | ```
--------------------------------------------------------------------------------
/docs/sample_config.yaml:
--------------------------------------------------------------------------------
1 | # Normal arguments
2 | platform: TENSORFLOW
3 | comm_backend: NCCL
4 | nccl_algo: RING
5 | optimizer: MCMC
6 | xla_candidate_path: {{path}}/.xla_dump/unsafe_resource_deps.txt
7 |
8 | # Store-true arguments
9 | store_true:
10 | pretty: 1
11 | layer_by_layer: 1
12 |
13 | # environment variables
14 | env:
15 | DPRO_GRAPHDEF_DFG_PATH: {{path}}/.xla_dump/graphdef_dag.gml
--------------------------------------------------------------------------------
/docs/usage.md:
--------------------------------------------------------------------------------
1 |
2 | This document introduces how to analyze the traces of a distributed training job using the dPRO toolkit.
3 | Suppose `DPRO_PATH` denotes the path where dpro is installed (check by running `python3 -c "import dpro; print(dpro.__path__)"`), and `GLOBAL_TRACE_PATH` represents the path where the final traces are stored.
4 |
5 | ### Basic Usage
6 | You can check the help information using the following commands
7 | ```
8 | dpro_cli -h
9 | dpro_cli --help
10 | ```
11 |
12 | It will output all arguments of `DPRO_PATH/analyze.py`, so you can also use dPRO as a plain Python script:
13 | ```
14 | python3 DPRO_PATH/analyze.py
15 | ```
16 |
17 | It is tedious to set so many arguments every time we want to analyze the traces of a job, so dPRO provides a command line tool, namely `dpro_cli`, which automatically searches for a configuration file in YAML format (file extension `.yaml`).
18 | * The first argument of `dpro_cli` must be `option`, which can be `collect`, `replay` or `optimize`.
19 | * The second argument `path` can be set to `GLOBAL_TRACE_PATH`.
20 | * Users can write a configuration file for each job. [sample_config.yaml](./sample_config.yaml) shows an example with three fields: 1) normal arguments; 2) `store_true` arguments, which correspond to the arguments set with `action=store_true` (see the help info for more details); 3) the `env` field, which allows users to set some environment variables for dPRO.
21 | * `dpro_cli` will automatically substitute `{{path}}` in the config file with the second argument `path`.
22 |
23 | Writing a configuration file pays off when you want to fix some arguments for a job, e.g., `platform` (TENSORFLOW or MXNET), `comm_backend` (BYTEPS or NCCL), and so on. You can also add additional arguments for different analysis methods, e.g., `sub_option` and `optimizer` (DP or MCMC).
24 |
25 |
26 | # Statistic
27 | Users can compute trace statistics using the following commands.
28 | ```
29 | python3 /home/tiger/byteprofile-analysis/analyze.py \
30 | --option collect \
31 | --platform TENSORFLOW \
32 | --comm_backend NCCL --nccl_algo RING --pretty \
33 | --path $GLOBAL_TRACE_PATH
34 | ```
35 | or
36 | ```
37 | dpro_cli collect $GLOBAL_TRACE_PATH
38 | ```
39 |
40 | # Replay
41 | Users can simulate a training job using the following commands.
42 | ```
43 | python3 /home/tiger/byteprofile-analysis/analyze.py \
44 | --option replay \
45 | --platform TENSORFLOW \
46 | --comm_backend NCCL --nccl_algo RING --pretty \
47 | --path $GLOBAL_TRACE_PATH
48 | ```
49 | or
50 | ```
51 | dpro_cli replay $GLOBAL_TRACE_PATH
52 | ```
53 |
54 | ---
55 | # Optimizer
56 |
57 | ## Operator Fusion
58 | ### Search operator fusion strategies
59 | Sample commands (put the XLA cost model under `./cost_model/_xla/.cost_model`):
60 | ```
61 | python3 analyze.py --option optimize --sub_option xla,^memory \
62 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \
63 | --path $GLOBAL_TRACE_PATH --layer_by_layer --mcmc_beta 10 \
64 | --xla_candidate_path data/xla_candidates_resnet.txt
65 | ```
66 | If you do not have an XLA cost model, run the following command to search with estimated fusion time:
67 | ```
68 | python3 analyze.py --option optimize --sub_option xla \
69 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty --simulate \
70 | --path $GLOBAL_TRACE_PATH \
71 | --workspace $GLOBAL_TRACE_PATH \
72 | --xla_candidate_path data/xla_candidates_resnet.txt \
73 | --update_infi_para --layer_by_layer
74 | ```
75 |
76 | ### Sample some example strategies
77 | Fuse operators layer by layer; below is an example where every 2 layers' operators are fused.
78 | ```
79 | python3 analyze.py --option optimize --sub_option xla,^memory \
80 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \
81 | --path path/to/trace/directory \
82 | --xla_candidate_path path/to/candidate/file/ \
83 | --update_infi_para --simulate --layer_num_limit 2
84 | ```
85 |
86 | ## Tensor Fusion
87 | ### Search tensor fusion strategies
88 | Sample commands
89 | ```
90 | python3 analyze.py --option optimize --sub_option tensor_fusion \
91 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \
92 | --path $GLOBAL_TRACE_PATH \
93 | --workspace $GLOBAL_TRACE_PATH
94 | ```
95 | or
96 | ```
97 | dpro_cli optimize $GLOBAL_TRACE_PATH --sub_option tsfs
98 | ```
99 |
100 | ## Combine Tensor Fusion and Operator Fusion
101 | ### Search both tensor fusion and operator fusion strategies
102 | Sample commands
103 | ```
104 | python3 analyze.py --option optimize --sub_option tensor_fusion,xla \
105 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \
106 | --path $GLOBAL_TRACE_PATH \
107 | --workspace $GLOBAL_TRACE_PATH \
108 | --xla_candidate_path /root/byteprofile-analysis/data/xla_candidates_resnet.txt
109 | ```
110 |
111 | ### Generate tensor fusion strategies according to operator fusion strategies
112 | Sample commands
113 | ```
114 | python3 analyze.py --option optimize --sub_option from_opfs2tsfs \
115 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \
116 | --path $GLOBAL_TRACE_PATH,<cluster_mapping_path>
117 | ```
118 | where `<cluster_mapping_path>` denotes the path to `cluster_mapping.txt` (the operator fusion search result).
119 |
120 |
121 | ## Mixed Precision Training
122 | `TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_PRIORLIST_FILE`: a file containing ops to force-quantize, separated by `\n`
123 | `TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_PRIORLIST_ADD`: ops to force-quantize, separated by commas, as in the example below
124 | `TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_FORCE`: clears the CLEARLIST and BLACKLIST if set
125 |
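For example (the op type names below are illustrative):
```
export TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_PRIORLIST_ADD="Conv2D,MatMul"
```
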
126 | ---
127 | # Train Cost Model
128 | ## Cost Model for MultiGPU
129 |
130 | ```
131 | python3 mg_generate_dataset.py --option optimize --sub_option train_gpu --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --path $GLOBAL_TRACE_PATH
132 | ```
133 | `--path` specifies where traces are stored, organized by the GPU model name and ML model name
134 |
135 | ---
136 |
137 | ### Heat-based Search Algorithm for Operator Fusion
138 | #### Requirements of Weights
139 | 1. Initially, each strategy has a weight of 1
140 | 2. If fuse(a, b) generating c brings speedup,
141 |     1. the weights of fusion strategies involving a, b, c > 1
142 |     2. the weights of de-fusion strategies involving a, b, c < 1
143 | 3. If fuse(a, b) generating c leads to worse performance,
144 |     1. the weights of fusion strategies involving a, b, c < 1
145 |     2. the weights of de-fusion strategies involving a, b, c > 1
146 | 4. If defuse(c) generating a, b is better, the same as item 3
147 | 5. If defuse(c) generating a, b is worse, the same as item 2
149 | #### Solution
150 | - The heat is directional, i.e., a large heat means an operator is expected to participate in operator fusion rather than in operator partition
151 | - After applying a strategy, if it is a fusion strategy, record $\Delta T$ in the heat history list; otherwise, record $-\Delta T$ in the heat history list
152 | - To calculate the final heat $H$ of one operator: if the heat history list is empty, return 0; otherwise, return the following, where $\Delta t_i$ denotes how long ago the $i$-th record was added and $k > 1$, thus $H > -1$
153 | $$H = \frac{1}{n}\sum_i^{n}\frac{e^{\Delta T_i} - 1}{k^{\Delta t_i}}$$
154 | - With the heat H, calculate the final weight W as follows
155 | $$W = \left\{
156 | \begin{array}{ll}
157 | 1 + H, & \text{fusion strategy}\\
158 | 1 + \frac{1}{H+1} - 1 = \frac{1}{H+1}, & \text{partition strategy}\\
159 | \end{array} \right.$$
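
A minimal Python sketch of this computation (the `(delta_T, delta_t)` history layout and the names below are our assumptions for illustration, not dPRO's actual API):
```python
import math

def heat(history, k=2.0):
    # history: list of (delta_T, delta_t) pairs, where delta_T is the signed
    # speedup recorded for a strategy and delta_t is how long ago it was
    # recorded; k > 1 decays old records, so every term is > -1.
    if not history:
        return 0.0
    return sum((math.exp(dT) - 1.0) / (k ** dt) for dT, dt in history) / len(history)

def weight(H, is_fusion):
    # Fusion strategies are up-weighted when H > 0; partition strategies get
    # weight 1/(H+1), which is well defined since H > -1.
    return 1.0 + H if is_fusion else 1.0 / (H + 1.0)
```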
160 |
161 |
162 | ## Apply strategies
163 |
164 | ### Operator Fusion
165 | XLA is enabled by setting `TF_XLA_FLAGS="--tf_xla_auto_jit=2"`. To apply customized XLA clustering strategies, set `XLA_CLUSTER_SPEC` to the path of the clustering specification file, where each row is in the format `operator_name` `cluster_id`.
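
For illustration, a clustering spec might look like the following (operator names are hypothetical); operators sharing a `cluster_id` are compiled into the same XLA cluster:
```
network/conv1/Conv2D 0
network/conv1/BiasAdd 0
network/block1/Relu 1
```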
166 |
167 | Besides, we can set `XLA_DUMP_DIR` to the path where intermediate information is stored, which can be used to train the XLA cost model.
168 | * `xla_candidates.txt`: candidates
169 | * `unsafe_resource_deps.txt`: unsafe_resource_deps, __currently this file also contains xla_candidates.__
170 | * `xla_clustering.txt`: the clustering strategies being applied; exists only when using default XLA (`XLA_CLUSTER_SPEC` is not set)
171 |
172 | We can further set `TF_DUMP_GRAPH_PREFIX=${XLA_DUMP_DIR} TF_XLA_FLAGS="--tf_xla_clustering_debug --tf_xla_auto_jit=2"` to dump graph_def.
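
Putting the pieces together, a typical setup might look like this (paths are placeholders):
```
export XLA_CLUSTER_SPEC=/path/to/cluster_spec.txt
export XLA_DUMP_DIR=/path/to/xla_dump
export TF_DUMP_GRAPH_PREFIX=${XLA_DUMP_DIR}
export TF_XLA_FLAGS="--tf_xla_clustering_debug --tf_xla_auto_jit=2"
```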
173 |
174 |
175 | ### Tensor Fusion
176 |
177 | #### BytePS
178 | Use the `byteprofile_rdma` branch of https://github.com/joapolarbear/byteps.git
179 |
180 | You can also fuse tensors into multiple tensor groups by
181 | 1. specifying the number of tensor groups via `BYTEPS_TENSOR_GROUP_NUM=x`, or
182 | 2. using a specification file via `BYTEPS_TENSOR_GROUP_FILE=/path/to/spec`. The file should be a JSON file in the following format, where `0, 1, ...` denote the indices of tensors.
183 | ```
184 | {
185 | "mapping": [
186 | "0+1+2",
187 | "3+4"
188 | ]
189 | }
190 | ```
191 |
192 |
193 | You can also configure the tensor partition size. A smaller size improves BytePS pipelining, but may incur other overheads such as NCCL coordination, ZMQ message headers, etc. The default and recommended value is 4096000 (in bytes).
194 |
195 | ```
196 | export BYTEPS_PARTITION_BYTES=y
197 | ```
198 |
199 | You can also configure the tensor partition size for each tensor using a specification file. Each line of the specification file should follow the format `<tensor_name> <partition_size>`.
200 | ```
201 | export BYTEPS_PARTITION_SPEC_FILE=/path/to/spec
202 | ```
203 | Another way to specify the tensor partition size for a tensor is to use `BYTEPS_PARTITION_SPEC=<tensor_name>=<partition_size>`. You can also specify partition sizes for multiple tensors by separating their specifications with commas.
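
For example, to assign partition sizes to two tensors at once (tensor names are illustrative):
```
export BYTEPS_PARTITION_SPEC="tensor_0=4096000,tensor_1=2048000"
```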
204 |
205 | #### Horovod
206 | Use the `b_v0.21.0` branch of https://github.com/joapolarbear/horovod
207 |
208 | You can also fuse tensors into multiple tensor groups by
209 | 1. specifying the number of tensor groups via `HOROVOD_TENSOR_GROUP_NUM=x`, or
210 | 2. using a specification file via `HOROVOD_TENSOR_GROUP_FILE=/path/to/spec`. The file should be a JSON file in the following format, where `0, 1, ...` denote the indices of tensors. Note that all tensor indices must be specified, even if a tensor is not fused with any other tensor.
211 | ```
212 | {
213 | "mapping": [
214 | "0+1+2",
215 | "3+4"
216 | ]
217 | }
218 | ```
219 |
--------------------------------------------------------------------------------
/dpro/__init__.py:
--------------------------------------------------------------------------------
1 | from . import base
2 | from . import logger_utils
3 | from . import collect
4 | from . import trace_utils
5 | from . import replay
6 |
7 | def init(workspace, name, **kwargs):
8 | from dpro.logger_utils import SingleLogger
9 | logger = SingleLogger(workspace, name, **kwargs)
--------------------------------------------------------------------------------
/dpro/arg_utils.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from .base import Singleton
3 |
4 | parser = argparse.ArgumentParser(description="dPRO Arguments",
5 | formatter_class=argparse.ArgumentDefaultsHelpFormatter)
6 | # parser.add_argument("-s", action="store_true", help="sort the output result")
7 | parser.add_argument("--option", type=str,
8 | choices=["statistic", "graph", "combine", "mapping", "compare", "critical", "timeline", "replay", "topo_sort", "collect", "3dcompare", "optimize"],
9 | help="The type of analysis to process. including:\n" +
10 | "* statistic: show the statistic results\n" +
11 | "* graph: show the dependency graph\n")
12 | parser.add_argument("--sub_option", type=str, default=None, help="Sub options for each option")
13 | parser.add_argument("--path", type=str, required=True, help="The paths of the traces to analyze; multiple paths can be separated by commas.")
14 | parser.add_argument("--del_queue", action="store_true", help="If set, delete the queue time in communication traces.")
15 | parser.add_argument("--logging_level", type=str, default="INFO", help="Logging level")
16 | parser.add_argument("--clean", action="store_true", help="Flush the log file")
17 | parser.add_argument("--pretty", action="store_true", help="Output necessary info if set")
18 | parser.add_argument("--filter", type=str, default=None, help="Used to show a subset of communication operations, separated by commas.")
19 | parser.add_argument("--progress", action="store_true", help="Show the progress bar and disable the std output if set")
20 | parser.add_argument("--debug_traces", action="store_true", help="If set, output traces profiled for the analysis process")
21 |
22 | ### collect
23 | group_clct = parser.add_argument_group('Trace Collection')
24 | group_clct.add_argument("--comm_backend", type=str, default="NCCL", choices=["NCCL", "BYTEPS", "NONE"], help="Communication backend")
25 | group_clct.add_argument("--platform", type=str, default="TENSORFLOW", choices=["TENSORFLOW", "MXNET"], help="Platform used to run the model")
26 | group_clct.add_argument("--nccl_algo", type=str, default=None, help="NCCL algorithm, Tree or Ring")
27 | group_clct.add_argument("--trace_level", type=str, choices=["debug", "info"], default="info", help="if set to debug, show some trivial traces")
28 | group_clct.add_argument("--disable_revise", action="store_true", help="By default, traces are revised according to SEND-RECV dependencies; set this argument to disable the revision")
29 | group_clct.add_argument("--force", action="store_true", help="Force to re-generate traces, graphs")
30 | group_clct.add_argument("--update_infi_para", action="store_true", help="The TensorFlow timeline displays UPDATE traces in parallel; set `update_infi_para` to keep all UPDATE traces")
31 |
32 | ### Used for BytePS traces collection
33 | group_clct_bps = parser.add_argument_group('Trace Collection for BytePS')
34 | group_clct_bps.add_argument("--pcap_file_path", type=str, default=None, help="Path to the directory containing BytePS communication pcap files.")
35 | group_clct_bps.add_argument("--zmq_log_path", type=str, default=None, help="Path to the directory containing BytePS communication zmq log files.")
36 | group_clct_bps.add_argument("--server_log_path", type=str, default=None, help="Path to the directory containing BytePS server log files.")
37 | group_clct_bps.add_argument("--profile_start_step", type=int, default=None, help="The start step of computation profiling. Used for truncating BytePS comm trace.")
38 | group_clct_bps.add_argument("--profile_duration", type=int, default=None, help="The duration (in steps) of computation profiling. Used for truncating BytePS comm trace.")
39 | group_clct_bps.add_argument("--van_type", type=str, choices=["ZMQ", "RDMA"], default=None, help="Type of protocol used in BytePS.")
40 |
41 | ### statistic
42 | group_stat = parser.add_argument_group('Statistic')
43 | group_stat.add_argument("--sort", action="store_true", help="Sorted in descending order")
44 | group_stat.add_argument("--head", type=int, default=None, help="Print the first few lines")
45 | group_stat.add_argument("--xlsx", action="store_true", help="Output XLSX file of the statistic results")
46 |
47 | ### replay
48 | group_replay = parser.add_argument_group('Replayer')
49 | group_replay.add_argument("--update_barrier", action="store_true", default=False, help="If true, add a barrier before all UPDATE ops.")
50 | group_replay.add_argument("--update_clip_overlapping", action="store_true", help="If true, clip overlapping UPDATE nodes in the timeline.")
51 | group_replay.add_argument("--step_num", type=int, default=1, help="Default number of steps to replay.")
52 | group_replay.add_argument("--delay_ratio", type=float, default=1.1, help="delay ratio")
53 | group_replay.add_argument("--full_trace", action="store_true", help="If this arg is set, simulate traces with detailed dependency info.")
54 | group_replay.add_argument("--show_queue", action="store_true", help="If this arg is set, record the queue status of each device during replaying.")
55 |
56 | ### Optimize
57 | group_opt = parser.add_argument_group('Optimal Strategies Search')
58 | group_opt.add_argument("--optimizer", type=str, default="MCMC", choices=["MCTS", "MCMC", "DP"], help="The algorithm used to search the optimal optimization strategy")
59 | group_opt.add_argument("--ucb_type", type=str, default="AVG", choices=["MAX", "AVG"], help="The type of quality value used in the UCB equation")
60 | group_opt.add_argument("--no_mutation", action="store_true", help="If this arg is set, the default policy of MCTS will not rollout")
61 | group_opt.add_argument("--ucb_gamma", type=float, default=0.1, help="Hyper Parameter used in UCB to control the exploration rate.")
62 | group_opt.add_argument("--ucb_visual", action="store_true", help="If this arg is set, visualize the MCTS search process")
63 | group_opt.add_argument("--no_crit", action="store_true", help="If this arg is set, relax the critical path constraint")
64 |
65 | group_opt.add_argument("--mcmc_beta", type=float, default=10, help="Hyper Parameter used in MCMC/SA to control the exploration rate")
66 | group_opt.add_argument("--step_size", type=int, default=1, help="Step size used in MCMC optimizer.")
67 |
68 | group_opt.add_argument("--heat_window_size", type=int, default=5, help="Window size for the heat based search heuristic.")
69 | group_opt.add_argument("--relabel", action="store_true", help="If this arg is set, relabel the dag with indexes.")
70 | group_opt.add_argument("--ckpt", action="store_true", help="If this arg is set, start search from checkpoint")
71 | group_opt.add_argument("--workspace", type=str, default=None, help="Workspace of the optimizer")
72 | group_opt.add_argument("--memory_budget", type=float, default=16, help="GPU Memory budget")
73 |
74 | group_opt.add_argument("--search_ts_group_num", action="store_true", help="Search the optimal tensor group numbers if set")
75 | group_opt.add_argument("--fit_data_save_dir", type=str, default=None, help="Dump the data used to fit the tensor fusion cost model")
76 | group_opt.add_argument("--test_ts_group_num", type=int, default=None, help="Test the simulation result of fusing "
77 |     "tensors into a specific number of tensor groups; default tensor partition size ~ 4 MB")
78 |
79 | ### Operator fusion
80 | group_xla = parser.add_argument_group('Operator Fusion')
81 | group_xla.add_argument("--simulate", action="store_true", help="If this arg is set, simulate the XLA cost model,"
82 | " but still use its rule to determine which operators to fuse.")
83 | group_xla.add_argument("--xla_candidate_path", type=str, default=None, help="XLA candidate path")
84 | group_xla.add_argument("--layer_num_limit", type=str, default=None, help="Sample some operator fusion strategies, "
85 |     "where BW operators are fused layer by layer. "
86 |     "This argument specifies the maximum number of layers that can be fused. "
87 |     "Test multiple values by separating them with commas")
88 | group_xla.add_argument("--layer_by_layer", action="store_true", help="Fuse operators layer by layer if set to true")
89 | group_xla.add_argument("--fusion_once", action="store_true",
90 | help="If set, one op can be fused only once")
91 | group_xla.add_argument("--disable_estimate", action="store_true",
92 |                        help="If set, disable estimating the fused time on failure and raise an error instead")
93 |
94 | args = parser.parse_args()
95 |
96 |
97 | @Singleton
98 | class SingleArg:
99 | def __init__(self):
100 | self.args = args
101 |
--------------------------------------------------------------------------------
/dpro/base.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | dpro_dir = os.path.dirname(__file__)
4 |
5 | #! define a singleton class
6 | def Singleton(cls):
7 | _instance = {}
8 |
9 | def _singleton(*args, **kargs):
10 | if cls not in _instance:
11 | _instance[cls] = cls(*args, **kargs)
12 | return _instance[cls]
13 |
14 | return _singleton
15 |
16 | class bcolors:
17 | ENDC = '\033[0m'
18 | BOLD = '\033[1m'
19 | UNDERLINE = '\033[4m'
20 | CBLINK = '\33[5m'
21 | CBLINK2 = '\33[6m'
22 | CSELECTED = '\33[7m'
23 |
24 | CBLACK = '\33[30m'
25 | CRED = '\33[31m'
26 | CGREEN = '\33[32m'
27 | CYELLOW = '\33[33m'
28 | CBLUE = '\33[34m'
29 | CVIOLET = '\33[35m'
30 | CBEIGE = '\33[36m'
31 | CWHITE = '\33[37m'
32 |
33 | CBLACKBG = '\33[40m'
34 | CREDBG = '\33[41m'
35 | CGREENBG = '\33[42m'
36 | CYELLOWBG = '\33[43m'
37 | CBLUEBG = '\33[44m'
38 | CVIOLETBG = '\33[45m'
39 | CBEIGEBG = '\33[46m'
40 | CWHITEBG = '\33[47m'
41 |
42 | FAIL = '\33[31m'
43 | WARNING = '\33[33m'
44 |
--------------------------------------------------------------------------------
/dpro/bps_helper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/bps_helper/__init__.py
--------------------------------------------------------------------------------
/dpro/cost_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/__init__.py
--------------------------------------------------------------------------------
/dpro/cost_model/_gpu_predict/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_gpu_predict/__init__.py
--------------------------------------------------------------------------------
/dpro/cost_model/_gpu_predict/dim_reduce.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import math
3 | import matplotlib.pyplot as plt
4 |
5 | from mpl_toolkits.mplot3d import Axes3D
6 | from matplotlib.ticker import NullFormatter
7 | from sklearn import manifold
8 | from sklearn.utils import check_random_state
9 |
10 | def init_fig_base(cnt):
11 | h = math.ceil(math.sqrt(cnt))
12 | w = math.ceil(cnt / h)
13 | fig_base = w * 100 + h * 10 + 1
14 | return fig_base, 0
15 |
16 | class DimReducer:
17 | def __init__(self, xdata, ydata):
18 | '''
19 | xdata: numpy.ndarray
20 | data needed to reduce dimension, shape = (n_samples, n_dims)
21 | ydata: numpy.ndarray
22 | label data, shape = (n_samples, 1)
23 | '''
24 | assert xdata.shape[0] > xdata.shape[1], \
25 | "n_samples should be larger than the dimension: (%d, %d) is given"%(xdata.shape[0], xdata.shape[1])
26 |         assert len(ydata.shape) == 1 or ydata.shape[1] == 1, \
27 |             "label should be an (n_samples, 1) vector: {} is given".format(ydata.shape)
28 | self.xdata = xdata
29 | self.ydata = ydata.flatten()
30 |
31 | max_y = max(self.ydata)
32 | min_y = min(self.ydata)
33 | N_CLASS = 10
34 | class_step = (max_y - min_y) / N_CLASS
35 | self.ydata_class = np.floor((self.ydata - min_y) / class_step)
36 |
37 | def do_LLE(self, n_comp=2, n_neib=20, show=None):
38 | from sklearn.manifold import LocallyLinearEmbedding
39 | lle = LocallyLinearEmbedding(n_components=n_comp, n_neighbors=n_neib)
40 | X_reduced = lle.fit_transform(self.xdata)
41 |
42 | if show is not None:
43 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d')
44 | plt.title('LLE with k = {}'.format(n_neib), size=12)
45 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class)
46 | ax.view_init(20, -19)
47 |
48 | return X_reduced
49 |
50 | def do_MDS(self, n_comp=2, show=None):
51 | from sklearn.manifold import MDS
52 | model = MDS(n_components=n_comp)
53 | X_reduced = model.fit_transform(self.xdata)
54 | if show is not None:
55 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d')
56 | plt.title('MDS', size=12)
57 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class)
58 | ax.view_init(20, -19)
59 | return X_reduced
60 |
61 | def do_LDA(self, n_comp=2, show=None):
62 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
63 | lda = LDA(n_components=n_comp)
64 | X_reduced = lda.fit_transform(self.xdata, self.ydata_class)
65 | if show is not None:
66 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d')
67 | plt.title('LDA', size=12)
68 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class)
69 | ax.view_init(20, -19)
70 | return X_reduced
71 |
72 | def do_reduction(self, n_comp=2, algo='LLE', show=True):
73 | if show:
74 | self.fig = plt.figure(figsize = (9, 8))
75 | plt.style.use('default')
76 |
77 | if isinstance(algo, str):
78 | if show:
79 | self.fig_base, _ = init_fig_base(1)
80 |             if algo == 'LLE':
81 |                 X_reduced = self.do_LLE(n_comp=n_comp, show=0 if show else None)
82 |             elif algo == 'MDS':
83 |                 X_reduced = self.do_MDS(n_comp=n_comp, show=0 if show else None)
84 |             elif algo == 'LDA':
85 |                 X_reduced = self.do_LDA(n_comp=n_comp, show=0 if show else None)
86 | else:
87 | raise ValueError("Invalid algorithm: {}".format(algo))
88 | if show:
89 | plt.show()
90 | return X_reduced
91 | elif isinstance(algo, list):
92 | if show:
93 | self.fig_base, _ = init_fig_base(len(algo))
94 | ret = []
95 | for idx, _algo in enumerate(algo):
96 |                 if _algo == 'LLE':
97 |                     X_reduced = self.do_LLE(n_comp=n_comp, show=idx if show else None)
98 |                 elif _algo == 'MDS':
99 |                     X_reduced = self.do_MDS(n_comp=n_comp, show=idx if show else None)
100 |                 elif _algo == 'LDA':
101 |                     X_reduced = self.do_LDA(n_comp=n_comp, show=idx if show else None)
102 | else:
103 | raise ValueError("Invalid algorithm: {}".format(_algo))
104 | ret.append(X_reduced)
105 | if show:
106 | plt.show()
107 | return ret
108 |
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/dpro/cost_model/_mixed_precision/.cost_model/CastToFp16.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/.cost_model/CastToFp16.txt
--------------------------------------------------------------------------------
/dpro/cost_model/_mixed_precision/.cost_model/CastToFp32.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/.cost_model/CastToFp32.txt
--------------------------------------------------------------------------------
/dpro/cost_model/_mixed_precision/.cost_model/Conv2D.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/.cost_model/Conv2D.txt
--------------------------------------------------------------------------------
/dpro/cost_model/_mixed_precision/.cost_model/MatMul.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/.cost_model/MatMul.txt
--------------------------------------------------------------------------------
/dpro/cost_model/_mixed_precision/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/__init__.py
--------------------------------------------------------------------------------
/dpro/cost_model/_mixed_precision/dim_reduce.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import math
3 | import matplotlib.pyplot as plt
4 |
5 | from mpl_toolkits.mplot3d import Axes3D
6 | from matplotlib.ticker import NullFormatter
7 | from sklearn import manifold
8 | from sklearn.utils import check_random_state
9 |
10 | def init_fig_base(cnt):
11 | h = math.ceil(math.sqrt(cnt))
12 | w = math.ceil(cnt / h)
13 | fig_base = w * 100 + h * 10 + 1
14 | return fig_base, 0
15 |
16 | class DimReducer:
17 | def __init__(self, xdata, ydata):
18 | '''
19 | xdata: numpy.ndarray
20 | data needed to reduce dimension, shape = (n_samples, n_dims)
21 | ydata: numpy.ndarray
22 | label data, shape = (n_samples, 1)
23 | '''
24 | assert xdata.shape[0] > xdata.shape[1], \
25 | "n_samples should be larger than the dimension: (%d, %d) is given"%(xdata.shape[0], xdata.shape[1])
26 |         assert len(ydata.shape) == 1 or ydata.shape[1] == 1, \
27 |             "label should be an (n_samples, 1) vector: {} is given".format(ydata.shape)
28 | self.xdata = xdata
29 | self.ydata = ydata.flatten()
30 |
31 | max_y = max(self.ydata)
32 | min_y = min(self.ydata)
33 | N_CLASS = 10
34 | class_step = (max_y - min_y) / N_CLASS
35 | self.ydata_class = np.floor((self.ydata - min_y) / class_step)
36 |
37 | def do_LLE(self, n_comp=2, n_neib=20, show=None):
38 | from sklearn.manifold import LocallyLinearEmbedding
39 | lle = LocallyLinearEmbedding(n_components=n_comp, n_neighbors=n_neib)
40 | X_reduced = lle.fit_transform(self.xdata)
41 |
42 | if show is not None:
43 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d')
44 | plt.title('LLE with k = {}'.format(n_neib), size=12)
45 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class)
46 | ax.view_init(20, -19)
47 |
48 | return X_reduced
49 |
50 | def do_MDS(self, n_comp=2, show=None):
51 | from sklearn.manifold import MDS
52 | model = MDS(n_components=n_comp)
53 | X_reduced = model.fit_transform(self.xdata)
54 | if show is not None:
55 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d')
56 | plt.title('MDS', size=12)
57 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class)
58 | ax.view_init(20, -19)
59 | return X_reduced
60 |
61 | def do_LDA(self, n_comp=2, show=None):
62 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
63 | lda = LDA(n_components=n_comp)
64 | X_reduced = lda.fit_transform(self.xdata, self.ydata_class)
65 | if show is not None:
66 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d')
67 | plt.title('LDA', size=12)
68 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class)
69 | ax.view_init(20, -19)
70 | return X_reduced
71 |
72 | def do_reduction(self, n_comp=2, algo='LLE', show=True):
73 | if show:
74 | self.fig = plt.figure(figsize = (9, 8))
75 | plt.style.use('default')
76 |
77 | if isinstance(algo, str):
78 | if show:
79 | self.fig_base, _ = init_fig_base(1)
80 |             if algo == 'LLE':
81 |                 X_reduced = self.do_LLE(n_comp=n_comp, show=0 if show else None)
82 |             elif algo == 'MDS':
83 |                 X_reduced = self.do_MDS(n_comp=n_comp, show=0 if show else None)
84 |             elif algo == 'LDA':
85 |                 X_reduced = self.do_LDA(n_comp=n_comp, show=0 if show else None)
86 | else:
87 | raise ValueError("Invalid algorithm: {}".format(algo))
88 | if show:
89 | plt.show()
90 | return X_reduced
91 | elif isinstance(algo, list):
92 | if show:
93 | self.fig_base, _ = init_fig_base(len(algo))
94 | ret = []
95 | for idx, _algo in enumerate(algo):
96 |                 if _algo == 'LLE':
97 |                     X_reduced = self.do_LLE(n_comp=n_comp, show=idx if show else None)
98 |                 elif _algo == 'MDS':
99 |                     X_reduced = self.do_MDS(n_comp=n_comp, show=idx if show else None)
100 |                 elif _algo == 'LDA':
101 |                     X_reduced = self.do_LDA(n_comp=n_comp, show=idx if show else None)
102 | else:
103 | raise ValueError("Invalid algorithm: {}".format(_algo))
104 | ret.append(X_reduced)
105 | if show:
106 | plt.show()
107 | return ret
108 |
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/dpro/cost_model/_mixed_precision/test_rst.py:
--------------------------------------------------------------------------------
1 | ''' This script is used to collect the TF AMP strategy and test the mixed precision search results'''
2 | import re
3 | import os
4 | import json
5 | import argparse
6 |
7 | parser = argparse.ArgumentParser(description="AMP Parser",
8 | formatter_class=argparse.ArgumentDefaultsHelpFormatter)
9 | parser.add_argument("--option", type=str, required=True, help="option.")
10 | parser.add_argument("--env", type=str, default="", help="environment.")
11 | parser.add_argument("--cmd", type=str, default=None, help="command.")
12 |
13 | parser.add_argument("--amp_rst_path", type=str, default=None, help="amp_rst_path.")
14 | parser.add_argument("--search_rst_path", type=str, default=None, help="search_rst_path.")
15 | parser.add_argument("--timeline_path", type=str, default=None, help="timeline_path.")
16 | parser.add_argument("--target_path", type=str, default=None, help="target_path.")
17 |
18 | args = parser.parse_args()
19 |
20 | def read_amp_fp16_ops(_path):
21 | with open(_path, "r") as f:
22 | fp16_ops = json.load(f)
23 | fp16_ops = fp16_ops['names']
24 |     # Entries may be raw log lines; extract the node names if so
25 |     if fp16_ops and "DT_HALF" in fp16_ops[0]:
26 |         fp16_ops = [l.split("node ")[1].split(" to DT_HALF")[0] for l in fp16_ops]
26 | return fp16_ops
27 |
28 | def read_search_fp16_ops(_path):
29 | with open(_path, "r") as f:
30 | fp16_ops = json.load(f)
31 | fp16_ops = fp16_ops['best_strategy']
32 | fp16_ops = [n[1].split("->")[1].split(".")[1] for n in fp16_ops]
33 | return fp16_ops
34 |
35 | if args.option == "parse":
36 | os.system("rm nohup.out")
37 | env = ""
38 | if len(args.env) > 0:
39 | env = " ".join(args.env.split(","))
40 | if args.cmd is None:
41 | cmd = "python3 /opt/tiger/horovod_examples/tensorflow_synthetic_benchmark.py --num-warmup-batches 1 --num-batches-per-iter 1 --num-iters 1 --amp"
42 | else:
43 | cmd = args.cmd
44 |
45 | print(env + " TF_CPP_MIN_LOG_LEVEL=0 TF_CPP_MIN_VLOG_LEVEL=2 nohup {}".format(cmd))
46 | os.system(env + " TF_CPP_MIN_LOG_LEVEL=0 TF_CPP_MIN_VLOG_LEVEL=2 nohup {}".format(cmd))
47 |
48 | with open("nohup.out", 'r') as f:
49 | result = f.read()
50 |
51 | ret = {}
52 | lines = re.findall("Converted [0-9]+/[0-9]+ nodes to "
53 | "float16 precision using [0-9]+ cast\(s\) to "
54 | "float16 \(excluding Const and Variable casts\)", result)
55 | if len(lines) > 0:
56 | print(lines[0])
57 | ret["info"] = lines[0]
58 |
59 | lines = re.findall("Changing type .+ of "
60 | ".+ node .+ to DT_HALF", result)
61 | print("check change {} nodes type".format(len(lines)))
62 |
63 | ret["names"] = [l.split("node ")[1].split(" to DT_HALF")[0] for l in lines]
64 | with open("amp_result.json", "w") as f:
65 | json.dump(ret, f)
66 | os.system("rm nohup.out")
67 | elif args.option == "paint":
68 | with open(args.timeline_path, "r") as f:
69 | traces = json.load(f)
70 |
71 | fp16_ops_list = [read_amp_fp16_ops(args.amp_rst_path), read_search_fp16_ops(args.search_rst_path)]
72 |
73 | rst_traces = []
74 | one_pid = None
75 | for trace in traces:
76 | if "Comm" in trace["name"]:
77 | continue
78 | if one_pid is None:
79 | one_pid = trace["pid"]
80 | elif one_pid != trace["pid"]:
81 | continue
82 |
83 | if trace["args"]["step"] != 0:
84 | continue
85 |
86 | raw_name = trace["name"].split(".")[1]
87 |
88 | is_fp16 = [False, False]
89 |
90 | new_trace = trace.copy()
91 | if raw_name in fp16_ops_list[0]:
92 | new_trace["name"] = "Single float16"
93 | is_fp16[0] = True
94 | else:
95 | new_trace["name"] = "Double float32"
96 | new_trace["pid"] = "TF AMP"
97 | new_trace["tid"] ="default"
98 | rst_traces.append(new_trace)
99 |
100 | new_trace = trace.copy()
101 | if raw_name in fp16_ops_list[1]:
102 | new_trace["name"] = "Single float16"
103 | is_fp16[1] = True
104 | else:
105 | new_trace["name"] = "Double float32"
106 | new_trace["pid"] = "Search Result"
107 | new_trace["tid"] ="default"
108 | rst_traces.append(new_trace)
109 |
110 | if is_fp16[0] != is_fp16[1]:
111 | print("{} - TF_AMP:{}, Search Result:{}".format(raw_name,
112 | "float16" if is_fp16[0] else "float32",
113 | "float16" if is_fp16[1] else "float32"))
114 |
115 | with open(args.target_path, "w") as f:
116 | json.dump(rst_traces, f)
117 |
118 | elif args.option == "show":
119 | if (args.amp_rst_path is not None and args.search_rst_path is not None) or (args.amp_rst_path is None and args.search_rst_path is None):
120 | raise ValueError("Please input one and only one path")
121 |
122 | if args.amp_rst_path is not None:
123 | fp16_ops = read_amp_fp16_ops(args.amp_rst_path)
124 | print("TF AMP: ", ",".join(fp16_ops))
125 |
126 | if args.search_rst_path is not None:
127 | fp16_ops = read_search_fp16_ops(args.search_rst_path)
128 | print("Search Results: ", ",".join(fp16_ops))
129 |
130 | if args.target_path is not None:
131 | with open(args.target_path, 'w') as f:
132 | for op in fp16_ops:
133 | f.write(op + "\n")
134 |
135 |
136 |
--------------------------------------------------------------------------------
/dpro/cost_model/_tsfs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_tsfs/__init__.py
--------------------------------------------------------------------------------
/dpro/cost_model/_tsfs/cost_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import json
4 |
5 | from ...logger_utils import SingleLogger
6 | from ...base import bcolors
7 |
8 | def piecewise_linear_3seg(x, x0, y0, x1, y1, k2):
9 | return np.piecewise(x, [x <= x0, x > x1],
10 | [
11 | lambda x: y0,
12 | lambda x: (x - x1) + y1,
13 | lambda x: k2 * (x - x0) + y0,])
14 | p0_3seg = (1, 0, 6, 0, 1)
15 |
16 | def piecewise_linear_2seg(x, x0, y0):
17 | return np.piecewise(x, [x <= x0],
18 | [
19 | lambda x: y0,
20 | lambda x: (x - x0) + y0,])
21 | p0_2seg = (6, 0)
22 |
23 |
24 | class DataRepo:
25 | def __init__(self, tensor_time):
26 | self.para_2seg = None
27 | self.para_3seg = None
28 | self.tensor_time = tensor_time
29 |
30 | def dumps(self):
31 | print("2 seg: ", self.array_str(self.para_2seg))
32 | print("3 seg: ", self.array_str(self.para_3seg))
33 |
34 | def array_str(self, a):
35 | return "[" + ", ".join([str(n) for n in a]) + "]"
36 |
37 | def wrap_predict(func, para, xdata):
38 | pred_ydata = func(np.log10(xdata), *para)
39 | return np.power(10, pred_ydata)
40 | # pred_ydata = func(xdata, *para)
41 | # return pred_ydata
42 |
43 | def test_accuracy(func, para, xdata, ydata):
44 | pred_ydata = wrap_predict(func, para, xdata)
45 | mape = np.average(np.abs(pred_ydata - ydata) / ydata)
46 | return mape
47 |
48 | ### TCP
49 | intra_2GPU_para = DataRepo(None)
50 | intra_2GPU_para.para_2seg = [6.478717760741668, -0.7911850258660735]
51 | intra_2GPU_para.para_3seg = [5.768569837527714, -0.8112763281978731, 7.378590861143234, 0.07736945356154445, 0.4601007391482461]
52 | inter_100Gb_para = DataRepo(None)
53 | inter_100Gb_para.para_2seg = [5.72967574893935, 0.27409744017295945]
54 | inter_100Gb_para.para_3seg = [5.481425042939888, 0.24998168803732868, 523.1069698319661, 517.6116145143503, 0.8976445312387689]
55 |
56 | ### 20210909 profile PUSH and PULL standalone in 1wk*1gpu 1server
57 | push_data = DataRepo(None)
58 | push_data.para_2seg = [4.686307490183, -1.662961882088019]
59 | push_data.para_3seg = [4.846827061369098, -1.6483260907019037, 626.2712890335985, 619.9568948850784, 1.1001192383975844]
60 | pull_data = DataRepo(None)
61 | pull_data.para_2seg = [4.803492695527605, -1.5562480802345402]
62 | pull_data.para_3seg = [4.961341192845001, -1.5523328848981286, 626.2723641952061, 619.9558183092073, 1.119712390211427]
63 |
64 | ### 20210916 profile PUSH and PULL standalone in 2wk*8gpu 2server
65 | push_data = DataRepo(None)
66 | push_data.para_2seg = [4.659495844497468, -1.7521176854189915]
67 | push_data.para_3seg = [4.70781790174102, -1.7521176854008658, 626.2991919900213, 619.9289905166377, 1.0343874361266248]
68 | pull_data = DataRepo(None)
69 | pull_data.para_2seg = [4.671509439964712, -1.6513319055098747]
70 | pull_data.para_3seg = [4.7581024691199625, -1.6513319055201428, 626.2641292852817, 619.9640547461936, 1.063909142047732]
71 |
72 | ### 20210926 profile PUSH and PULL in a completed ResNet50 model
73 | # push_data = DataRepo(None)
74 | # push_data.para_3seg = [1.394117768140235, -2.7347276537801393, 6.265639039503503, 0.12958045808905536, 0.7140396422668894]
75 | # pull_data = DataRepo(None)
76 | # pull_data.para_3seg = push_data.para_3seg
77 | # # pull_data.para_3seg = [2.866912655486207, -2.4697033402682265, 4.575557144977073, -1.5755571450053836, 1.3329606768736635]
78 |
79 | # push_data = DataRepo(None)
80 | # push_data.para_3seg = [2.403217615362119, -1.548809007772742, 9.522663714692643, 3.443890309353957, 0.5209623472565877]
81 | # pull_data = DataRepo(None)
82 | # pull_data.para_3seg = push_data.para_3seg
83 |
84 |
85 | # data_table_path = "/home/tiger/sub_op2tensor_size2avg.json"
86 | # data_table_path = "/home/tiger/sub_op2tensor_size2avg_tcp_vgg16.json"
87 | # data_table_path = "/home/tiger/sub_op2tensor_size2avg_tcp_icptv3.json"
88 | data_table_path = os.environ.get("SUBOP_TENSORSIZE_AVG_PATH", None)
89 | if data_table_path is None:
90 | data_table = None
91 | else:
92 | with open(data_table_path, 'r') as fp:
93 | data_table = json.load(fp)
94 | SingleLogger().info(bcolors.CGREEN + \
95 | "Read tensor_size2avg mapping from {}".format(data_table_path) + bcolors.ENDC)
96 |
97 | def interpolation(tensor_size, tensor_size2avg):
98 | tensor_size_list = list(tensor_size2avg.keys())
99 | available_tensor_size = sorted(
100 | enumerate([float(key) for key in tensor_size_list]),
101 | key=lambda x: x[1])
102 | i = 0
103 | while i < len(available_tensor_size):
104 | if tensor_size < available_tensor_size[i][1]:
105 | break
106 | i += 1
107 | if i == 0:
108 | i = 1
109 | SingleLogger().warn("[TSFS CM] small tensor size {}".format(tensor_size))
110 | elif i == len(available_tensor_size):
111 | i = len(available_tensor_size) - 1
112 | SingleLogger().warn("[TSFS CM] large tensor size {}".format(tensor_size))
113 | x1, x2 = available_tensor_size[i-1][1], available_tensor_size[i][1]
114 | y1 = tensor_size2avg[tensor_size_list[available_tensor_size[i-1][0]]]
115 | y2 = tensor_size2avg[tensor_size_list[available_tensor_size[i][0]]]
116 | return ((y1 - y2) / (x1 - x2)) * (tensor_size - x1) + y1
117 |
118 | piecewise_linear_func = piecewise_linear_3seg
119 |
120 | def predict_ps_intra_comm_time(tensor_size):
121 | return wrap_predict(piecewise_linear_func, intra_2GPU_para.para_3seg, tensor_size)
122 |
123 | USE_INTERPOLATION=False
124 | if USE_INTERPOLATION and data_table is None:
125 | SingleLogger().error("{} must be set if interpolation is used".format("SUBOP_TENSORSIZE_AVG_PATH"))
126 | exit(-1)
127 |
128 | def predict_ps_inter_comm_time(tensor_size, is_push):
129 | if USE_INTERPOLATION:
130 | if is_push:
131 | return interpolation(tensor_size, data_table["PUSH_REQ"])
132 | else:
133 | return interpolation(tensor_size, data_table["PULL_RES"])
134 | else:
135 | RATIO = 1.8
136 | if is_push:
137 | return RATIO * wrap_predict(piecewise_linear_func, push_data.para_3seg, tensor_size)
138 | else:
139 | return RATIO * wrap_predict(piecewise_linear_func, pull_data.para_3seg, tensor_size)
140 | ### 20210827_01: Previous method using coarse grained profiled push_pull time
141 | # all_time = wrap_predict(piecewise_linear_3seg, inter_100Gb_para.para_3seg, tensor_size)
142 | # intra_time = predict_ps_intra_comm_time(tensor_size)
143 | # return all_time - intra_time
144 |
145 |
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_xla/__init__.py
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/execute_graph.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.client import timeline
3 | import numpy as np
4 | import os
5 | import json
6 |
7 |
8 | def get_shape_from_placeholder(placeholder_op):
9 | dim_protos = placeholder_op.get_attr("shape").dim
10 | return [d.size for d in dim_protos]
11 |
12 | def get_dtype_from_placeholder(placeholder_op):
13 | return str(placeholder_op.get_attr("dtype")).split("\'")[1]
14 |
15 | def get_output_tensors_from_graph(graph):
16 | output_tensors = []
17 | for op in graph.get_operations():
18 | output_tensors.append(op.outputs[0])
19 | return output_tensors
20 |
21 | def execute_graph_def(graph_def, input_node_defs, fetches, profile_result_dir, tf2xla_config_path=None, num_runs=20, trace_start=10, trace_end=20):
22 | graph = tf.Graph()
23 | with graph.as_default():
24 | tf.import_graph_def(graph_def, name="")
25 | input_nodes = []
26 | for node_def in input_node_defs:
27 | node = graph.get_operation_by_name(node_def.name)
28 | input_nodes.append(node)
29 | output_tensors = []
30 | for node_def in fetches:
31 | node = graph.get_operation_by_name(node_def.name)
32 | output_tensors.append(node.outputs[0])
33 |
34 | feed_dict = {}
35 | for input_node in input_nodes:
36 | shape = get_shape_from_placeholder(input_node)
37 | dtype = get_dtype_from_placeholder(input_node)
38 | print(dtype)
39 | feed_dict[input_node.outputs[0]] = np.random.rand(*shape).astype(dtype)
40 | run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
41 | run_metadata = tf.compat.v1.RunMetadata()
42 | traces = {"traceEvents":[]}
43 | fetch = output_tensors
44 | with tf.compat.v1.Session(graph=graph) as sess:
45 | for i in range(num_runs):
46 | sess.run(fetch, feed_dict, options=run_options, run_metadata=run_metadata)
47 | tl = timeline.Timeline(run_metadata.step_stats)
48 | ctf = json.loads(tl.generate_chrome_trace_format())
49 | if trace_start < i < trace_end:
50 | traces["traceEvents"] += ctf["traceEvents"]
51 | print("{} th step trace added.".format(i))
52 | with open(os.path.join(profile_result_dir, "temp.json"), "w") as f:
53 | json.dump(traces, f, indent=4)
54 | print("Ran to the end.")
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/p_dispersion.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import cvxopt
4 | import os
5 | from scipy.sparse import csr_matrix, eye as speye, vstack
6 | from tqdm import tqdm, trange
7 |
8 | from multiprocessing import Pool
9 |
10 | MUL_DELTA = 10
11 |
12 | # reference: https://stackoverflow.com/a/35566620
13 | def scipy_sparse_to_spmatrix(A):
14 | coo = A.tocoo()
15 | SP = cvxopt.spmatrix(coo.data.tolist(), coo.row.tolist(), coo.col.tolist(), size=A.shape)
16 | return SP
17 |
18 | def p_dispersion_lp(G, k, eps=0.454):
19 | num_nodes = G.shape[0]
20 | diameter = np.max(G)
21 | min_dist = np.min(G[np.nonzero(G)])
22 | delta = diameter / MUL_DELTA
23 | dists = [min_dist + delta * i for i in range(MUL_DELTA)]
24 | print("N = {}".format(num_nodes))
25 | variables = [[None]*len(dists)] * num_nodes
26 | print("Creating vector c.")
27 | c = cvxopt.matrix([-r for r in dists] * num_nodes)
28 | print("Building 1st constraint matrix.")
29 | G1_vals = []
30 | G1_is = []
31 | G1_js = []
32 | for i in range(num_nodes):
33 | for j in range(len(dists)):
34 | G1_vals.append(1)
35 | G1_is.append(0)
36 | G1_js.append(i*len(dists) + j)
37 | G1 = csr_matrix((G1_vals, (G1_is, G1_js)))
38 | G1_h = np.array([k])
39 | print("2nd constraint.")
40 | G2_vals = []
41 | G2_is = []
42 | G2_js = []
43 | for i in trange(num_nodes):
44 | for j in trange(len(dists)):
45 | for u in trange(num_nodes):
46 | dist_u_i = G[i,u]
47 | r = dists[j]
48 | if dist_u_i < r/2:
49 | G2_vals.append(1)
50 | G2_is.append(u)
51 | G2_js.append(i*len(dists) + j)
52 | G2 = csr_matrix((G2_vals, (G2_is, G2_js)))
53 | G2_h = np.ones((num_nodes,1))
54 | print("X range constraint.")
55 | G3 = speye(num_nodes*len(dists))
56 | G3_h = np.ones((num_nodes*len(dists), 1))
57 | G4 = -speye(num_nodes*len(dists))
58 | G4_h = np.zeros((num_nodes*len(dists), 1))
59 |
60 | G_concated = vstack([G1, G2, G3, G4])
61 | h_concated = np.vstack((G1_h, G2_h, G3_h, G4_h))
62 |
63 | G_cvx = scipy_sparse_to_spmatrix(G_concated)
64 | h_cvx = cvxopt.matrix(h_concated)
65 | print("Start solving...")
66 | sol = cvxopt.solvers.lp(c, G_cvx, h_cvx, solver="glpk")
67 | print("Solution obtained.")
68 | solution = np.array(sol["x"]).reshape((num_nodes, len(dists)))
69 |
70 | # rounding
71 | print("Start rounding.")
72 | while True:
73 | S = set()
74 | for i in trange(num_nodes):
75 | for j in range(len(dists)):
76 | x_i_r = solution[i,j]
77 | r = dists[j]
78 | add_prob = (1-eps)*(1-np.e**(-x_i_r))
79 | if random.random() < add_prob:
80 | should_break = False
81 | for (i_, r_) in S:
82 | if r < r_ and G[i][i_] < r_/2:
83 | should_break = True
84 | break
85 | if should_break:
86 | continue
87 | S.add((i, r))
88 | if len(S) <= k:
89 | # break
90 | indices = list(set([i for (i, r) in S]))
91 | yield indices
92 |
93 | def sum_min_distance(G, A):
94 | dist = 0
95 | min_j = {}
96 | for i in A:
97 | min_dist = float("inf")
98 | min_j_for_i = -1
99 | for j in A:
100 | if j == i:
101 | continue
102 |             cur_dist = G[i,j]
103 |             if cur_dist < min_dist:
104 |                 min_dist = cur_dist
105 | min_j_for_i = j
106 | dist += min_dist
107 | min_j[i] = min_j_for_i
108 | return dist, min_j
109 |
110 | def sum_min_distance_edit(G, last_dist, min_j, A, idx_rm, idx_add):
111 | new_min_j = min_j.copy()
112 | last_dist -= G[idx_rm, min_j[idx_rm]]
113 | new_min_j.pop(idx_rm)
114 | for index in A:
115 | if index != idx_rm and min_j[index] == idx_rm:
116 | last_dist -= G[index, idx_rm]
117 | min_j_for_index = -1
118 | min_dist_for_index = float("inf")
119 | for A_index in A:
120 | if A_index != idx_rm and A_index != index:
121 | if G[index, A_index] < min_dist_for_index:
122 | min_dist_for_index = G[index, A_index]
123 | min_j_for_index = A_index
124 | new_min_j[index] = min_j_for_index
125 | last_dist += min_dist_for_index
126 | min_dist_for_add = float("inf")
127 | min_j_for_add = -1
128 | for index in A:
129 | if index != idx_rm:
130 | orig_min_dist = G[index, new_min_j[index]]
131 | if G[index, idx_add] < orig_min_dist:
132 | new_min_j[index] = idx_add
133 | last_dist -= orig_min_dist
134 | last_dist += G[index, idx_add]
135 | if G[index, idx_add] < min_dist_for_add:
136 | min_dist_for_add = G[index, idx_add]
137 | min_j_for_add = index
138 | last_dist += min_dist_for_add
139 | new_min_j[idx_add] = min_j_for_add
140 | return last_dist, new_min_j
141 |
142 |
143 | def p_dispersion_local_search(G, k, sample_ratio = 1, patience = 3, l=None, tqdm_position=0):
144 | num_nodes = G.shape[0]
145 | A = set(random.sample(list(range(num_nodes)), k=k))
146 | if l is None:
147 | l = int(np.ceil(k * np.log(k)))
148 | last_dist, min_j = sum_min_distance(G, A)
149 | max_dist = last_dist
150 | sample_k = int(np.ceil(sample_ratio * num_nodes))
151 | # print("Using {} samples in each iteration, ratio: {}".format(sample_k, sample_ratio))
152 | opt_counter = 0
153 | tqdm_iterator = trange(l, position=tqdm_position, desc="worker {}: ".format(tqdm_position), leave=False)
154 | for i in tqdm_iterator:
155 | max_new_min_j = None
156 | max_idx_rm = -1
157 | max_idx_add = -1
158 | for out_index in random.sample(range(num_nodes), sample_k):
159 | if out_index not in A:
160 | for a_index in A:
161 | new_dist, new_min_j = sum_min_distance_edit(G, last_dist, min_j, A, a_index, out_index)
162 | if new_dist > max_dist:
163 | max_dist = new_dist
164 | max_idx_rm = a_index
165 | max_idx_add = out_index
166 | max_new_min_j = new_min_j
167 | if max_idx_rm == -1:
168 | opt_counter += 1
169 | if opt_counter >= patience:
170 | break
171 | else:
172 | A.remove(max_idx_rm)
173 | A.add(max_idx_add)
174 | last_dist = max_dist
175 | min_j = max_new_min_j
176 | tqdm_iterator.close()
177 | return list(A)
178 |
179 | def worker_func(arg):
180 | max_dist = -float("inf")
181 | max_min_j = None
182 | max_a_index = -1
183 | max_out_index = -1
184 | for (G, last_dist, last_min_j, A, a_index, out_index) in arg:
185 | new_dist, new_min_j = sum_min_distance_edit(G, last_dist, last_min_j, A, a_index, out_index)
186 | if new_dist > max_dist:
187 | max_dist = new_dist
188 | max_min_j = new_min_j
189 | max_a_index = a_index
190 | max_out_index = out_index
191 | return (max_dist, max_min_j, max_a_index, max_out_index)
192 |
193 | def parallel_p_dispersion_local_search(G, k, sample_ratio = 1, patience = 3, l=None):
194 | num_nodes = G.shape[0]
195 | A = set(random.sample(list(range(num_nodes)), k=k))
196 | if l is None:
197 | l = int(np.ceil(k * np.log(k)))
198 | last_dist, min_j = sum_min_distance(G, A)
199 | max_dist = last_dist
200 | sample_k = int(np.ceil(sample_ratio * num_nodes))
201 | # print("Using {} samples in each iteration, ratio: {}".format(sample_k, sample_ratio))
202 | opt_counter = 0
203 | for i in trange(l, desc="iter: ", leave=True):
204 | max_new_min_j = None
205 | max_idx_rm = -1
206 | max_idx_add = -1
207 | map_args = []
208 | for out_index in random.sample(range(num_nodes), sample_k):
209 | if out_index not in A:
210 | for a_index in A:
211 | map_args.append( (
212 | G, last_dist, min_j, A, a_index, out_index
213 | ) )
214 | num_cores = min(os.cpu_count(), len(map_args))
215 | grouped_map_args = []
216 | chunk_size = int(np.ceil(len(map_args) / num_cores))
217 | for i in range(num_cores):
218 | actual_chunk_size = min(chunk_size, len(map_args)-i*chunk_size)
219 | grouped_map_args.append(map_args[i*chunk_size:i*chunk_size+actual_chunk_size])
220 | with Pool(num_cores) as p:
221 | distances = list(tqdm(p.imap_unordered(worker_func, grouped_map_args), total=len(grouped_map_args), desc="inner: ", leave=False))
222 | for (new_dist, new_min_j, a_index, out_index) in distances:
223 | if new_dist > max_dist:
224 | max_dist = new_dist
225 | max_idx_rm = a_index
226 | max_idx_add = out_index
227 | max_new_min_j = new_min_j
228 | if max_idx_rm == -1:
229 | opt_counter += 1
230 | if opt_counter >= patience:
231 | break
232 | else:
233 | A.remove(max_idx_rm)
234 | A.add(max_idx_add)
235 | last_dist = max_dist
236 | min_j = max_new_min_j
237 | return list(A)
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/process_trace.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import json
3 | import os
4 | import time
5 |
6 | TRACE_SUFFIX = "trace.json.gz"
7 | XLA_DUMP_SUFFIX = "after_optimizations.txt"
8 |
9 | def search_for_file(profile_dir, suffix):
10 | for dir_path, dir_names, file_names in os.walk(profile_dir):
11 | for fn in file_names:
12 | if fn.endswith(suffix):
13 | return os.path.join(dir_path, fn)
14 | return None
15 |
16 | def wait_for_file(profile_dir, suffix):
17 | for i in range(20):
18 | file_path = search_for_file(profile_dir, suffix)
19 | if file_path is not None:
20 | return file_path
21 | else:
22 | # sleep 10 ms
23 | time.sleep(0.01)
24 | file_path = search_for_file(profile_dir, suffix)
25 | if file_path is None:
26 | print("[WARNING] Cannot find file with suffix {} in dir {}.".format(suffix, profile_dir))
27 | return None
28 | else:
29 | return file_path
30 |
31 | def search_for_trace(profile_dir):
32 | return wait_for_file(profile_dir, TRACE_SUFFIX)
33 |
34 | def search_for_hlo(xla_dir):
35 | return wait_for_file(xla_dir, XLA_DUMP_SUFFIX)
36 |
37 | def get_execution_time_for_whole_graph(trace_path):
38 | with gzip.open(trace_path, "rb") as f:
39 | trace_data = json.loads(f.read().decode("ascii"))
40 | events = trace_data["traceEvents"]
41 | time_dict = {}
42 | for ev in events:
43 | if "args" not in ev:
44 | continue
45 | if "long_name" not in ev["args"]:
46 | continue
47 | long_name = ev["args"]["long_name"].split(":")[0]
48 | if long_name not in time_dict:
49 | time_dict[long_name] = (0, 0)
50 | time, count = time_dict[long_name]
51 | time_dict[long_name] = (time + ev["dur"], count + 1)
52 | for name, (time, count) in time_dict.items():
53 | time_dict[name] = (time / count, count)
54 | return time_dict
55 |
56 | def get_execution_time_from_temp_trace(trace_path):
57 | ### TODO (huhanpeng): delete
58 | # one_pid = None
59 | # kernel_pid = None
60 | # kernel_time_dict = {}
61 | with open(trace_path, "r") as f:
62 | trace = json.load(f)
63 | if isinstance(trace, dict):
64 | trace = trace["traceEvents"]
65 |
66 | ### TODO (huhanpeng): delete following
67 | # for tr in trace:
68 | # if tr["ph"] == "M" and tr["name"] == "process_name":
69 | # if "args" in tr and "name" in tr["args"]:
70 | # if "device:GPU" in tr["args"]["name"] and "Compute" in tr["args"]["name"] and "replica" in tr["args"]["name"]:
71 | # one_pid = tr["pid"]
72 | # elif "device:GPU" in tr["args"]["name"] and "stream:all Compute" in tr["args"]["name"]:
73 | # kernel_pid = tr["pid"]
74 | # if one_pid and kernel_pid:
75 | # break
76 |
77 | time_dict = {}
78 | for tr in trace:
79 | if tr["ph"] == "X":
80 | op_name = tr["args"]["name"]
81 | if op_name not in time_dict:
82 | time_dict[op_name] = []
83 | time_dict[op_name].append(tr["dur"])
84 |
85 | ### TODO (huhanpeng): delete following try...except...
86 | # try:
87 | # if tr["ph"] == "X" and tr["pid"] == one_pid:
88 | # op_name = tr["args"]["name"]
89 | # if op_name not in time_dict:
90 | # time_dict[op_name] = []
91 | # time_dict[op_name].append(tr["dur"])
92 | # elif tr["ph"] == "X" and tr["pid"] == kernel_pid:
93 | # op_name = tr["args"]["name"]
94 | # if op_name not in kernel_time_dict:
95 | # kernel_time_dict[op_name] = []
96 | # kernel_time_dict[op_name].append(tr["dur"])
97 | # except:
98 | # pass
99 |
100 | ### TODO (huhanpeng): delete
101 | # for key, durs in time_dict.items():
102 | # if key in kernel_time_dict:
103 | # time_dict[key] = kernel_time_dict[key]
104 |
105 | for key, durs in time_dict.items():
106 | time_dict[key] = (sum(durs) / len(durs), len(durs))
107 | return time_dict
108 |
109 | def get_execution_time_from_trace(trace_path):
110 | with gzip.open(trace_path, "rb") as f:
111 | trace_data = json.loads(f.read().decode("ascii"))
112 | events = trace_data["traceEvents"]
113 | time = 0
114 | count = 0
115 | for ev in events:
116 | try:
117 | if ev["ph"] == "X" and ev["name"] == "_XlaRun":
118 | time += ev["dur"]
119 | count += 1
120 | except:
121 | pass
122 | if count == 0:
123 | # cannot compile
124 | return 0, 0
125 | return time/count, count
126 |
127 | def get_execution_time_from_uncompiled_trace(trace_path):
128 | with gzip.open(trace_path, "rb") as f:
129 | trace_data = json.loads(f.read().decode("ascii"))
130 | events = trace_data["traceEvents"]
131 | time = 0
132 | count = 0
133 | for ev in events:
134 | try:
135 | if ev["ph"] == "X" and ev["name"] == "SessionRun":
136 | time += ev["dur"]
137 | count += 1
138 | except:
139 | pass
140 | if count == 0:
141 | # cannot compile
142 | return 0, 0
143 | return time/count, count
144 |
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/utils.py:
--------------------------------------------------------------------------------
1 | class CMPaths:
2 | DATASET_DIR = "dataset"
3 | DEBUG_DIR = "debug"
4 | HLO_DIR = "hlos"
5 | PROFILE_DIR = "xla_profile"
6 | FEATURE_DIR = "features"
7 | MODULES_DIR = "modules"
8 | RAW_SUBGRAPH_DIR = "generated_subgraph"
9 |
10 | LABEL_FILE = "labels.txt"
11 | TF_SUPPORTED_OPS_FILE = "tf_xla_supported_ops.txt"
12 |
13 | METADATA_FILE = "metadata.json"
14 | RAW_GRAPH_DEF_FILE = "final_graph.json"
15 | CLEANED_GRAPH_DEF_FILE = "cleaned_graph.json"
16 | UNIQUE_OP_HISTORY_FILE = "unique_op_history.txt"
17 |
18 | MAX_CLUSTER_CACHE_FILE = "max_cluster.pickle"
19 | ELEMENTARY_OP_CACHE_FILE = "elementary_ops.txt"
20 |
21 | DATASET_SAVE_FILE = "dataset.pickle"
22 | ELEMENTARY_OP_CACHE_SAVE_FILE = "elem_op_cache.pickle"
23 | OVERHEAD_MODEL_SAVE_FILE = "overhead.pickle"
24 | MODEL_WEIGHT_SAVE_FILE = "model_weights.h5"
25 | MODEL_CONFIG_FILE = "model_config.pickle"
26 | MODULE_CONFIG_FILE = "module_config.txt"
27 | GRAPH_DEF_PICKLE_FILE = "graph_def.pickle"
28 |
29 | AFTER_OPT_TF_DAG_FILE = "partition_def_0.json"
30 | DEBUG_XLA_CANDIATES_FILE = "PLEASE SPECIFY CANDIDATE FILE PATH"
31 | TENSOR_SHAPE_FILE = "tensor_shapes.json"
32 |
33 | class CMEnvs:
34 | WHITE_LIST_PATH = "BPF_XLA_OP_WHITE_LIST_PATH"
35 | TF_PATH = "BPF_TF_PATH"
36 | CM_PROFILE_GPU = "BPF_COST_MODEL_PROFILE_GPU"
37 |
38 |
39 | ### TODO(huhanpeng): ResourceApplyGradientDescent should not be ignored
40 | IGNORE_OP_TYPES = ["ShapeN", "_Arg", "_Send", "_Recv", "VarIsInitializedOp",
41 | "ReadVariableOp",
42 | # "Pad", "SparseSoftmaxCrossEntropyWithLogits",
43 | "VarHandleOp",
44 | "IsVariableInitialized", "ResourceApplyGradientDescent", "IteratorToStringHandle",
45 | "IteratorGetNext", "MakeIterator", "IteratorV2", "NoOp", "Placeholder"]
46 |
47 |
48 | def parse_xla_candidate_ops(candidate_path):
49 | candidates = set()
50 | graph_node_id2name = {}
51 | unsafe_resource_deps_ = set()
52 | with open(candidate_path, "r") as f:
53 | lines = f.readlines()
54 |
55 | idx = 0
56 | while idx < len(lines):
57 | if lines[idx].startswith("unsafe_resource_deps_"):
58 | idx += 1
59 | break
60 | ls = lines[idx].strip().split(" ")
61 | candidates.add(ls[0])
62 | graph_node_id2name[ls[1]] = ls[0]
63 | idx += 1
64 | while idx < len(lines):
65 | ls = lines[idx].strip().split(" ")
66 | unsafe_resource_deps_.add(
67 | (graph_node_id2name[ls[0]], graph_node_id2name[ls[1]]))
68 | idx += 1
69 | return candidates, unsafe_resource_deps_
70 |
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/xla_run_generate_kernel_dataset.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sudo -i
4 | cd ${HOME}/
5 | rm -rf byteprofile-analysis
6 | git clone https://github.com/joapolarbear/byteprofile-analysis.git
7 | cd byteprofile-analysis
8 | ### install requirements
9 | pip3 install -r requirements.txt
10 |
11 | ### Recompile XLA related Part or directly download it
12 | # export PATH=/usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} \
13 | # OLD_LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH \
14 | # LD_LIBRARY_PATH=/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cudnn:/usr/local/cuda:/usr/local/cuda/compat:$OLD_LD_LIBRARY_PATH \
15 | # LIBRARY_PATH=/usr/local/lib:/usr/local/cuda/lib64:/usr/local/nccl/lib/:$LIBRARY_PATH
16 | # cd /root/tensorflow
17 | # ./build_bpf_tf_modules.sh
18 | cd ${HOME}/
19 | wget https://github.com/joapolarbear/tensorflow/releases/download/v2.4.1-dev.2.0.1/dpro_xla_tools.zip
20 | unzip dpro_xla_tools.zip
21 |
22 | ### Config env
23 | # where the modified tensorflow locates
24 | export BPF_TF_PATH=${HOME}/dpro_xla_tools
25 | # the GPU id to run profiling on (specify one GPU only)
26 | export BPF_COST_MODEL_PROFILE_GPU="0"
27 | export CUDA_VISIBLE_DEVICES=0
28 |
29 |
30 | export PATH=/usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH}
31 | export LD_LIBRARY_PATH=/usr/local/lib/python3.7/dist-packages/tensorflow/:$LD_LIBRARY_PATH
32 | DRIVER_VERSION=$(nvidia-smi | grep -Po "CUDA Version: \K([0-9]{1,}\.)+[0-9]{1,}")
33 | TOOLKIT_VERSION=$(nvcc --version | grep -Po "release \K([0-9]{1,}\.)+[0-9]{1,}")
34 | echo "CUDA driver version: $DRIVER_VERSION"
35 | echo "CUDA toolkit version: $TOOLKIT_VERSION"
36 | ### If the driver version is lower than the toolkit version, use compatibility mode
37 | # export LD_LIBRARY_PATH=/usr/local/cuda/compat/:$LD_LIBRARY_PATH
38 | sudo ln -sf /usr/local/lib/python3.7/dist-packages/tensorflow/libtensorflow_framework.so.2 /usr/lib/
39 |
40 | export DPRO_GRAPHDEF_DFG_PATH=xxx
41 |
42 | # The path where partition_def_0.json, tensor_shapes... are stored
43 | TRACE_DIR=xxx
44 | OUTPUT_DIR="${HOME}/xla"
45 | mkdir -p $OUTPUT_DIR
46 |
47 | ### NOTE: the four parameter blocks below assign the same variables, so only
48 | ### the last one takes effect; keep just the target model's block. ResNet50:
49 | NUM_RANDOM_SAMPLES=5000
50 | MAX_CLUSTER_SAMPLES=5
51 | MIN_CLUSTER_SIZE=4
52 | MAX_CLUSTER_SIZE=800
53 |
54 | ### VGG16
55 | NUM_RANDOM_SAMPLES=5000
56 | MAX_CLUSTER_SAMPLES=5
57 | MIN_CLUSTER_SIZE=4
58 | MAX_CLUSTER_SIZE=200
59 |
60 | ### VGG19
61 | NUM_RANDOM_SAMPLES=2000
62 | MAX_CLUSTER_SAMPLES=5
63 | MIN_CLUSTER_SIZE=4
64 | MAX_CLUSTER_SIZE=200
65 |
66 | ### InceptionV3
67 | NUM_RANDOM_SAMPLES=5000
68 | MAX_CLUSTER_SAMPLES=5
69 | MIN_CLUSTER_SIZE=4
70 | MAX_CLUSTER_SIZE=800
71 |
72 | ### generate data and train
73 | cd ${HOME}/byteprofile-analysis
74 | python3 xla_cm_entry.py --mode 0 \
75 | --trace_dir ${TRACE_DIR} \
76 | --output_dir ${OUTPUT_DIR} \
77 | --num_samples ${NUM_RANDOM_SAMPLES} \
78 | --max_cluster_samples ${MAX_CLUSTER_SAMPLES} \
79 | --min_cluster_size ${MIN_CLUSTER_SIZE} \
80 | --max_cluster_size ${MAX_CLUSTER_SIZE} \
81 | --batch_size 256
82 |
83 | ### exit root
84 | exit
85 | if hdfs dfs -test -e /usr/hphu/xla_model/xla ; then
86 | hdfs dfs -rmr /usr/hphu/xla_model/xla
87 | fi
88 | hdfs dfs -put ${HOME}/xla /usr/hphu/xla_model/
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/xla_run_test_module_cm.sh:
--------------------------------------------------------------------------------
1 | # where the modified TensorFlow is located
2 | export BPF_TF_PATH="/root/tensorflow"
3 | # this is the GPU used to compile XLA modules. The cost model will be run on
4 | # another, different GPU (specify one GPU here only)
5 | export BPF_COST_MODEL_PROFILE_GPU="0"
6 |
7 | # modify these
8 | DATASET_DIR="/PATH/TO/DATASET/DIR"
9 | COST_MODEL_DIR="/PATH/TO/COST/MODEL"
10 |
11 | python3 xla_test_module_cm.py --dataset_dir ${DATASET_DIR} --cost_model_dir ${COST_MODEL_DIR}
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/xla_run_train_module_cm.sh:
--------------------------------------------------------------------------------
1 | # where the modified TensorFlow is located
2 | export BPF_TF_PATH="/root/tensorflow"
3 | # this is the GPU used to compile XLA modules. The cost model will be run on
4 | # another, different GPU (specify one GPU here only)
5 | export BPF_COST_MODEL_PROFILE_GPU="0"
6 |
7 | # modify these
8 | DATASET_DIR="/opt/tiger/xla/kernel_dataset"
9 | OUTPUT_DIR="/opt/tiger/xla/cost_model"
10 |
11 | python3 xla_train_module_cm.py --dataset_dir ${DATASET_DIR} --output_dir ${OUTPUT_DIR}
12 |
--------------------------------------------------------------------------------
/dpro/cost_model/_xla/xlatools.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import subprocess
3 | import os
4 |
5 | from .utils import CMEnvs
6 | from ...logger_utils import SingleLogger
7 | from ...base import bcolors
8 |
9 | if CMEnvs.TF_PATH in os.environ:
10 | BPF_TF_PREFIX = os.environ[CMEnvs.TF_PATH]
11 | else:
12 | BPF_TF_PREFIX = None
13 | SingleLogger().warn(bcolors.CRED + "Environment {} not set. Guessing default TF location.".format(CMEnvs.TF_PATH) + bcolors.ENDC)
14 |
15 | if CMEnvs.CM_PROFILE_GPU in os.environ:
16 | try:
17 | BPF_PROFILE_GPU = int(os.environ[CMEnvs.CM_PROFILE_GPU])
18 | except:
19 |         print("[WARNING] Invalid BPF_COST_MODEL_PROFILE_GPU value (must be an integer),"
20 |               " but {} was given".format(os.environ[CMEnvs.CM_PROFILE_GPU]))
21 | exit(-1)
22 | else:
23 |     print("[WARNING] Required environment variable BPF_COST_MODEL_PROFILE_GPU is not set. Using 0 by default")
24 | BPF_PROFILE_GPU = 0
25 | # exit(-1)
26 |
27 | def _check_file_available_for_writing(path):
28 |     p = Path(path)
29 |     p_dir = p.resolve().parent
30 |     if not p_dir.is_dir():
31 |         p_dir.mkdir(parents=True)
32 |
33 | def _check_file_exist_for_reading(path):
34 | p = Path(path)
35 | if not p.is_file():
36 | raise FileNotFoundError("Cannot find file {}".format(path))
37 |
38 | def _check_arg_types(args, types):
39 | if len(args) != len(types):
40 |         raise RuntimeError("Mismatched number of arguments and types in _check_arg_types ({} vs. {})".format(len(args), len(types)))
41 | for index, (arg, arg_type) in enumerate(zip(args, types)):
42 | if not isinstance(arg, arg_type):
43 | raise TypeError("Inappropriate argument type for argument {}. Expected {} but got {}".format(index, arg_type, type(arg)))
44 |
45 | def compile_to_hlo(graph_path, config_path, dump_path_unopt, dump_path_opt, compile_exec=None):
46 | if compile_exec is None:
47 | if BPF_TF_PREFIX is not None:
48 | compile_exec = os.path.join(BPF_TF_PREFIX, "bazel-bin/tensorflow/compiler/byteprofile_xlatools/tfcompile_hlo")
49 | else:
50 | compile_exec = "/root/tensorflow/bazel-bin/tensorflow/compiler/byteprofile_xlatools/tfcompile_hlo"
51 | if not os.path.exists(compile_exec):
52 |         print("Cannot find the path to tfcompile_hlo: {}.".format(compile_exec))
53 | exit(-1)
54 |
55 | _check_arg_types([graph_path, config_path, dump_path_unopt, dump_path_opt], [str] * 4)
56 | _check_file_exist_for_reading(graph_path)
57 | _check_file_exist_for_reading(config_path)
58 | _check_file_available_for_writing(dump_path_unopt)
59 | _check_file_available_for_writing(dump_path_opt)
60 | cmd = "CUDA_VISIBLE_DEVICES={} {} {} {} {} {}".format(str(
61 | BPF_PROFILE_GPU), compile_exec, graph_path, config_path, dump_path_unopt, dump_path_opt)
62 | if not os.path.exists(graph_path):
63 |         raise ValueError("graph_path {} does not exist".format(graph_path))
64 | subprocess.run(cmd, stdout=subprocess.DEVNULL,
65 | stderr=subprocess.DEVNULL, check=True, shell=True)
66 | # subprocess.run(cmd, check=True, shell=True)
67 |
68 | def replay_and_generate_kernel_sample(sample_id_start, hlo_path, tmp_dir, dataset_path, replay_exec=None):
69 | if replay_exec is None:
70 | if BPF_TF_PREFIX is not None:
71 | replay_exec = os.path.join(BPF_TF_PREFIX, "bazel-bin/tensorflow/compiler/xla/tools/replay_computation_gpu")
72 | else:
73 | replay_exec = "/root/tensorflow/bazel-bin/tensorflow/compiler/xla/tools/replay_computation_gpu"
74 | if not os.path.exists(replay_exec):
75 |         print("Cannot find the path to replay_computation_gpu: {}.".format(replay_exec))
76 | exit(-1)
77 | my_env = os.environ.copy()
78 | my_env["CUDA_VISIBLE_DEVICES"] = str(BPF_PROFILE_GPU)
79 | opt_1 = "--num_runs=50"
80 | opt_2 = "--use_fake_data=true"
81 | opt_3 = "--print_result=false"
82 | opt_4 = "--dataset_path={}".format(dataset_path)
83 | opt_5 = "--temp_dir_path={}".format(tmp_dir)
84 | opt_6 = "--profile_start=30"
85 | opt_7 = "--profile_end=50"
86 | opt_8 = "--sample_id_start={}".format(sample_id_start)
87 | subprocess.run("CUDA_VISIBLE_DEVICES={} {} {} {} {} {} {} {} {} {} {}".format(
88 | str(BPF_PROFILE_GPU), replay_exec, opt_1, opt_2, opt_3,
89 | opt_4, opt_5, opt_6, opt_7, opt_8, hlo_path),
90 | stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=my_env, shell=True, check=True)
91 |
92 | def extract_kernel_features_from_hlo(hlo_path, tmp_dir, extract_exec=None):
93 | if extract_exec is None:
94 | if BPF_TF_PREFIX is not None:
95 | extract_exec = os.path.join(BPF_TF_PREFIX, "bazel-bin/tensorflow/compiler/xla/tools/extract_features_from_hlo")
96 | else:
97 | extract_exec = "/root/tensorflow/bazel-bin/tensorflow/compiler/xla/tools/extract_features_from_hlo"
98 | if not os.path.exists(extract_exec):
99 |         print("Cannot find the path to extract_features_from_hlo: {}.".format(extract_exec))
100 | exit(-1)
101 |
102 | opt_1 = "--hlo_path={}".format(hlo_path)
103 | opt_2 = "--temp_dir_path={}".format(tmp_dir)
104 | subprocess.run([extract_exec, opt_1, opt_2], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
105 |
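106 | ### Example pipeline (illustrative; the paths are hypothetical and the
107 | ### executables come from the modified TF build referenced above):
108 | # compile_to_hlo("graph.pbtxt", "config.pbtxt", "hlo_unopt.pbtxt", "hlo_opt.pbtxt")
109 | # replay_and_generate_kernel_sample(0, "hlo_opt.pbtxt", "/tmp/xla_tmp", "dataset/")
110 | # extract_kernel_features_from_hlo("hlo_opt.pbtxt", "/tmp/xla_tmp")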
--------------------------------------------------------------------------------
/dpro/cost_model/base.py:
--------------------------------------------------------------------------------
1 |
2 | class OptApplyStrategyError(Exception):
3 | pass
4 |
5 |
6 | class OptNoValidStrategyError(Exception):
7 | pass
8 |
9 |
10 | class OptQueryCostModelError(Exception):
11 | pass
12 |
13 | class _BaseGraphPass:
14 | def __init__(self, opt):
15 | self.opt = opt
16 | self.dag = self.opt.dag
17 |         ### token is the identifier of each optimization technique
18 | self.token = None
19 | self.meta_info = self.opt.clct.para_dict
20 |
21 | self.ckpt_dir = self.opt.ckpt_dir
22 | self.spec_dir = self.opt.spec_dir
23 |
24 | def init_search_space(self, *args, **kwargs):
25 | raise NotImplementedError()
26 |
27 | def apply(self, s, __dag, __pkg):
28 | raise NotImplementedError()
29 |
30 | def load_init_ckpt(self):
31 |         ''' Load the initial states BEFORE the search process starts
32 |         to reduce preprocessing time;
33 |         e.g., the XLA cost model needs to initialize its partitions. '''
34 | raise NotImplementedError()
35 |
36 | def load_ckpt(self):
37 |         ''' Load checkpoints during the search process '''
38 | raise NotImplementedError()
39 |
40 | def checkpoint(self):
41 | raise NotImplementedError()
42 |
43 | def flush(self, is_accept):
44 |         ''' A strategy may be rejected, so the internal states of a
45 |         * cost model should not be changed in apply(),
46 |         * but only once the strategy is accepted.
47 |         Each cost model may cache changes to its internal states
48 |         and flush them when this function is called.
49 |         '''
50 | raise NotImplementedError()
51 |
52 |
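53 | ### A minimal sketch of a concrete pass (illustrative only; it follows the
54 | ### (token, target, extra) strategy-tuple convention and the
55 | ### (success, nodes_introduced, ...) return convention used by the passes
56 | ### in this package, e.g., AMPGraphPass):
57 | # class NoOpGraphPass(_BaseGraphPass):
58 | #     def __init__(self, opt):
59 | #         super().__init__(opt)
60 | #         self.token = ["noop"]
61 | #
62 | #     def init_search_space(self, candidates, _dag, _pkg):
63 | #         return [("noop", None, None)], [1]
64 | #
65 | #     def apply(self, s, __dag, __pkg):
66 | #         return True, [], []
67 | #
68 | #     def flush(self, is_accept):
69 | #         pass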
--------------------------------------------------------------------------------
/dpro/cost_model/gpu_models_info.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | ALL_GPU_MODELS = ["v100", "a100", "p100", "1080ti", "t4"]
3 | CONFIG_NAMES = ["flops_fp32", "flops_fp16"]
4 | ALL_GPU_MODELS_FILTER = [True, True, True, True, True]
5 |
6 | ### refer to https://www.microway.com/knowledge-center-articles/comparison-of-nvidia-geforce-gpus-and-nvidia-tesla-gpus/
7 | ### in tflops
8 | GPU_CONFIG = np.array([
9 | [7.4, 29.7],
10 | [9.7, 78],
11 | [5, 19.95],
12 | [0.355, 0.177],
13 | [0.25, 16.2]
14 | ])
15 | class GPUConfig:
16 | def __init__(self, gpu_model, configs):
17 | self.name = gpu_model
18 | self.flops_fp32 = configs[0]
19 | self.flops_fp16 = configs[1]
20 |
21 | def ret_gpu_config(gpu_model):
22 | if gpu_model not in ALL_GPU_MODELS:
23 | raise ValueError("Invalid GPU Model name: {}".format(gpu_model))
24 | return GPUConfig(gpu_model, GPU_CONFIG[ALL_GPU_MODELS.index(gpu_model)])
25 |
26 | def gpu_filter(gpu_model):
27 | if gpu_model not in ALL_GPU_MODELS:
28 | raise ValueError("Invalid GPU Model name: {}".format(gpu_model))
29 | return ALL_GPU_MODELS_FILTER[ALL_GPU_MODELS.index(gpu_model)]
30 |
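31 | if __name__ == "__main__":
32 |     ### quick usage example (illustrative)
33 |     cfg = ret_gpu_config("v100")
34 |     print(cfg.name, cfg.flops_fp32, cfg.flops_fp16)  # v100 7.4 29.7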
--------------------------------------------------------------------------------
/dpro/cost_model/mixed_precision.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | import time
3 | import os
4 | import pickle
5 | import numpy as np
6 | import ujson as json
7 | from tqdm import tqdm
8 |
9 | from ..arg_utils import SingleArg
10 | from ..trace_utils import *
11 | from ._xla.pk_graph import PKGraph
12 | from .base import _BaseGraphPass
13 | from ._mixed_precision.amp_pred import AMPPredictor
14 |
15 | args_ = SingleArg().args
16 |
17 | class AMPGraphPass(_BaseGraphPass):
18 | def __init__(self, opt):
19 | super().__init__(opt)
20 | ### AMP predictor
21 | self.amp_predictor = AMPPredictor(self.meta_info)
22 | self.token = [">", "<"]
23 |
24 | def init_search_space(self, candidates, _dag: nx.DiGraph, _pkg: PKGraph):
25 | search_space = []
26 | weights = []
27 | for n, l in candidates:
28 | # node heat
29 | # heat = self.opt._get_heat_from_history(n)
30 | # ### Nodes that have never been fused
31 | # cat = parse_cat_fine_grained(n)
32 | # pid = parse_pid_from_name(n)
33 |
34 | ### check if mixed precision can be used for this node
35 | if self.amp_predictor.is_need_amp(_dag, n):
36 | search_space.append((">", n, None))
37 | weights.append(l)
38 |
39 | # return [(">", "host1.rank0->BW.gradients/resnet50/conv2_block3_1_conv/Conv2D_grad/Conv2DBackpropFilter", None)], [1]
40 | SingleLogger().info("MP Cost Model init {} strategies.".format(len(search_space)))
41 | return search_space, weights
42 |
43 | def apply(self, s, __dag, __pkg):
44 | op, target, _ = s
45 | nodes_introduced = self.amp_predictor.quantize(__dag, target)
46 |         ### apply this strategy to the corresponding operators on other GPUs;
47 |         ### we assume data parallelism, i.e., all ranks share the same model
48 | on_other_ranks = self.opt._debug_convert_to_other_machines(target)
49 | for target in on_other_ranks:
50 | nodes_introduced += self.amp_predictor.quantize(__dag, target)
51 | return True, nodes_introduced, []
52 |
53 | def checkpoint(self):
54 | self.amp_predictor.checkpoint()
55 |
56 | def load_ckpt(self):
57 | self.amp_predictor.load_ckpt()
58 |
59 | def load_init_ckpt(self):
60 | init_ckpt_path = os.path.join(ROOT_PATH, "amp_init_ckpt.pickle")
61 | if os.path.isfile(init_ckpt_path):
62 | with open(init_ckpt_path, "rb") as f:
63 | G, PKG, trajectory, _cast_cnt, _num_nonvar_casts_to_fp16, _op_status = pickle.load(f)
64 | self.amp_predictor.cast_cnt = _cast_cnt
65 | self.amp_predictor.num_nonvar_casts_to_fp16 = _num_nonvar_casts_to_fp16
66 | self.amp_predictor.op_status = _op_status
67 | SingleLogger().info("Reading init graph from cache.")
68 | else:
69 | G = self.dag.copy()
70 | PKG = PKGraph(G)
71 |
72 | source_nodes = [n for n in G.nodes() if "host0.rank0" in n]
73 | trajectory = []
74 | for n in tqdm(source_nodes, total=len(source_nodes)):
75 | if self.amp_predictor.is_need_amp(G, n):
76 | s = (">", n, None)
77 | trajectory.append(s)
78 | self.apply(s, G, PKG)
79 |
80 | with open(init_ckpt_path, "wb") as f:
81 | pickle.dump([G, PKG, trajectory, self.amp_predictor.cast_cnt,
82 | self.amp_predictor.num_nonvar_casts_to_fp16, self.amp_predictor.op_status], f)
83 | SingleLogger().info("Graph cache dumped to {}.".format(init_ckpt_path))
84 |
85 | SingleLogger().info("Successfully initialized mixed precision strategy with {} cast(s).".format(
86 | self.amp_predictor.num_nonvar_casts_to_fp16))
87 | return G, PKG, trajectory
88 |
89 | def flush(self, is_accept: bool):
90 | self.amp_predictor.flush(is_accept)
91 |
--------------------------------------------------------------------------------
/dpro/cost_model/trace_clct.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ### MXNet env
4 | export MXNET_CUDNN_AUTOTUNE_DEFAULT=0
5 | export MXNET_GPU_WORKER_NTHREADS=1
6 | export MXNET_EXEC_BULK_EXEC_TRAIN=0
7 |
8 | ##############################################################################
9 | ### Configuration
10 |
11 | # MODEL="ResNet50"
12 | MODEL="BertBase"
13 | # MODEL="InceptionV3"
14 | # MODEL="VGG16"
15 | # MODEL="Bert256"
16 |
17 | PLATFORM='TF'
18 | echo "Platform: ${PLATFORM}, Model: ${MODEL}"
19 |
20 | BPF_PATH=/home/tiger/byteprofile-analysis/analyze.py
21 | TRACE_PATH=${BYTEPS_TRACE_DIR}/bps_trace_final.json
22 | BPF_CMD="python3 ${BPF_PATH} --pretty --option collect --nccl_algo RING --path ${BYTEPS_TRACE_DIR} --platform TENSORFLOW --force"
23 |
24 | function bert_env {
25 | export BPF_BATCH_PER_GPU="${BS:-32}"
26 | export BPF_NUMSTEPS="${BPF_NUMSTEPS:-100}"
27 | export BERT_ZIP_DIR=/opt/tiger/bert/data/BERT-Base_uncase
28 | export BERT_BASE_DIR=$BERT_ZIP_DIR/uncased_L-12_H-768_A-12
29 | export MAX_SEQ_LENGTH=128
30 | export MAX_PREDICTIONS_PER_SEQ=20
31 | }
32 |
33 | function funcConfigBaseCMD {
34 | if [ "$MODEL" = "ResNet50" ] || [ "$MODEL" = "VGG16" ] || [ "$MODEL" = "InceptionV3" ]; then
35 | FILE_PATH="/home/tiger/horovod_examples/tensorflow/tensorflow_synthetic_benchmark.py"
36 | else
37 | FILE_PATH="/home/tiger/bert/run_pretraining_single_machine.py"
38 | fi
39 |
40 | if [ "$MODEL" = "ResNet50" ]; then
41 | BASE_CMD="python3 ${FILE_PATH} --batch-size ${batch_size} --classes 1000"
42 | elif [ "$MODEL" = "VGG16" ]; then
43 | BASE_CMD="python3 ${FILE_PATH} --batch-size ${batch_size} --classes 1000 --model VGG16"
44 | elif [ "$MODEL" = "InceptionV3" ]; then
45 | BASE_CMD="python3 ${FILE_PATH} --batch-size ${batch_size} --classes 1000 --model InceptionV3"
46 | elif [ "$MODEL" = "Bert256" ]; then
47 | bert_env
48 | BASE_CMD="python3 ${FILE_PATH} --train_batch_size=${batch_size} --input_file=$BERT_BASE_DIR/tf_examples.tfrecord --output_dir=$BERT_BASE_DIR/pretraining_output --do_train=True --do_eval=False --bert_config_file=$BERT_BASE_DIR/bert_config.json --max_seq_length=$MAX_SEQ_LENGTH --max_predictions_per_seq=$MAX_PREDICTIONS_PER_SEQ --num_train_steps=$BPF_NUMSTEPS --num_warmup_steps=10 --learning_rate=2e-5 --synthetic --model bert_default"
49 | elif [ "$MODEL" = "BertBase" ]; then
50 | bert_env
51 | BASE_CMD="python3 ${FILE_PATH} --train_batch_size=${batch_size} --input_file=$BERT_BASE_DIR/tf_examples.tfrecord --output_dir=$BERT_BASE_DIR/pretraining_output --do_train=True --do_eval=False --bert_config_file=$BERT_BASE_DIR/bert_config.json --max_seq_length=$MAX_SEQ_LENGTH --max_predictions_per_seq=$MAX_PREDICTIONS_PER_SEQ --num_train_steps=$BPF_NUMSTEPS --num_warmup_steps=10 --learning_rate=2e-5 --synthetic --model bert_base"
52 | else
53 | echo "Invalid model: $MODEL"
54 | exit
55 | fi
56 | }
57 |
58 | ### Start to train
59 | if [ ! -d "${BYTEPS_TRACE_DIR}/host0" ]; then
60 | mkdir -p "${BYTEPS_TRACE_DIR}/host0"
61 | else
62 | rm -rf ${BYTEPS_TRACE_DIR}/host0/*
63 | fi
64 | echo "Traces are stored at ${BYTEPS_TRACE_DIR}"
65 |
66 | function funcReset {
67 | rm $TRACE_PATH
68 | rm -rf $BYTEPS_TRACE_DIR/host0/*
69 | funcConfigBaseCMD
70 | }
71 |
72 | FIRST_RUN=1
73 | function funcRunAndTest {
74 | funcReset
75 | BYTEPS_TRACE_DIR=$BYTEPS_TRACE_DIR/host0 BYTEPS_TRACE_START_STEP=50 BYTEPS_TRACE_END_STEP=60 \
76 | nohup ${BASE_CMD}
77 | echo "dPRO fp32 BS=$batch_size: " >> ${BYTEPS_TRACE_DIR}/avg.txt
78 | echo "dPRO fp32 BS=$batch_size: "
79 | echo "Run the command: ${BASE_CMD}"
80 | if [ ${FIRST_RUN} == "1" ]; then
81 | echo "${BPF_CMD} --sub_option amp_data_clct,save_names=fp32,model=resnet,platform=tf,showall=True"
82 | nohup ${BPF_CMD} --sub_option amp_data_clct,save_names=fp32,model=resnet,platform=tf,showall=True
83 | mv $BYTEPS_TRACE_DIR/host0/0 $BYTEPS_TRACE_DIR/.metadata/
84 | nvidia-smi >> $BYTEPS_TRACE_DIR/.metadata/config.txt
85 | # echo "$bs_to_try" >> $BYTEPS_TRACE_DIR/.metadata/config.txt
86 | else
87 | nohup ${BPF_CMD} --sub_option amp_data_clct,save_names=None,model=resnet,platform=tf,showall=True
88 | fi
89 |
90 | funcReset
91 | BYTEPS_TRACE_DIR=$BYTEPS_TRACE_DIR/host0 BYTEPS_TRACE_START_STEP=50 BYTEPS_TRACE_END_STEP=60 \
92 | nohup ${BASE_CMD} --amp
93 | echo "dPRO fp16 BS=$batch_size: " >> ${BYTEPS_TRACE_DIR}/avg.txt
94 | echo "dPRO fp16 BS=$batch_size: "
95 | echo "Run the command: ${BASE_CMD} --amp"
96 | if [ ${FIRST_RUN} == "1" ]; then
97 | nohup ${BPF_CMD} --sub_option amp_data_clct,save_names=fp16,model=resnet,platform=tf,showall=True
98 | FIRST_RUN=0
99 | else
100 | nohup ${BPF_CMD} --sub_option amp_data_clct,save_names=None,model=resnet,platform=tf,showall=True
101 | fi
102 | }
103 |
104 | ### Run with different batch size
105 | bs_to_try=(4 8 16 32 64 128 256 512 1024 2048)
106 | for(( id=0; id < "${#bs_to_try[@]}"; id++ ))
107 | do
108 | batch_size=${bs_to_try[$id]}
109 | funcRunAndTest
110 | done
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/dpro/cost_model/trace_filter.py:
--------------------------------------------------------------------------------
1 | ''' This module is used to filter the operators that we focus on for AMP,
2 | write their average times and variances to the avg.txt file,
3 | and write the corresponding operator names to name.txt.
4 | '''
5 | import sys, os
6 |
7 | class TraceFilter:
8 | def __init__(self, save_names=None, model=None, platform=None, showall=None):
9 | self.save_names = save_names
10 | self.platform = platform
11 |         self.model = model if model is not None else ""
12 |
13 | if self.platform == 'tf':
14 | MNIST_CANDIDATES = ["Conv2D", "BiasAdd", "Relu", "MatMul", "Mul", "Cast", "BiasAddGrad", "ApplyAdam", "ReluGrad", "Conv2DBackpropInput", "Conv2DBackpropFilter"]
15 | RESNET50_CANDIDATES = ["Conv2D", "BiasAdd", "Relu", "MatMul", "Mul", "Cast"]
16 | else:
17 | MNIST_CANDIDATES = RESNET50_CANDIDATES = ["conv", "BiasAdd", "Relu", "MatMul", "Mul", "Cast"]
18 |
19 | if "resnet" in self.model.lower():
20 | self._CANDIDATES = RESNET50_CANDIDATES
21 | elif "mnist" in self.model.lower():
22 | self._CANDIDATES = MNIST_CANDIDATES
23 | elif "bert" in self.model.lower():
24 | self._CANDIDATES = None
25 | elif "dense" in self.model.lower():
26 | self._CANDIDATES = ["_dense", "MatMul", "Mat", "Cast"]
27 | else:
28 | self._CANDIDATES = None
29 |
30 |         self.showall = str(showall).lower() in ["true", "t", "1"]  ### tolerate None/bool values
31 |
32 | def _is_ignore_for_sta(self, name):
33 | if self.showall:
34 | return False
35 | ### store the pid for computation
36 | if self._CANDIDATES is None:
37 | return False
38 | for target in self._CANDIDATES:
39 | if target in name:
40 | return False
41 | return True
42 |
43 | def dump_for_cost_model(self, name2sta, _dir):
44 | nameL = []
45 | avg = []
46 | var = []
47 | for name, statistic in sorted(name2sta.items()):
48 | if self._is_ignore_for_sta(name):
49 | continue
50 | name = ".".join(name.split("->")[1].split(".")[1:])
51 | nameL.append(name)
52 | avg.append(statistic["avg"])
53 | var.append(statistic["var"])
54 | # print(nameL, avg)
55 | if self.save_names != "None":
56 | with open(os.path.join(_dir, "name.txt"), "a") as f:
57 | f.write("{}:{}\n".format(self.save_names, str(nameL)))
58 | with open(os.path.join(_dir, "avg.txt"), "a") as f:
59 | f.write(str(avg) + "\n")
60 | f.write(str(var) + "\n")
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
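70 | ### Example usage (illustrative; name2sta maps trace names like
71 | ### "host0.rank0->FW.conv1" to {"avg": ..., "var": ...} statistics):
72 | # f = TraceFilter(save_names="fp32", model="resnet", platform="tf", showall="True")
73 | # f.dump_for_cost_model(name2sta, "/path/to/output_dir")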
--------------------------------------------------------------------------------
/dpro/debug_utils.py:
--------------------------------------------------------------------------------
1 | import ujson as json
2 | import os
3 | import time
4 |
5 | from .base import Singleton
6 |
7 | @Singleton
8 | class DebugRecorder:
9 | def __init__(self, path_=None, is_enable=True):
10 | self.is_enable = is_enable
11 | self.debug_traces = []
12 | self.path_ = path_
13 | self.base_time = self.get_time()
14 | self.ts_list = []
15 |
16 | def get_time(self):
17 | return time.time() * 1e6
18 |
19 | def debug_record(self, name, _ts, pid, tid):
20 |         ''' Used for debugging: collect traces while replaying,
21 |         in order to optimize the replay algorithm
22 |         '''
23 | if not self.is_enable:
24 | return
25 | self.debug_traces.append({
26 | "name": name,
27 | "ts": _ts * 10e6,
28 | "dur": ((self.get_time() - self.base_time) - _ts) ,
29 | "pid": pid,
30 | "ph": "X",
31 | "tid": tid
32 | })
33 |
34 | def debug_event_start(self):
35 | if not self.is_enable:
36 | return
37 | self.ts_list.append(self.get_time() - self.base_time)
38 |
39 | def debug_event_end(self, name, pid, tid):
40 | if not self.is_enable:
41 | return
42 | _ts = self.ts_list.pop()
43 | self.debug_traces.append({
44 | "name": name,
45 | "ts": _ts,
46 | "dur": (self.get_time() - self.base_time - _ts) ,
47 | "pid": pid,
48 | "ph": "X",
49 | "tid": tid
50 | })
51 |
52 | def dump_traces(self, path_=None):
53 | if not self.is_enable:
54 | return
55 | if path_ is not None:
56 | trace_path = path_
57 | elif self.path_ is not None:
58 | trace_path = self.path_
59 | else:
60 | raise ValueError("Trace path must be given")
61 |
62 | with open(os.path.join(trace_path, "debug.json"), 'w') as f:
63 | json.dump({"traceEvents": self.debug_traces,
64 | "displayTimeUnit": "ms"
65 | }, f, indent=4)
66 |
67 |
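68 | ### Example usage (illustrative; DebugRecorder is wrapped by the Singleton
69 | ### decorator, so the first construction fixes its configuration):
70 | # recorder = DebugRecorder(path_="/tmp/traces")
71 | # recorder.debug_event_start()
72 | # ...  # the region being measured
73 | # recorder.debug_event_end("replay_one_op", "replayer", 0)
74 | # recorder.dump_traces()  # writes /tmp/traces/debug.json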
--------------------------------------------------------------------------------
/dpro/helper/combine_json.py:
--------------------------------------------------------------------------------
1 | ''' Combine trace files
2 | * Usage
3 |     python3 combine_trace.py files/to/combine path/to/dump/rst pid/names bias
4 |     The first argument is a comma-separated list of JSON trace files to combine;
5 |     the second is the path to store the result;
6 |     the third gives the new pid label for each input file;
7 |     and the fourth (bias, in ms) is used to manually align the timelines.
8 |     Example: python3 combine_trace.py path_a,path_b path_rst pid_a,pid_b 0,0
9 | '''
10 | import ujson as json
11 | import os, sys
12 | ALIGN_TIME = True
13 | KEEP_PID = False
14 |
15 | def combine_files(files, names, bias, output_path):
16 | final_traces = []
17 | for idx, file in enumerate(files):
18 | with open(file, 'r') as fp:
19 | traces = json.load(fp)
20 | if "traceEvents" in traces:
21 | traces = traces["traceEvents"]
22 | ts = None
23 | for trace in traces:
24 | if ALIGN_TIME and ts is None:
25 | ts = trace["ts"]
26 | if not KEEP_PID:
27 | trace["pid"] = names[idx]
28 | else:
29 | trace["pid"] = names[idx] + "." + trace["pid"]
30 | if ALIGN_TIME:
31 | trace["ts"] = trace["ts"] - ts
32 | trace["ts"] += bias[idx]
33 | final_traces += traces
34 |
35 | with open(output_path, 'w') as fp:
36 | json.dump(final_traces, fp)
37 |
38 | files = sys.argv[1].split(",")
39 | output_path = sys.argv[2]
40 | 
41 | if len(files) == 1 and os.path.isdir(files[0]):
42 |     names = sorted(os.listdir(files[0]))
43 |     files = [os.path.join(files[0], n) for n in names]
44 | else:
45 |     names = sys.argv[3].split(",")
46 | 
47 | ### compute bias after the file list is finalized (one bias per input file)
48 | if len(sys.argv) >= 5:
49 |     bias = [float(n)*1000 for n in sys.argv[4].split(",")]
50 | else:
51 |     bias = [0 for _ in files]
52 |
53 | combine_files(files, names, bias, output_path)
54 |
--------------------------------------------------------------------------------
/dpro/helper/compare_graph.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | from google.protobuf.json_format import MessageToJson
3 | from google.protobuf.text_format import Parse
4 | import tensorflow as tf
5 | import json
6 |
7 | try:
8 | GraphDef = tf.GraphDef
9 | except:
10 | GraphDef = tf.compat.v1.GraphDef
11 |
12 | def tf_relabel_func(_name, update_nodes_in_dag):
13 | for prefix in ["Comm.", "Comp.", "BW.", "FW.", "UPDATE_."]:
14 | if _name.startswith(prefix):
15 | return _name
16 | if _name.startswith("^"):
17 | _name = _name[1:]
18 | last_slash_pos = _name.rfind("/")
19 | if last_slash_pos != -1 and last_slash_pos < len(_name)-1 and _name[last_slash_pos+1] == "_":
20 | _name = _name[:last_slash_pos]
21 | if "BytePSPushPull" in _name and "tensor" not in _name:
22 | _name = "Comm." + _name
23 | elif "allreduce" in _name.lower():
24 | if "." in _name:
25 | _, tensor_name = _name.split(".")
26 | if "_" in tensor_name:
27 | tensor_name = tensor_name.split("_")[0]
28 | _name = "Comm." + tensor_name
29 | else:
30 | _name = "UPDATE_." + _name
31 | else:
32 | if update_nodes_in_dag is not None and _name in update_nodes_in_dag \
33 | or _name == "GradientDescent":
34 | _name = "UPDATE_." + _name
35 |         # elif _name == "GradientDescent":  ### unreachable: already handled by the branch above
36 |         #     _name = ""
37 | elif _name.startswith("gradients"):
38 | _name = "BW." + _name
39 | else:
40 | _name = "FW." + _name
41 | return _name
42 |
43 | def wrap_read_graphdef(graphdef_path):
44 | if graphdef_path.endswith("pbtxt"):
45 | with open(graphdef_path, "r") as f:
46 | pb = f.read()
47 | graph_def = Parse(pb, GraphDef())
48 | json_string = MessageToJson(graph_def)
49 | graph_def = json.loads(json_string)
50 | else:
51 | with open(graphdef_path, "r") as f:
52 | graph_def = json.load(f)
53 | graph = nx.DiGraph()
54 | for node in graph_def["node"]:
55 | if "input" in node:
56 | for input_tensor_name in node["input"]:
57 | input_node_name = input_tensor_name.split(":")[0]
58 | graph.add_edge(input_node_name, node["name"])
59 | update_nodes_in_dag = set()
60 | def recursive_add_succs(_node):
61 | for succ_ in graph.successors(_node):
62 | update_nodes_in_dag.add(succ_)
63 | recursive_add_succs(succ_)
64 | for node in graph.nodes:
65 | if "allreduce" in node.lower() or "bytepspushpull" in node.lower():
66 | recursive_add_succs(node)
67 | new_graph = nx.DiGraph()
68 | for u, v in graph.edges:
69 | new_graph.add_edge(tf_relabel_func(u, update_nodes_in_dag), tf_relabel_func(v, update_nodes_in_dag))
70 | return new_graph, update_nodes_in_dag
71 |
72 | dag = nx.read_gml("/root/capture_file/run_0_dec8/simple_dag.gml")
73 |
74 | graphdef, update_nodes = wrap_read_graphdef("/root/bert/traces/before_mark_for_compilation_5.pbtxt")
75 |
76 | dag_nodes = set(dag.nodes)
77 | graphdef_nodes = set(graphdef.nodes)
78 |
79 | import code
80 | code.interact(local=locals())
81 |
82 |
83 |
84 |
--------------------------------------------------------------------------------
/dpro/helper/get_iter_time_from_trace.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | with open("/Users/chenyu/Downloads/20210127_02_hvd_tf_vgg16_rdma_apply_xla_no_tensor_fusion/combined.json", "r") as f:
4 | trace = json.load(f)
5 |
6 | # one_pid = -1
7 | pids = set()
8 | for ev in trace["traceEvents"]:
9 | # if ev["ph"] == "M":
10 | # if ev["name"] == "process_name" and "name" in ev["args"]:
11 | # # if "/job:localhost/replica:0/task:0/device:GPU:" in ev["args"]["name"] \
12 | # print(ev["args"]["name"])
13 | # if "stream:all" in ev["args"]["name"] \
14 | # and "Compute" in ev["args"]["name"]:
15 | # one_pid = ev["pid"]
16 | pids.add(ev["pid"])
17 |
18 | evs = sorted(trace["traceEvents"], key=lambda x: x["ts"])
19 |
20 | iter_times_pid = {}
21 |
22 | for pid in pids:
23 | source_sts = []
24 | dep_eds = []
25 | started = False
26 | last_assign = -1
27 | for ev in evs:
28 | if ev["ph"] == "X" and ev["pid"] == pid:
29 | # if "args" in ev and ev["args"]["name"] == "_SOURCE":
30 | if "args" in ev and "ncclAllReduceRingLLKernel" in ev["args"]["name"] and started == False:
31 | source_sts.append(ev["ts"])
32 | if last_assign != -1:
33 | dep_eds.append(last_assign)
34 | last_assign = -1
35 | started = True
36 | # elif "args" in ev and ev["args"]["name"] == "group_deps_1":
37 | elif "args" in ev and "ncclAllReduceRingLLKernel" in ev["args"]["name"]:
38 | # started = False
39 | last_assign = ev["ts"] + ev["dur"]
40 | elif "args" in ev and "GradientDescent" == ev["args"]["name"]:
41 | started = False
42 | if last_assign != -1:
43 | dep_eds.append(last_assign)
44 |
45 | source_sts = sorted(source_sts)
46 | dep_eds = sorted(dep_eds)
47 |
48 | iter_times = []
49 |     for i in range(min(len(source_sts), len(dep_eds))):
50 |         iter_times.append((dep_eds[i] - source_sts[i]) / 1000)
51 | iter_times_pid[pid] = iter_times
52 |
53 | avg = []
54 | avg_per_iter = [float("inf")] * 10  ### assumes at most 10 profiled iterations
55 | for pid, iter_times in iter_times_pid.items():
56 | print("PID {}: {}".format(pid, iter_times))
57 | avg += iter_times
58 | for idx, time in enumerate(iter_times):
59 | avg_per_iter[idx] = min(time, avg_per_iter[idx])
60 |
61 | print("Average: {}".format(sum(avg) / len(avg)))
62 | print("Average min per iter: {}, details: {}".format(sum(avg_per_iter)/len(avg_per_iter), avg_per_iter))
63 |
--------------------------------------------------------------------------------
/dpro/helper/tf_flops_profile.py:
--------------------------------------------------------------------------------
1 |
2 | '''
3 | Refer to: https://gist.github.com/shinseung428/752f284d1c065870d7f5a7e4208f0583
4 | '''
5 | import json
6 | import os, sys
7 | import tensorflow as tf
8 | from google.protobuf.json_format import Parse as ParseJSON
9 | from google.protobuf.text_format import Parse as ParseText
10 | from google.protobuf.json_format import MessageToJson
11 | try:
12 | GraphDef = tf.GraphDef
13 | except:
14 | GraphDef = tf.compat.v1.GraphDef
15 |
16 | try:
17 | import horovod.tensorflow as hvd
18 | except:
19 | pass
20 |
21 |
22 | def profile_flops(graph_def_path, tmp_path):
23 | with open(graph_def_path, "r") as f:
24 | if graph_def_path.endswith("pbtxt"):
25 | pb = f.read()
26 | graph_def = ParseText(pb, GraphDef())
27 | json_string = MessageToJson(graph_def)
28 | graph_def_as_json = json.loads(json_string)
29 | else:
30 | graph_def_as_json = json.load(f)
31 | cleaned_graph_def_str = json.dumps(graph_def_as_json)
32 | graph_def = ParseJSON(cleaned_graph_def_str, GraphDef())
33 |
34 | with tf.Graph().as_default() as graph:
35 | tf.import_graph_def(graph_def, name='')
36 | with graph.as_default():
37 | opt = (tf.profiler.ProfileOptionBuilder(
38 | tf.profiler.ProfileOptionBuilder.float_operation())
39 | .with_file_output(tmp_path)
40 | .build())
41 | flops = tf.profiler.profile(graph, options=opt)
42 | total_flops = flops.total_float_ops
43 | print ("========================================================")
44 | print ('Total Flops : {}'.format(total_flops))
45 |
46 | # opt = tf.profiler.ProfileOptionBuilder.time_and_memory()
47 | # rst = tf.profiler.profile(graph, options=opt)
48 | # print(type(rst))
49 |
50 | def parse_flops_dict(graph_def_path, tmp_path):
51 | profile_flops(graph_def_path, tmp_path)
52 | op_name2flops = {}
53 | with open(tmp_path, 'r') as fp:
54 | lines = fp.read().split("Profile:\n")[1].split("\n")[2:]
55 | for line in lines:
56 | line_split = line.split(" ")
57 | if len(line_split) < 5:
58 | continue
59 | # print(line_split)
60 | op_name = line_split[2]
61 | flops_str = line_split[3].split("/")[1]
62 | if flops_str[-1] == "k":
63 | flops = float(flops_str[:-1]) * 1e3
64 | elif flops_str[-1] == "m":
65 | flops = float(flops_str[:-1]) * 1e6
66 | elif flops_str[-1] == "b":
67 | flops = float(flops_str[:-1]) * 1e9
68 | elif flops_str[-1] == "p":
69 | flops = float(flops_str[:-1]) * 1e12
70 | else:
71 | flops = float(flops_str)
72 | op_name2flops[op_name] = flops
73 | return op_name2flops
74 |
75 | if __name__ == "__main__":
76 | graph_def_path = sys.argv[1]
77 | parse_flops_dict(graph_def_path, "flops_log.txt")
78 |
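79 | ### Each profile line parsed above looks roughly like (illustrative):
80 | ###     "  <op_name> <self>/<total> flops ..." with k/m/b/p magnitude suffixes;
81 | ### parse_flops_dict keeps the total (after "/") converted to a raw FLOP count.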
--------------------------------------------------------------------------------
/dpro/helper/visualize.py:
--------------------------------------------------------------------------------
1 | import json
2 | import matplotlib.pyplot as plt
3 | import seaborn as sns
4 | import numpy as np
5 | import os, sys
6 | import math
7 |
8 | from ..arg_utils import SingleArg
9 | args = SingleArg().args
10 |
11 | ''' visualize the number of operators being queued on each device for replayer
12 | '''
13 |
14 | def init_fig_base(cnt):
15 | h = math.ceil(math.sqrt(cnt))
16 | w = math.ceil(cnt / h)
17 | fig_base = w * 100 + h * 10 + 1
18 | return fig_base, 0
19 |
20 | with open(os.path.join(args.path, 'queue_status.json'), 'r') as fp:
21 | rst = json.load(fp)
22 |
23 | MAXIMUM_GROUP = 4
24 | plt.figure(num=1, figsize=(8, 6))
25 | clrs = sns.color_palette("husl", MAXIMUM_GROUP+1)
26 |
27 | ### shape = (time+num_of_nodes_queued, num_data)
28 | data = np.array(sorted(rst['data'], key=lambda x:x[0])).T
29 |
30 | sample_num = 1000
31 | if sample_num is None:
32 | mask = np.ones(data.shape[1], dtype=bool)
33 | else:
34 | mask = np.zeros(data.shape[1], dtype=bool)
35 | sample_idx = np.random.choice(data.shape[1], sample_num, replace=False)
36 | mask[sample_idx] = True
37 |
38 | group_dict = {}
39 | for idx, n in sorted(enumerate(rst['names']), key=lambda x: x[1]):
40 | group = n.split('->')[0]
41 | if group not in group_dict:
42 | group_dict[group] = []
43 | group_dict[group].append(idx)
44 |
45 | fig_base, _ = init_fig_base(min(MAXIMUM_GROUP, len(group_dict)))
46 | for idx, (group, name_idx_list) in enumerate(group_dict.items()):
47 | if idx >= MAXIMUM_GROUP:
48 | break
49 | ax = plt.subplot(fig_base + idx)
50 |     for i, name_idx in enumerate(name_idx_list):
51 |         ax.plot(data[0][mask]/1000., data[name_idx+1][mask], c=clrs[i % len(clrs)], label=rst['names'][name_idx])
52 | plt.legend()
53 | plt.xlabel('Time (ms)')
54 | plt.ylabel('# of operators being queued')
55 | plt.title(group)
56 | plt.show()
57 |
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/dpro/hvd/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/hvd/__init__.py
--------------------------------------------------------------------------------
/dpro/logger_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os, sys
3 |
4 | from .base import Singleton
5 |
6 | LOG_LEVEL_NAME = ["DEBUG", "INFO", "WARNING", "ERROR", "FATAL"]
7 |
8 | ## Ref: https://stackoverflow.com/questions/12980512/custom-logger-class-and-correct-line-number-function-name-in-log
9 | # This code is mainly copied from the python logging module, with minor modifications
10 |
11 | # _srcfile is used when walking the stack to check when we've got the first
12 | # caller stack frame.
13 | #
14 | if hasattr(sys, 'frozen'): #support for py2exe
15 | _srcfile = "logging%s__init__%s" % (os.sep, __file__[-4:])
16 | elif __file__[-4:].lower() in ['.pyc', '.pyo']:
17 | _srcfile = __file__[:-4] + '.py'
18 | else:
19 | _srcfile = __file__
20 | _srcfile = os.path.normcase(_srcfile)
21 |
22 | @Singleton
23 | class SingleLogger:
24 | def __init__(self, path, name, logging_level="INFO", is_clean=False, show_progress=False):
25 | dirname = path if os.path.isdir(path) else os.path.dirname(path)
26 |         dirname = os.path.join(dirname, ".log")
27 | if not os.path.exists(dirname):
28 | os.makedirs(dirname)
29 | logfile = os.path.join(dirname, "log_option-" + name + ".txt")
30 | if is_clean and os.path.exists(logfile):
31 | os.remove(logfile)
32 | #! config logging
33 | self.logger = logging.getLogger(name)
34 | log_level = logging_level.lower()
35 | if log_level == "trace":
36 |             _log_level = logging.DEBUG  ### stdlib logging has no TRACE level; fall back to DEBUG
37 | elif log_level == "debug":
38 | _log_level = logging.DEBUG
39 | elif log_level == "warn" or log_level == "warning":
40 | _log_level = logging.WARNING
41 | elif log_level == "error":
42 | _log_level = logging.ERROR
43 | else:
44 | _log_level = logging.INFO
45 | self.logger.setLevel(level=_log_level)
46 |
47 | formatter = logging.Formatter('[%(asctime)s] [%(filename)s:%(lineno)d] %(levelname)s - %(message)s', datefmt="%Y-%m-%d %H:%M:%S")
48 |
49 | #! bind some file stream
50 | handler = logging.FileHandler(logfile)
51 | handler.setLevel(_log_level)
52 | handler.setFormatter(formatter)
53 | self.logger.addHandler(handler)
54 |
55 | if not show_progress:
56 | #! if we want show progress, no need to bind the output stream
57 | console = logging.StreamHandler()
58 | console.setLevel(_log_level)
59 | console.setFormatter(formatter)
60 | self.logger.addHandler(console)
61 |
62 | def info(self, msg, *args, **kwargs):
63 | self._log(logging.INFO, msg, args, **kwargs)
64 |
65 | def error(self, msg, *args, **kwargs):
66 | self._log(logging.ERROR, msg, args, **kwargs)
67 |
68 | def debug(self, msg, *args, **kwargs):
69 | self._log(logging.DEBUG, msg, args, **kwargs)
70 |
71 | def warn(self, msg, *args, **kwargs):
72 | self._log(logging.WARNING, msg, args, **kwargs)
73 |
74 | def warning(self, msg, *args, **kwargs):
75 | self._log(logging.WARNING, msg, args, **kwargs)
76 |
77 | def _log(self, level, msg, args, exc_info=None, extra=None):
78 | """
79 | Low-level logging routine which creates a LogRecord and then calls
80 | all the handlers of this logger to handle the record.
81 | """
82 | # Add wrapping functionality here.
83 | if _srcfile:
84 | #IronPython doesn't track Python frames, so findCaller throws an
85 | #exception on some versions of IronPython. We trap it here so that
86 | #IronPython can use logging.
87 | try:
88 | fn, lno, func = self.findCaller()
89 | except ValueError:
90 | fn, lno, func = "(unknown file)", 0, "(unknown function)"
91 | else:
92 | fn, lno, func = "(unknown file)", 0, "(unknown function)"
93 | if exc_info:
94 | if not isinstance(exc_info, tuple):
95 | exc_info = sys.exc_info()
96 | record = self.logger.makeRecord(
97 | self.logger.name, level, fn, lno, msg, args, exc_info, func, extra)
98 | self.logger.handle(record)
99 |
100 |
101 | def findCaller(self):
102 | """
103 | Find the stack frame of the caller so that we can note the source
104 | file name, line number and function name.
105 | """
106 | f = logging.currentframe()
107 | #On some versions of IronPython, currentframe() returns None if
108 | #IronPython isn't run with -X:Frames.
109 | if f is not None:
110 | f = f.f_back
111 | rv = "(unknown file)", 0, "(unknown function)"
112 | while hasattr(f, "f_code"):
113 | co = f.f_code
114 | filename = os.path.normcase(co.co_filename)
115 | if filename == _srcfile:
116 | f = f.f_back
117 | continue
118 | rv = (co.co_filename, f.f_lineno, co.co_name)
119 | break
120 | return rv
121 |
122 |
123 |
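124 | ### Example usage (illustrative; SingleLogger is wrapped by the Singleton
125 | ### decorator, so the first call fixes the log directory and option name):
126 | # logger = SingleLogger("/tmp/logs", "collect", logging_level="INFO")
127 | # logger.info("start analysis")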
--------------------------------------------------------------------------------
/dpro/memory/.gitignore:
--------------------------------------------------------------------------------
1 | *_test.py
2 | scripts/
--------------------------------------------------------------------------------
/dpro/memory/README.md:
--------------------------------------------------------------------------------
1 | # Memory Estimation
2 |
3 | Usage:
4 | ```python
5 | from memory import MemoryEstimator
6 |
7 | memory_estimator = MemoryEstimator("TENSORFLOW")
8 | estimated_memory_usage = memory_estimator.estimate(dag, param_dict)
9 | ```
10 |
11 | Default unit: GB (as returned by `MemoryEstimator.estimate`)
12 |
13 |
14 | ## TODO
15 |
16 | - [ ] Workload-specific: we need to know the model.
17 | 
18 | e.g., selecting the forward nodes:
19 | ```py
20 | def _is_forward(name):
21 | if name.startswith(DEL.join([RANK0_PREFIX, FORWARD_CAT]) + "bert"):
22 | return True
23 | return False
24 | ```
25 |
26 |
--------------------------------------------------------------------------------
/dpro/memory/__init__.py:
--------------------------------------------------------------------------------
1 | from .estimator import MemoryEstimator
2 |
3 |
4 | __all__ = [
5 | "MemoryEstimator"
6 | ]
--------------------------------------------------------------------------------
/dpro/memory/cost_model.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | from copy import deepcopy
3 |
4 | from .gradient_accumulation import get_gradient_accumulation_edited_graph
5 | from ..logger_utils import SingleLogger
6 | from ..cost_model.base import _BaseGraphPass
7 | from .recomputation import get_recomputation_edited_graph
8 | from ..replay import Replayer
9 | from .utils import *
10 |
11 |
12 | def get_execution_time(dag, clct):
13 | replayer = Replayer(dag=dag, _step_num=1,
14 | leaf_dirs=clct.all_prefix_list(),
15 | dump_path=clct.pm.path,
16 | comm_backend=clct.comm_backend,
17 | byteps_graph=clct.byteps_graph)
18 | step_end_time_ms = [
19 | t / 1000 for t in replayer.replayAndDelay(None).values()]
20 | return max(step_end_time_ms)
21 |
22 | def has_recomputation(schedule):
23 | for op in schedule.operators:
24 | if op.requires_grad is False:
25 | return True
26 | return False
27 |
28 | class MemoryGraphPass(_BaseGraphPass):
29 | def __init__(self, opt):
30 | super().__init__(opt)
31 | self.token = ["gradient_accumulation", "recomputation"]
32 | self.cnts = [0, 0]
33 |
34 | def init_search_space(self, candidates, dag, pkg):
35 | candidate_strategies = []
36 | candidate_weights = []
37 | for i, strategy in enumerate(self.token):
38 | if strategy == "recomputation":
39 | if has_recomputation(self.opt.memory_estimator.schedule):
40 | continue
41 |
42 | func = self.func_factory(strategy)
43 | estimated_time, estimated_memory = func(dag, self.opt.clct)
44 | if estimated_memory > self.opt.memory_budget:
45 | continue
46 | candidate_strategies.append((strategy, None, None))
47 |             candidate_weights.append(1./(self.cnts[i] + 1))  ### down-weight strategies applied more often
48 |
49 | return candidate_strategies, candidate_weights
50 |
51 | def apply(self, s, dag, pkg):
52 | if s[0] == "gradient_accumulation":
53 | if self.opt.memory_estimator.batch_size > 1:
54 | self.opt.memory_estimator.batch_size /= 2
55 | get_gradient_accumulation_edited_graph(dag)
56 | self.cnts[0] += 1
57 | elif s[0] == "recomputation":
58 | get_recomputation_edited_graph(
59 | dag, self.opt.memory_estimator.schedule, "speed")
60 | self.cnts[1] += 1
61 | else:
62 | raise NotImplementedError
63 | return True, [], []
64 |
65 | def func_factory(self, strategy):
66 | func_name = "_get_estimated_time_and_memory_of_" + strategy
67 | return getattr(self, func_name)
68 |
69 | def _get_estimated_time_and_memory_of_gradient_accumulation(self, dag, clct):
70 | dag_copy = deepcopy(dag)
71 | get_gradient_accumulation_edited_graph(dag_copy)
72 | estimated_time = get_execution_time(dag_copy, clct)
73 |
74 | self.opt.memory_estimator.batch_size /= 2
75 | estimated_memory = self.opt.memory_estimator.estimate(
76 | dag, clct.para_dict)
77 | self.opt.memory_estimator.batch_size *= 2 # restore
78 |
79 | SingleLogger().info("Estimated time and memory after applying gradient accumulation: {:.2f}ms, {:.2f}GB".format(
80 | estimated_time, estimated_memory
81 | ))
82 | return estimated_time, estimated_memory
83 |
84 | def _get_estimated_time_and_memory_of_recomputation(self, dag, clct):
85 | dag_copy = deepcopy(dag)
86 | prev_nodes = deepcopy(self.opt.memory_estimator.schedule.operators)
87 | get_recomputation_edited_graph(
88 | dag_copy, self.opt.memory_estimator.schedule, "speed")
89 | estimated_time = get_execution_time(dag_copy, clct)
90 |
91 | estimated_memory = self.opt.memory_estimator.estimate(
92 | dag, clct.para_dict)
93 |
94 | # dirty implementation ...
95 | for op, prev_op in zip(self.opt.memory_estimator.schedule.operators, prev_nodes):
96 | op.requires_grad = prev_op.requires_grad
97 |
98 | SingleLogger().info("Estimated time and memory after applying recomputation: {:.2f}ms, {:.2f}GB".format(
99 | estimated_time, estimated_memory
100 | ))
101 | return estimated_time, estimated_memory
102 |
103 |
104 | class IncreasingBatchSizeCostModel(_BaseGraphPass):
105 | def __init__(self, opt):
106 | super().__init__(opt)
107 | self.token = ["increase_batch_size"]
108 | self.cnt = 0
109 |
110 | def init_search_space(self, candidates, dag, pkg):
111 | candidate_strategies = []
112 | candidate_weights = []
113 | for strategy in self.token:
114 | func = self.func_factory(strategy)
115 | estimated_time, estimated_memory = func(dag, self.opt.clct)
116 | candidate_strategies.append((strategy, None, None))
117 | candidate_weights.append(1./(self.cnt + 1))
118 |
119 | return candidate_strategies, candidate_weights
120 |
121 | def apply(self, s, dag, pkg):
122 | # TODO(yuchen): determine batch size upper bound
123 | if self.opt.memory_estimator.batch_size < 1024:
124 | self.opt.memory_estimator.batch_size *= 2
125 | self._update_dag(dag)
126 | self.cnt += 1
127 | return True, [], []
128 |
129 | def func_factory(self, strategy):
130 | func_name = "_get_estimated_time_and_memory_of_" + strategy
131 | return getattr(self, func_name)
132 |
133 | def _update_dag(self, dag):
134 | computation_nodes = filter_out_comm_nodes(dag)
135 | update_time_by_scale(dag.subgraph(computation_nodes), 0.8)
136 |
137 | def _get_estimated_time_and_memory_of_increase_batch_size(self, dag, clct):
138 | dag_copy = deepcopy(dag)
139 | self._update_dag(dag_copy)
140 |
141 | estimated_time = get_execution_time(dag_copy, clct)
142 |
143 | self.opt.memory_estimator.batch_size *= 2
144 | estimated_memory = self.opt.memory_estimator.estimate(
145 | dag, clct.para_dict)
146 | self.opt.memory_estimator.batch_size /= 2 # restore
147 |
148 | SingleLogger().info("Estimated time and memory after applying increasing batch size: {:.2f}ms, {:.2f}GB".format(
149 | estimated_time, estimated_memory
150 | ))
151 | return estimated_time, estimated_memory
152 |
153 | def load_init_ckpt(self, G_prime=None):
154 | return None, None, []
155 |
156 | def load_ckpt(self):
157 | return
158 |
159 | def checkpoint(self):
160 | return
161 |
162 | def flush(self, is_accept):
163 | return
164 |
--------------------------------------------------------------------------------
/dpro/memory/estimator.py:
--------------------------------------------------------------------------------
1 | from .node import Node
2 | from .schedule import Schedule
3 | from .utils import *
4 |
5 | import networkx as nx
6 |
7 |
8 | class MemoryEstimator:
9 |
10 | def __init__(self, platform):
11 | self.platform = platform
12 | self.default_batch_size = 32 # TODO(yuchen): should read from graph
13 | self.batch_size = self.default_batch_size
14 | self._schedule = None
15 | self._cached_result = 0
16 |
17 | @property
18 | def schedule(self):
19 | return self._schedule
20 |
21 | @schedule.setter
22 | def schedule(self, val):
23 | self._schedule = val
24 |
25 | def _compose_operator_schedule(self, dag, param_dict) -> Schedule:
26 | forward_nodes = get_forward_nodes(dag.nodes)
27 | forward_graph = dag.subgraph(forward_nodes).copy()
28 |
29 | leaf_nodes = get_leaf_nodes(forward_graph)
30 | forward_graph.remove_nodes_from(leaf_nodes)
31 | leaf_nodes = remove_nodes_prefix(
32 | leaf_nodes, DEL.join([RANK0_PREFIX, FORWARD_CAT]))
33 |
34 | sorted_forward_nodes = nx.topological_sort(forward_graph)
35 | sorted_forward_nodes = remove_nodes_prefix(
36 | sorted_forward_nodes, DEL.join([RANK0_PREFIX, FORWARD_CAT]))
37 |
38 | metadata = param_dict.metainfo.tf_meta
39 | operator_schedule = Schedule(self.platform)
40 | trace_times = nx.get_node_attributes(dag, "avg")
41 | trace_times = {remove_node_prefix(k, DEL.join(
42 | [RANK0_PREFIX, FORWARD_CAT])): v for k, v in trace_times.items()}
43 | for node in leaf_nodes:
44 | op = Node.from_metadata(
45 | node, metadata, trace_times[node])
46 | operator_schedule.add(op)
47 |
48 | for node in sorted_forward_nodes:
49 | op = Node.from_metadata(node, metadata, trace_times[node])
50 | operator_schedule.add(op)
51 |
52 | return operator_schedule
53 |
54 | def _simulate_memory_allocation(self, operator_schedule) -> float:
55 | peak_size = 0
56 | total_activations = 0
57 | total_param_size = 0
58 |
59 | def _get_param_size():
60 | # including optimizer states, such as momentums
61 | nonlocal total_param_size
62 | for param in operator_schedule.parameters:
63 | total_param_size += param.get_output_size()
64 |
65 | def _simulate_forward_propagation():
66 | nonlocal total_activations, peak_size
67 | for op in operator_schedule.operators:
68 | if op.requires_grad:
69 | total_activations += op.get_output_size()
70 |
71 | temp_size = op.get_temp_size()
72 | peak_size = max(peak_size, total_activations + temp_size)
73 |
74 | def _simulate_backward_propagation():
75 | nonlocal total_activations, peak_size
76 | restore_list = []
77 | for i, op in reversed(list(enumerate(operator_schedule.operators))):
78 | output_grad_size = op.get_output_size()
79 |
80 | j = i
81 | while j >= 0 and not operator_schedule.operators[j].requires_grad:
82 | total_activations += operator_schedule.operators[j].get_output_size(
83 | )
84 | operator_schedule.operators[j].requires_grad = True
85 | restore_list.append(operator_schedule.operators[j])
86 | j -= 1
87 |
88 | temp_size = op.get_temp_size()
89 | peak_size = max(peak_size, total_activations +
90 | output_grad_size + temp_size)
91 | total_activations -= output_grad_size
92 |
93 | # restore
94 | for op in restore_list:
95 | op.requires_grad = False
96 |
97 | def _byte_to_GB(size):
98 | return size / (1000**3)
99 |
100 | _get_param_size()
101 | _simulate_forward_propagation()
102 | _simulate_backward_propagation()
103 |
104 | peak_size, total_param_size = _byte_to_GB(
105 | peak_size), _byte_to_GB(total_param_size)
106 |
107 | peak_size *= self.batch_size / self.default_batch_size
108 |
109 | # TODO(yuchen): Not expandable. This is for Adam.
110 | total = peak_size + total_param_size / 3 * 8
111 | self._cached_result = total
112 | return total
113 |
114 | def estimate(self, dag, param_dict):
115 | """Estimate memory usage based on computation graph
116 |
117 | Args:
118 | dag (nx.DiGraph): computation graph
119 | param_dict (ParameterDict): operator information
120 |
121 | Returns:
122 | [float]: memory usage in GB
123 | """
124 | if not self._schedule:
125 | self._schedule = self._compose_operator_schedule(dag, param_dict)
126 | return self._simulate_memory_allocation(self._schedule)
127 |
128 | @property
129 | def cached_memory_estimation(self):
130 | return self._cached_result
131 |
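132 | ### Example usage (illustrative; mirrors dpro/memory/README.md):
133 | # estimator = MemoryEstimator("TENSORFLOW")
134 | # usage_gb = estimator.estimate(dag, param_dict)  # peak memory usage in GB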
--------------------------------------------------------------------------------
/dpro/memory/gradient_accumulation.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 |
3 |
4 | def get_gradient_accumulation_edited_graph(dag, verbose=False):
5 | _apply_gradient_accumulation(dag, verbose)
6 | return True
7 |
8 |
9 | def _apply_gradient_accumulation(dag, verbose):
10 | _update_dag(dag, verbose)
11 |
12 |
13 | def _update_dag(dag, verbose):
14 | computation_nodes = filter_out_comm_nodes(dag)
15 | update_time_by_scale(dag.subgraph(computation_nodes), 0.8)
16 |
17 | # TODO(yuchen): deal with other ranks
18 | filtered_nodes = get_forward_backward_nodes(dag.nodes)
19 | subgraph = dag.subgraph(filtered_nodes)
20 |
21 | target = filtered_nodes[0] # first node
22 |
23 | mapping = {node: node+"_ga" for node in subgraph.nodes}
24 | subgraph = nx.relabel_nodes(subgraph, mapping)
25 |
26 | insert_nodes(dag, subgraph, target)
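27 | 
28 | ### Net effect of the helpers above: computation-node times are scaled by 0.8,
29 | ### and the forward/backward subgraph is duplicated with an "_ga" suffix and
30 | ### re-inserted at its first node, modeling the extra accumulation micro-batch.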
27 |
--------------------------------------------------------------------------------
/dpro/memory/node.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Node:
5 | def __init__(self, name, op, input, dtype, shape, time):
6 | self._name = name
7 | self._op = op
8 | self._input = input
9 | self._dtype = dtype
10 | self._shape = shape
11 | self._requires_grad = True
12 | self._inplace = False
13 | self._time = time
14 |
15 | @classmethod
16 | def from_metadata(cls, name, metadata, time):
17 | """create node from metadata
18 |
19 | Args:
20 | name (str): node name
21 | metadata (dict): from metadata.json
22 | time (float): execution time based on trace
23 |
24 | Returns:
25 | [Node]: created node
26 | """
27 | def _get_op(node_def):
28 | return node_def.get("op").lower()
29 |
30 | def _get_input(node_def):
31 | inputs = node_def.get("input")
32 | if not inputs:
33 | return None
34 |
35 | names = []
36 | for input in inputs:
37 | full_name = input.get("name")
38 | name = full_name.rsplit(':')[0]
39 | names.append(name)
40 | return names
41 |
42 | def _get_dtype(node_def):
43 | output = node_def.get("output")
44 | if not output:
45 | return None
46 | dtype = output[0].get("dtype")
47 | if not dtype or dtype == "string":
48 | return None
49 |
50 | if dtype.endswith("_ref"):
51 | dtype = dtype[:-len("_ref")]
52 |
53 | return np.dtype(dtype)
54 |
55 | def _get_shape(node_def):
56 | output = node_def.get("output")
57 | if not output:
58 | return None
59 | return tuple(output[0].get("shape"))
60 |
61 | if name not in metadata:
62 | return None
63 |
64 | node_def = metadata[name]
65 |
66 | return cls(name,
67 | _get_op(node_def),
68 | _get_input(node_def),
69 | _get_dtype(node_def),
70 | _get_shape(node_def),
71 | time)
72 |
73 | def is_valid(self):
74 | """check the node's validity
75 |
76 | Returns:
77 | [bool]: validity
78 | """
79 | def _is_valid_op(op):
80 |             if op in ["noop"]:  ### note: op is lowercased by _get_op
81 | return False
82 | return True
83 |
84 | def _is_not_none(value):
85 | if value is None:
86 | return False
87 | return True
88 |
89 | def _is_valid_shape(shape):
90 | if not isinstance(shape, tuple):
91 | return False
92 | if not shape or shape[0] == -1:
93 | return False
94 | return True
95 |
96 | return all([
97 | _is_valid_op(self.op),
98 | _is_not_none(self.dtype),
99 | _is_not_none(self.input),
100 | _is_valid_shape(self.shape)
101 | ])
102 |
103 | def is_parameter(self):
104 | """Whether this node is parameter node
105 |
106 | Returns:
107 | [bool]: is parameter node
108 | """
109 | if self._op == "variablev2":
110 | return True
111 | return False
112 |
113 | def get_num_ele(self):
114 | """get number of elements
115 |
116 | Returns:
117 | [int]: number of elements
118 | """
119 | return np.prod(self.shape)
120 |
121 | def get_output_size(self):
122 | """get output size
123 |
124 | Returns:
125 | [float]: size in Byte
126 | """
127 | return np.prod(self.shape) * self.dtype.itemsize
128 |
129 | def get_temp_size(self):
130 | """get temporary buffer size
131 |
132 | useful for cudnn workspace size
133 |
134 | Returns:
135 |             [float]: size in Byte
136 | """
137 | return 0
138 |
139 | @property
140 | def name(self):
141 | """get name
142 |
143 | Returns:
144 | [str]: node name
145 | """
146 | return self._name
147 |
148 | @property
149 | def op(self):
150 | """get operator type
151 |
152 | Returns:
153 | [str]: operator type
154 | """
155 | return self._op
156 |
157 | @property
158 | def input(self):
159 | """get input list
160 |
161 | Returns:
162 | [list]: input node name list
163 | """
164 | return self._input
165 |
166 | @property
167 | def dtype(self):
168 | """get data type
169 |
170 | Returns:
171 | [numpy.dtype]: data type
172 | """
173 | return self._dtype
174 |
175 | @property
176 | def shape(self):
177 | """get output shape
178 |
179 | Returns:
180 | [tuple]: output shape
181 | """
182 | return self._shape
183 |
184 | @property
185 | def requires_grad(self):
186 | """get requires_grad
187 |
188 | Returns:
189 | [bool]: requires_grad
190 | """
191 | return self._requires_grad
192 |
193 | @requires_grad.setter
194 | def requires_grad(self, val):
195 | self._requires_grad = val
196 |
197 | @property
198 | def inplace(self):
199 | """get inplace status
200 |
201 | Returns:
202 | [bool]: inplace
203 | """
204 | return self._inplace
205 |
206 | @inplace.setter
207 | def inplace(self, val):
208 | self._inplace = val
209 |
210 | @property
211 | def time(self):
212 | return self._time
213 |
214 |     def __repr__(self):
215 |         return "Name: %s, op: %s, input: [%s], dtype: %s, shape: %s" % (
216 |             self.name, self.op, ", ".join(self.input or []),
217 |             str(self.dtype), str(self.shape)
218 |         )
219 |
--------------------------------------------------------------------------------
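For illustration, a minimal usage sketch for Node.from_metadata; the metadata entry below is hypothetical but mirrors the fields the parser reads ("op", "input", "output"):

from dpro.memory.node import Node

# Hypothetical metadata entry in the metadata.json format expected above.
metadata = {
    "bert/encoder/matmul": {
        "op": "MatMul",
        "input": [{"name": "bert/encoder/x:0"}, {"name": "bert/encoder/w:0"}],
        "output": [{"dtype": "float32", "shape": [32, 1024]}],
    }
}

node = Node.from_metadata("bert/encoder/matmul", metadata, time=0.12)
assert node.is_valid()
print(node.get_output_size())  # 32 * 1024 * 4 = 131072 bytes
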
/dpro/memory/recomputation.py:
--------------------------------------------------------------------------------
1 | import re
2 | import networkx as nx
3 | from itertools import islice
4 |
5 | from .utils import *
6 | from ..logger_utils import SingleLogger
7 |
8 | class CheckpointsSelector:
9 | @classmethod
10 | def get_checkpoint_selector(cls, mode):
11 | if mode == "speed":
12 | return SpeedCheckpointsSelector()
13 | elif mode == "memory":
14 | return MemoryCheckpointsSelector()
15 | elif mode == "topk":
16 | return TopkCheckpointsSelector()
17 | else:
18 |             raise ValueError("unknown checkpoint selection mode: %s" % mode)
19 |
20 | @staticmethod
21 | def select_checkpoints(schedule):
22 | raise NotImplementedError
23 |
24 |
25 | class SpeedCheckpointsSelector(CheckpointsSelector):
26 | @staticmethod
27 | def select_checkpoints(schedule):
28 | return list(filter(lambda n: len(re.findall("conv2d|conv|matmul", n.op))
29 | > 0, schedule.operators))
30 |
31 |
32 | class MemoryCheckpointsSelector(CheckpointsSelector):
33 | @staticmethod
34 | def select_checkpoints(schedule):
35 | # TODO(yuchen): https://arxiv.org/pdf/1604.06174.pdf
36 | raise NotImplementedError
37 |
38 |
39 | class TopkCheckpointsSelector(CheckpointsSelector):
40 | k = 0.1
41 |
42 | @staticmethod
43 | def select_checkpoints(schedule):
44 | num_checkpoints = int(
45 | TopkCheckpointsSelector.k * len(schedule.operators))
46 |         sorted_ops_indices = [i for i, _ in sorted(
47 |             enumerate(schedule.operators), key=lambda x: x[1].time)]
48 | topk_indices = sorted(sorted_ops_indices[-num_checkpoints:])
49 | expensive_ops = [schedule.operators[i] for i in topk_indices]
50 | return expensive_ops
51 |
52 |
53 | def get_recomputation_edited_graph(dag, schedule, mode, verbose=False):
54 | selector = CheckpointsSelector.get_checkpoint_selector(mode)
55 | checkpoints = selector.select_checkpoints(schedule)
56 | if not checkpoints:
57 | SingleLogger().warn("No checkpoints found! Recomputation Aborted!")
58 | return False
59 |
60 | if verbose:
61 | names = [node.name for node in checkpoints]
62 | SingleLogger().info("select %d checkpoints: %s" %
63 | (len(names), ', '.join(names)))
64 |
65 | _apply_recomputation(dag, schedule, checkpoints, verbose)
66 |
67 | return True
68 |
69 |
70 | def _update_schedule(schedule, checkpoints):
71 | name_to_checkpoints = {node.name: node for node in checkpoints}
72 | for op in schedule.operators:
73 | if op.name in name_to_checkpoints:
74 | op.requires_grad = True
75 | else:
76 | op.requires_grad = False
77 |
78 |
79 | def _apply_recomputation(dag, schedule, checkpoints, verbose):
80 | _update_schedule(schedule, checkpoints)
81 | _update_dag(dag, checkpoints, verbose)
82 |
83 |
84 | def _compose_subgraph_between_two_nodes(dag, source, target):
85 | if not nx.has_path(dag, source, target):
86 |         # a missing path is possible, e.g., between the parallel K/Q/V matmuls
87 | return None
88 |
89 | paths_between_two_nodes = list(
90 | islice(nx.shortest_simple_paths(dag, source, target), 10))
91 | nodes_between_set = {
92 | node for path in paths_between_two_nodes for node in path}
93 |
94 | subgraph = dag.subgraph(nodes_between_set)
95 |
96 | # add suffix to avoid the same name in a graph
97 | mapping = {node: node+"_sg" for node in subgraph.nodes}
98 | return nx.relabel_nodes(subgraph, mapping)
99 |
100 |
101 | def _get_last_forward_node(dag):
102 | forward_nodes = get_forward_nodes(dag.nodes)
103 | forward_graph = dag.subgraph(forward_nodes).copy()
104 | leaf_nodes = get_leaf_nodes(forward_graph)
105 | forward_graph.remove_nodes_from(leaf_nodes)
106 | sorted_forward_nodes = list(nx.topological_sort(forward_graph))
107 | sorted_forward_nodes = filter_out_node_by_name(
108 | sorted_forward_nodes, "read")
109 | return sorted_forward_nodes[-1]
110 |
111 |
112 | def _get_target_backward_node(dag, target):
113 | target_bwp_op_name = target.replace("->FW.", "->BW.gradients/")
114 | target_bwp_op_name += "_grad/" + target_bwp_op_name.rsplit('/')[-1]
115 | if target_bwp_op_name in dag.nodes:
116 | return target_bwp_op_name
117 | return None
118 |
119 |
120 | def _update_dag(dag, checkpoints, verbose):
121 | filtered_nodes = filter_out_comm_nodes(dag.nodes)
122 | # TODO(yuchen): deal with other ranks
123 | filtered_nodes = get_rank0_nodes(filtered_nodes)
124 | names_to_nodes = {get_node_name(node): node for node in filtered_nodes}
125 | checkpoints_to_nodes = {node.name: names_to_nodes[node.name]
126 | for node in checkpoints if node.name in names_to_nodes}
127 |
128 | target = _get_last_forward_node(dag) # last node in forward
129 | if verbose:
130 | SingleLogger().info("Get the last forward node %s." % target)
131 |
132 | for checkpoint in checkpoints[::-1]:
133 | source = checkpoints_to_nodes[checkpoint.name]
134 | if verbose:
135 | SingleLogger().info("source %s, target %s" % (source, target))
136 | subgraph = _compose_subgraph_between_two_nodes(dag, source, target)
137 |
138 | if subgraph:
139 | if verbose:
140 | SingleLogger().info("ops to be copied: %s" % (', '.join(subgraph.nodes)))
141 |
142 | target_bwp_op = _get_target_backward_node(dag, target)
143 | if verbose:
144 | SingleLogger().info("target backward op: %s" % (str(target_bwp_op)))
145 |
146 | # rewire
147 | insert_nodes(dag, subgraph, target_bwp_op)
148 |
149 | target = source
150 |
--------------------------------------------------------------------------------
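A hedged driver sketch for the selectors above; dag (a networkx DiGraph built from traces) and schedule (a populated Schedule) are assumed to already exist:

from dpro.memory.recomputation import get_recomputation_edited_graph

# "topk" keeps the most expensive 10% of operators as checkpoints
# (see TopkCheckpointsSelector.k above); "speed" keeps conv/matmul ops.
ok = get_recomputation_edited_graph(dag, schedule, mode="topk", verbose=True)
if not ok:
    print("no checkpoints selected; graph left unchanged")
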
/dpro/memory/schedule.py:
--------------------------------------------------------------------------------
1 | from .node import Node
2 |
3 |
4 | class Schedule:
5 | def __init__(self, platform):
6 | self._parameters = []
7 | self._operators = []
8 | self._node_collection = {}
9 | self.lists = self._get_platform_memory_lists(platform)
10 |
11 | def add(self, node):
12 | """add node into schedule and determine whether it runs in place
13 |
14 | Args:
15 | node ([Node]): operator
16 |
17 | Returns:
18 | [bool]: Status
19 | """
20 | if not isinstance(node, Node):
21 | return False
22 |
23 | self._node_collection[node.name] = node
24 |
25 | if node.is_parameter():
26 | self._parameters.append(node)
27 | elif node.is_valid() and self._is_in_whitelist(node):
28 | self._set_inplace(node)
29 | self._operators.append(node)
30 | else:
31 | return False
32 |
33 | return True
34 |
35 | @property
36 | def parameters(self):
37 | return self._parameters
38 |
39 | @property
40 | def operators(self):
41 | return self._operators
42 |
43 | def _is_in_whitelist(self, node):
44 | if node.op not in self.lists.WHITE_LIST:
45 | return False
46 | return True
47 |
48 | def _should_inplace(self, input_node, output_node):
49 | if output_node.op not in self.lists.CWISE_LIST:
50 | return False
51 |
52 | if input_node.inplace:
53 | return False
54 |
55 | if input_node.dtype != output_node.dtype:
56 | return False
57 |
58 | if input_node.get_num_ele() != output_node.get_num_ele():
59 | return False
60 |
61 | return True
62 |
63 | def _set_inplace(self, node):
64 | input_names = node.input
65 | for input_name in input_names:
66 | input_node = self._node_collection.get(input_name)
67 | if input_node and self._should_inplace(input_node, node):
68 | node.inplace = True
69 | break
70 |
71 | def _get_platform_memory_lists(self, platform):
72 | try:
73 | if platform.lower() == "tensorflow":
74 | from ..ml_platform.tensorflow import memory_lists
75 | elif platform.lower() == "mxnet":
76 | from ..ml_platform.mxnet import memory_lists
77 | else:
78 | raise NotImplementedError()
79 | self.lists = memory_lists
80 |         except (ImportError, NotImplementedError):
81 | raise NotImplementedError(
82 | "Memory Estimator Does Not Support %s" % platform)
83 |
84 | return self.lists
85 |
--------------------------------------------------------------------------------
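A small sketch of populating a Schedule; nodes must be added in execution order so that _set_inplace can look up producers already in the collection. trace_entries is an assumed iterable of (name, metadata, time) tuples:

from dpro.memory.node import Node
from dpro.memory.schedule import Schedule

schedule = Schedule(platform="tensorflow")
for name, metadata, t in trace_entries:  # assumed to exist
    node = Node.from_metadata(name, metadata, t)
    if node is not None:
        schedule.add(node)  # returns False for non-whitelisted or invalid ops
print(len(schedule.operators), "operators,", len(schedule.parameters), "parameters")
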
/dpro/memory/utils.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | import networkx as nx
3 |
4 | DEL = "->"
5 | RANK0_PREFIX = "host0.rank0"
6 | FORWARD_CAT = "FW."
7 | BACKWARD_CAT = "BW."
8 |
9 |
10 | def remove_prefix(text, prefix):
11 | if text.startswith(prefix):
12 | return text[len(prefix):]
13 | return text
14 |
15 |
16 | def remove_nodes_prefix(nodes, prefix):
17 | func = partial(remove_prefix, prefix=prefix)
18 | return list(map(func, nodes))
19 |
20 |
21 | def remove_node_prefix(node, prefix):
22 | func = partial(remove_prefix, prefix=prefix)
23 | return func(node)
24 |
25 |
26 | def filter_out_comm_nodes(nodes):
27 |     def _is_not_comm(name):
28 |         if name.startswith("server") or name.startswith("worker"):
29 |             return False
30 |         return True
31 | 
32 |     return list(filter(_is_not_comm, nodes))
33 |
34 |
35 | def get_node_name(name):
36 | return name.rsplit(".")[-1]
37 |
38 |
39 | def get_rank0_nodes(nodes):
40 | # TODO(yuchen): Not expandable
41 | def _is_rank0(name):
42 | if name.startswith(RANK0_PREFIX):
43 | return True
44 | return False
45 |
46 | return list(filter(_is_rank0, nodes))
47 |
48 |
49 | def get_leaf_nodes(dag):
50 | return [node for node in dag.nodes if dag.out_degree(node) == 1
51 | and dag.in_degree(node) == 0]
52 |
53 |
54 | def filter_out_node_by_name(nodes, name):
55 |     return list(filter(lambda node: name not in node, nodes))
56 |
57 |
58 | def get_forward_nodes(nodes):
59 | # TODO(yuchen): Not expandable
60 | def _is_forward(name):
61 | if name.startswith(DEL.join([RANK0_PREFIX, FORWARD_CAT]) + "bert"):
62 | return True
63 | return False
64 |
65 | return list(filter(_is_forward, nodes))
66 |
67 |
68 | def get_forward_backward_nodes(nodes):
69 | # TODO(yuchen): Not expandable
70 | def _is_forward(name):
71 | if name.startswith(DEL.join([RANK0_PREFIX, FORWARD_CAT]) + "bert") or \
72 | name.startswith(DEL.join([RANK0_PREFIX, BACKWARD_CAT]) + "bert"):
73 | return True
74 | return False
75 |
76 | return list(filter(_is_forward, nodes))
77 |
78 |
79 | def get_input_nodes(dag):
80 | return [u for u, deg in dag.in_degree() if not deg]
81 |
82 |
83 | def get_output_nodes(dag):
84 | return [u for u, deg in dag.out_degree() if not deg]
85 |
86 |
87 | def insert_nodes(dag, subgraph, target):
88 | if not target:
89 | return
90 | # copy subgraph
91 | dag.add_nodes_from(subgraph.nodes.data())
92 | dag.add_edges_from(subgraph.edges.data())
93 |
94 | # remove previous nodes
95 | prev_nodes = list(dag.predecessors(target))
96 | for prev_node in prev_nodes:
97 | dag.remove_edge(prev_node, target)
98 |
99 | # connect subgraph output to target
100 | outputs = get_output_nodes(subgraph)
101 | dag.add_edge(outputs[0], target)
102 |
103 |
104 | def update_time_by_scale(dag, scale):
105 | trace_times = nx.get_node_attributes(dag, "avg")
106 | for k, v in trace_times.items():
107 | trace_times[k] = v * scale
108 | nx.set_node_attributes(dag, trace_times, "avg")
109 |
--------------------------------------------------------------------------------
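For illustration, the graph helpers above on a toy DAG:

import networkx as nx
from dpro.memory.utils import get_input_nodes, get_output_nodes, update_time_by_scale

g = nx.DiGraph()
g.add_edges_from([("a", "b"), ("b", "c")])
nx.set_node_attributes(g, {"a": 1.0, "b": 2.0, "c": 3.0}, "avg")

print(get_input_nodes(g))     # ['a']  (in-degree 0)
print(get_output_nodes(g))    # ['c']  (out-degree 0)
update_time_by_scale(g, 0.5)  # halve every node's "avg" time
print(nx.get_node_attributes(g, "avg"))  # {'a': 0.5, 'b': 1.0, 'c': 1.5}
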
/dpro/mg_generate_dataset.py:
--------------------------------------------------------------------------------
1 | ''' Generate Dataset to train the Cost Model
2 | '''
3 | from tqdm import tqdm
4 | import os,sys
5 |
6 | from .arg_utils import SingleArg
7 | from .logger_utils import SingleLogger
8 |
9 | args = SingleArg().args
10 | logger = SingleLogger(args.path.split(',')[0],
11 | args.option, args.logging_level,
12 | is_clean=args.clean,
13 | show_progress=args.progress)
14 | logger.info(args)
15 |
16 | if args.option == "optimize":
17 | if args.sub_option == "train_amp":
18 |         from .cost_model._mixed_precision.amp_pred import AMPPredictor, train_amp_model
19 | train_amp_model()
20 | exit(0)
21 | elif args.sub_option == "train_gpu":
22 |         from .cost_model._gpu_predict.gpu_pred import train_gpu_model
23 | train_gpu_model()
24 | exit(0)
--------------------------------------------------------------------------------
/dpro/ml_platform/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/ml_platform/__init__.py
--------------------------------------------------------------------------------
/dpro/ml_platform/mxnet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/ml_platform/mxnet/__init__.py
--------------------------------------------------------------------------------
/dpro/ml_platform/tensorflow/__init__.py:
--------------------------------------------------------------------------------
1 | from . import memory_lists
--------------------------------------------------------------------------------
/dpro/ml_platform/tensorflow/amp_lists.py:
--------------------------------------------------------------------------------
1 |
2 | whitelist = [
3 | #if CUDA_VERSION >= 9010 // Fp16 BatchMatMul is slow before CUDA 9.1.
4 | "BatchMatMul",
5 | "BlockLSTM", "BlockLSTMGrad", "Conv2D", "Conv2DBackpropFilter",
6 | "Conv2DBackpropInput",
7 |
8 | # # TODO(benbarsdell): Enable these when Tensor Core kernels are
9 | # # available for 3D convolutions.
10 | # "Conv3D",
11 | # "Conv3DBackpropFilter",
12 | # "Conv3DBackpropFilterV2",
13 | # "Conv3DBackpropInput",
14 | # "Conv3DBackpropInputV2",
15 | # "CudnnRNN", "CudnnRNNBackprop", "CudnnRNNBackpropV2",
16 | # "CudnnRNNBackpropV3", "CudnnRNNV2", "CudnnRNNV3", "GRUBlockCell",
17 | # "GRUBlockCellGrad", "LSTMBlockCell", "LSTMBlockCellGrad",
18 |
19 |
20 | # # TODO(benbarsdell): Enable these when fast and safe fp16 kernels are
21 |     # # available for depthwise convolutions.
22 | # "DepthwiseConv2dNative",
23 | # "DepthwiseConv2dNativeBackpropFilter",
24 | # "DepthwiseConv2dNativeBackpropInput",
25 |
26 | "MatMul",
27 | ]
28 |
29 | greylist = [
30 | "Add",
31 | "AddN",
32 | "AddV2",
33 | "AvgPool",
34 | "AvgPool3D",
35 | "AvgPool3DGrad",
36 | "AvgPoolGrad",
37 | "BiasAdd",
38 | "BiasAddGrad",
39 | "BiasAddV1",
40 | "Elu",
41 | "EluGrad",
42 | "Erf",
43 | "Erfc",
44 | "FloorDiv",
45 | "FusedBatchNormV2",
46 | "FusedBatchNormGradV2",
47 | "FusedBatchNormV3",
48 | "FusedBatchNormGradV3",
49 | "Inv",
50 | "LeakyRelu",
51 | "LeakyReluGrad",
52 | "Mul",
53 | "Prod",
54 | "RealDiv",
55 | "Reciprocal",
56 | "Sigmoid",
57 | "SigmoidGrad",
58 | "Softplus",
59 | "SoftplusGrad",
60 | "Sqrt",
61 | "Sub",
62 | "Tanh",
63 | "TanhGrad",
64 | ]
65 |
66 | blacklist = [
67 | "Exp",
68 | "Expm1",
69 | "L2Loss",
70 | "Log",
71 | "Log1p",
72 | "LogSoftmax",
73 | "Mean",
74 | "Pow",
75 | "SaveV2",
76 | "Softmax",
77 | "SoftmaxCrossEntropyWithLogits",
78 | "SparseSoftmaxCrossEntropyWithLogits",
79 | "Sum",
80 | ]
81 |
82 | clearlist = [
83 | "Abs",
84 | "ArgMax",
85 | "ArgMin",
86 | "BatchToSpace",
87 | "BatchToSpaceND",
88 | "BroadcastTo",
89 | "Ceil",
90 | "CheckNumerics",
91 | "ClipByValue",
92 | "Concat",
93 | "ConcatV2",
94 | "DepthToSpace",
95 | "DynamicPartition",
96 | "DynamicStitch",
97 | "Enter",
98 | "EnsureShape",
99 | "Equal",
100 | "Exit",
101 | "ExpandDims",
102 | "Fill",
103 | "Floor",
104 | "Gather",
105 | "GatherNd",
106 | "GatherV2",
107 | "Greater",
108 | "GreaterEqual",
109 | "Identity",
110 | "IdentityN",
111 | "IsFinite",
112 | "IsInf",
113 | "IsNan",
114 | "Less",
115 | "LessEqual",
116 | "Max",
117 | "MaxPool",
118 | "MaxPool3D",
119 | "MaxPool3DGrad",
120 | "MaxPool3DGradGrad",
121 | "MaxPoolGrad",
122 | "MaxPoolGradGrad",
123 | "MaxPoolGradGradV2",
124 | "MaxPoolGradV2",
125 | "MaxPoolV2",
126 | "Maximum",
127 | "Merge",
128 | "Min",
129 | "Minimum",
130 | "MirrorPad",
131 | "MirrorPadGrad",
132 | "Neg",
133 | "NextIteration",
134 | "NotEqual",
135 | "OneHot",
136 | "OnesLike",
137 | "Pack",
138 | "Pad",
139 | "PadV2",
140 | "PreventGradient",
141 | "Rank",
142 | "Relu",
143 | "Relu6",
144 | "Relu6Grad",
145 | "ReluGrad",
146 | "Reshape",
147 | "ResizeNearestNeighbor",
148 | "ResizeNearestNeighborGrad",
149 | "Reverse",
150 | "ReverseSequence",
151 | "ReverseV2",
152 | "Round",
153 | "Select",
154 | "Shape",
155 | "ShapeN",
156 | "Sign",
157 | "Size",
158 | "Slice",
159 | "Snapshot",
160 | "SpaceToBatch",
161 | "SpaceToBatchND",
162 | "SpaceToDepth",
163 | "Split",
164 | "SplitV",
165 | "Squeeze",
166 | "StackPopV2",
167 | "StackPushV2",
168 | "StopGradient",
169 | "StridedSlice",
170 | "StridedSliceGrad",
171 | "Switch",
172 | "TensorArrayConcatV3",
173 | "TensorArrayGatherV3",
174 | "TensorArrayReadV3",
175 | "TensorArrayScatterV3",
176 | "TensorArraySplitV3",
177 | "TensorArrayWriteV3",
178 | "Tile",
179 | "TopK",
180 | "TopKV2",
181 | "Transpose",
182 | "Where",
183 | "ZerosLike",
184 | ]
185 |
--------------------------------------------------------------------------------
/dpro/ml_platform/tensorflow/memory_lists.py:
--------------------------------------------------------------------------------
1 |
2 | # TODO(yuchen): support CNN
3 | # it only works for BERT now
4 | WHITE_LIST = [
5 | 'mul',
6 | 'addv2',
7 | 'batchmatmulv2',
8 | 'square',
9 | 'l2loss',
10 | 'matmul',
11 | 'sum',
12 | 'tile',
13 | 'sqrt',
14 | 'transpose',
15 | 'neg',
16 | 'randomuniform',
17 | 'cast',
18 | 'greaterequal',
19 | 'squareddifference',
20 | 'softmax',
21 | 'pow',
22 | 'gatherv2',
23 | 'onehot',
24 | 'unsortedsegmentsum',
25 | 'logsoftmax',
26 | 'pad',
27 | 'mean',
28 | 'sub',
29 | 'realdiv',
30 | 'stridedslice',
31 | ]
32 |
33 | # coefficient-wise operator
34 | # see https://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html
35 | CWISE_LIST = [
36 | 'mul',
37 | 'addv2',
38 | 'square',
39 | 'sqrt',
40 | 'neg',
41 | 'squareddifference',
42 | 'pow',
43 | 'sub',
44 | 'realdiv'
45 | ]
46 |
--------------------------------------------------------------------------------
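Both lists hold lowercased TensorFlow op types; callers are expected to lower-case an op before a membership test, e.g.:

from dpro.ml_platform.tensorflow import memory_lists

op_type = "MatMul"  # as it appears in a GraphDef
print(op_type.lower() in memory_lists.WHITE_LIST)  # True
print(op_type.lower() in memory_lists.CWISE_LIST)  # False: matmul is not coefficient-wise
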
/dpro/ml_platform/tensorflow/util.py:
--------------------------------------------------------------------------------
1 | from google.protobuf.json_format import MessageToJson
2 | from google.protobuf.text_format import Parse
3 | import tensorflow as tf
4 | import sys, os
5 | import json
6 | import networkx as nx
7 |
8 | def wrap_read_graphdef(graphdef_path):
9 | try:
10 | GraphDef = tf.GraphDef
11 |     except AttributeError:
12 | GraphDef = tf.compat.v1.GraphDef
13 | if graphdef_path.endswith("pbtxt"):
14 | with open(graphdef_path, "r") as f:
15 | pb = f.read()
16 | graph_def = Parse(pb, GraphDef())
17 | json_string = MessageToJson(graph_def)
18 | graph_def = json.loads(json_string)
19 | else:
20 | with open(graphdef_path, "r") as f:
21 | graph_def = json.load(f)
22 | graph = nx.DiGraph()
23 | for node in graph_def["node"]:
24 | if "input" in node:
25 | for input_tensor_name in node["input"]:
26 | input_node_name = input_tensor_name.split(":")[0]
27 | graph.add_edge(input_node_name, node["name"])
28 | gml_path = os.path.join(os.path.dirname(graphdef_path), "graphdef_dag.gml")
29 | nx.write_gml(graph, gml_path)
30 | print("Create gml file at {}".format(gml_path))
31 |
32 | if __name__ == "__main__":
33 | wrap_read_graphdef(sys.argv[1])
--------------------------------------------------------------------------------
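A usage sketch (the input path is a placeholder); the helper accepts a text-format GraphDef (.pbtxt) or a JSON dump of one, and writes graphdef_dag.gml next to the input file:

from dpro.ml_platform.tensorflow.util import wrap_read_graphdef

wrap_read_graphdef("/path/to/traces/graph.pbtxt")  # emits /path/to/traces/graphdef_dag.gml
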
/dpro/nvprof/analyze.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import os
3 | import json
4 | import argparse
5 | import networkx as nx
6 | import sys
7 |
8 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
9 |
10 | from ..logger_utils import get_logger
11 |
12 | parser = argparse.ArgumentParser(description="Trace Analysis",
13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter)
14 | parser.add_argument("--option", type=str, default="gpu_trace",
15 | choices=["gpu_trace"],
16 |                     help="The type of analysis to process. Including:\n" +
17 |                          "* gpu_trace: parse an nvprof GPU trace dump and\n" +
18 |                          "  summarize kernels per CUDA stream\n")
19 | parser.add_argument("--path", type=str, required=True, help="The paths of traces you want to analyze; multiple paths are separated with commas.")
20 | parser.add_argument("--logging_level", type=int, default=20, help="Logging level")
21 | parser.add_argument("--clean", action="store_true", help="Flush the log file")
22 | parser.add_argument("--progress", action="store_true", help="Show the progress bar if it is set, disable the std output")
23 | args = parser.parse_args()
24 |
25 | logger = get_logger(args)
26 | logger.info(args)
27 |
28 | def printIter(_iter, prefix=''):
29 | for _cmp in _iter:
30 | logger.info(prefix + _cmp)
31 |
32 | def handle(path, platform):
33 | with open(path, 'r') as fp:
34 | s = fp.readlines()
35 | i = 0
36 | sta = {}
37 | while i < len(s):
38 | if "Device Context Stream" in s[i]:
39 | i += 1
40 | break
41 | i += 1
42 | while i < len(s):
43 | if len(s[i]) < 162:
44 | break
45 | try:
46 | stream_id = int(s[i][162:168])
47 | except:
48 |             logger.info("line length %d, previous line: %s", len(s[i]), s[i-1])
49 | raise
50 |         #! strip the index appended to each kernel name to reduce duplicates;
51 |         #! only the kernel name itself matters
52 | name = s[i][170:].split(" [")[0].split("<")[0]
53 | if stream_id not in sta:
54 | sta[stream_id] = {"cmp": set(), "mem": set()}
55 | if "memcpy" in name or "memset" in name:
56 | sta[stream_id]["mem"].add(name)
57 | else:
58 | sta[stream_id]["cmp"].add(name)
59 | i += 1
60 | for k, v in sta.items():
61 | logger.info("Stream ID: %-2d => cmp: %-10d : mem %-10d %s" % (k, len(v["cmp"]), len(v["mem"]), '' if len(v["mem"]) <= 2 else str(v["mem"])))
62 | #! Used for debug
63 | sta1 = sta2 = None
64 | if platform == 'pytorch':
65 | sta1 = sta[7]
66 | sta2 = sta[21]
67 | elif platform == "tensorflow":
68 | sta1 = sta[182]
69 | sta2 = sta[214]
70 | if sta1 is not None and sta2 is not None:
71 | logger.info("platform: %s" % (platform))
72 | logger.info(" intersection: ")
73 | printIter(sta1["cmp"].intersection(sta2["cmp"]), prefix="\t ")
74 | logger.info(" minor set: ")
75 | printIter(sta2["cmp"], prefix="\t ")
76 | logger.info(" major set: ")
77 | printIter(sta1["cmp"], prefix="\t ")
78 |
79 | if __name__ == "__main__":
80 | if args.option == "gpu_trace":
81 | cur_dir = os.path.abspath(args.path)
82 | root, dirs, files = list(os.walk(cur_dir, topdown=True))[0]
83 | for file in files:
84 |             #! file name must follow the format <date>_<index>_<platform>_<model>.txt,
85 |             #! e.g., 20191217_04_pytorch_mnist.txt, and must be in lowercase.
86 | if "txt" in file and "log" not in file:
87 | #! Get the platform name, e.g. mxnet, tensorflow or pytorch
88 | platform = file.split("_")[2]
89 | cur_path = os.path.join(root, file)
90 | logger.info(cur_path)
91 | handle(cur_path, platform)
92 | else:
93 | raise NotImplementedError()
94 |
95 |
--------------------------------------------------------------------------------
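Since the module parses its arguments at import time, it is meant to be launched as a script; a hedged invocation with a placeholder path:

python3 -m dpro.nvprof.analyze --option gpu_trace --path /path/to/nvprof_txt_dumps
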
/dpro/optimizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/optimizer/__init__.py
--------------------------------------------------------------------------------
/dpro/optimizer/mcts.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import math
3 | from enum import Enum
4 |
5 | from .base import Optimizer, args_  # GraphState is redefined below
6 | from ..logger_utils import SingleLogger
7 |
8 | MAX_LOOP = 1000
9 | MAX_TREE_DEPTH = 1000
10 | UCB_GAMMA = args_.ucb_gamma
11 |
12 | class GraphExpand(Enum):
13 | NOT = 0
14 | PARTIAL = 1
15 | FULLY = 2
16 |
17 | class GraphState:
18 | def __init__(self, depth):
19 | self.visit_cnt = 1
20 | self.quality = -1
21 |
22 | self.space = None
23 | self.childs = None
24 | self.parent = None
25 | self.depth = depth
26 |
27 |         # Whether the actions have been traversed: not, partially or fully
28 | self.state = GraphExpand.NOT
29 |
30 | self.strategy = None
31 | self.iter_time = None
32 |
33 | def update_expand_state(self):
34 | if self.childs is None:
35 | self.state = GraphExpand.NOT
36 | return
37 |         assert self.space is not None
38 | if len(self.childs) == len(self.space):
39 | self.state = GraphExpand.FULLY
40 | else:
41 | self.state = GraphExpand.PARTIAL
42 |
43 | class MCTSOptimizer(Optimizer):
44 | ''' Monte Carlo Tree Search '''
45 |
46 | def __init__(self, *args, **kwargs):
47 | super(MCTSOptimizer, self).__init__(*args, **kwargs)
48 | self.loop_cnt = 0
49 | self.GS_root = None
50 | self.opt_GS = None
51 | self.ucb_type = args_.ucb_type
52 | if self.ucb_type != "MAX" and self.ucb_type != "AVG":
53 | raise ValueError(
54 | "UCB type should be MAX or AVG, but {} is given.".format(self.ucb_type))
55 | self.no_mutation = args_.no_mutation
56 |
57 | def search(self):
58 | ### Initialize the root graph state
59 | self.GS_root = GraphState(depth=0)
60 | self.GS_root.strategy = []
61 |
62 | while self.check_loop_time() and self.check_loop_num():
63 | GS = self.tree_policy(self.GS_root)
64 | reward = self.default_policy(GS)
65 |             SingleLogger().info("Speedup over the original graph: %6.4f %%" % (100 * reward))
66 | self.backpropagation(GS, reward)
67 | if args_.ucb_visual:
68 | self.visualize_tree()
69 | self.show_opt_strategies()
70 | return
71 |
72 | def visualize_tree(self):
73 | def iter_print(GS, cnt):
74 | ### `cnt` is used to decide how many parent branches to print for current nodes
75 | LENOFNODE = 11
76 | LENOFARROW = 5
77 | node_string = " %5.4f %% " % (
78 | GS.quality * 100) if GS.quality >= 0 else " -%5.4f %% " % (-GS.quality * 100)
79 | sys.stdout.write(node_string)
80 | assert len(node_string) == LENOFNODE
81 | if GS.childs is None:
82 | return
83 | for idx, child in enumerate(GS.childs):
84 | if idx > 0:
85 | sys.stdout.write("\n{}".format(" "*(LENOFNODE + LENOFARROW//2)))
86 | sys.stdout.write("{}".format(" "*((LENOFNODE + LENOFARROW) * (GS.depth - cnt))))
87 | sys.stdout.write("{}".format(("|" + " " * (LENOFNODE + LENOFARROW - 1))*(cnt)))
88 | sys.stdout.write("{}".format("|" if idx < (len(GS.childs) - 1) else "\\"))
89 | sys.stdout.write("{}".format("-"*(LENOFARROW - LENOFARROW//2 - 1)))
90 | else:
91 | sys.stdout.write("{}".format('-'*LENOFARROW))
92 | if idx < (len(GS.childs) - 1):
93 | next_cnt = cnt + 1
94 | else:
95 | next_cnt = cnt
96 | iter_print(child, next_cnt)
97 |
98 | iter_print(self.GS_root, 0)
99 | sys.stdout.write("\n")
100 |
101 | def show_opt_strategies(self):
102 |         SingleLogger().info("Best strategy: %d steps, speedup over the original graph: %6.4f %%" %
103 |                             (len(self.opt_GS.strategy), 100 * self.opt_GS.quality))
104 |
105 | def check_loop_num(self):
106 | self.loop_cnt += 1
107 | if self.loop_cnt > MAX_LOOP:
108 | return False # End
109 | else:
110 | return True # continue
111 |
112 | def check_loop_time(self):
113 | return True # continue
114 |
115 | def tree_policy(self, GS):
116 | while self.fully_expanded(GS):
117 | GS = self.best_UCB(GS)
118 | return self.expansion(GS)
119 |
120 | def default_policy(self, GS):
121 | if not self.no_mutation:
122 | while not self.terminal(GS):
123 | action = self.pick_strategy(GS.space)[0]
124 | GS_c = GraphState(depth=(GS.depth+1))
125 | GS_c.strategy = GS.strategy.copy()
126 | GS_c.strategy.append(action)
127 | GS = GS_c
128 | ### Evaluate the final graph
129 | if GS.iter_time is None:
130 | self.check_search_space(GS)
131 | cost = GS.iter_time
132 | SingleLogger().debug("Evaluate the strategy %s" % (str(GS.strategy)))
133 | return (self.base_cost - cost)/self.base_cost
134 |
135 | def backpropagation(self, GS, reward):
136 | if self.ucb_type == "MAX":
137 | GS.quality = max(reward, GS.quality)
138 | elif self.ucb_type == "AVG":
139 | GS.quality += reward
140 | GS.visit_cnt += 1
141 | if GS.depth == 0:
142 | return
143 | else:
144 | self.backpropagation(GS.parent, reward)
145 |
146 | def best_UCB(self, GS):
147 | GS_opt = c_opt = None
148 | for GS_c in GS.childs:
149 | if self.ucb_type == "MAX":
150 | c = GS_c.quality + UCB_GAMMA * \
151 | math.sqrt((2 * math.log(GS.visit_cnt)) / GS_c.visit_cnt)
152 | elif self.ucb_type == "AVG":
153 | c = GS_c.quality / GS_c.visit_cnt + UCB_GAMMA * \
154 | math.sqrt((2 * math.log(GS.visit_cnt)) / GS_c.visit_cnt)
155 | else:
156 | raise RuntimeError("Invalid UCB_type")
157 | if GS_opt is None or c > c_opt:
158 | c_opt = c
159 | GS_opt = GS_c
160 | return GS_opt
161 |
162 | def fully_expanded(self, GS):
163 | if self.terminal(GS):
164 | return False
165 |
166 | if GS.state == GraphExpand.NOT or GS.state == GraphExpand.PARTIAL:
167 | return False
168 | else:
169 | return True
170 |
171 | def expansion(self, GS):
172 |         ### Pick an unvisited child to expand
173 | assert GS.state == GraphExpand.NOT or GS.state == GraphExpand.PARTIAL
174 | action = self.pick_unvisited(GS)
175 | if action is None:
176 | ### Current state is the terminal state, expansion failed
177 | return GS
178 |
179 | GS_c = GraphState(depth=(GS.depth+1))
180 | GS_c.strategy = GS.strategy.copy()
181 | GS_c.strategy.append(action)
182 | GS_c.parent = GS
183 | if GS.childs is None:
184 | GS.childs = []
185 | GS.childs.append(GS_c)
186 |
187 | if len(GS.space) == len(GS.childs):
188 | GS.state = GraphExpand.FULLY
189 | else:
190 | GS.state = GraphExpand.PARTIAL
191 |
192 | return GS_c
193 |
194 | def pick_unvisited(self, GS):
195 | ### TODO (huhanpeng): how to pick with some heuristic
196 | for idx in range(len(GS.space)):
197 | if GS.space[idx][1] == 0:
198 | GS.space[idx][1] += 1
199 | return GS.space[idx][0]
200 | return None
201 |
202 | def check_search_space(self, GS):
203 | ### TODO (huhanpeng): we can do some pruning here
204 | if GS.space is None:
205 | candidates, new_dag = self.candidate_selection(GS, topk=None)
206 | search_space, _ = self.init_search_space(candidates, new_dag)
207 | # The integer value is used as a counter
208 | GS.space = [[action, 0] for action in search_space]
209 |
210 | def terminal(self, GS):
211 | self.check_search_space(GS)
212 | if GS.depth > MAX_TREE_DEPTH or len(GS.space) == 0:
213 | return True
214 | else:
215 | return False
216 |
--------------------------------------------------------------------------------
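For reference, the score computed in best_UCB is the standard UCB1 rule, exploitation plus an exploration bonus; a standalone sketch of the "MAX" branch:

import math

def ucb1(child_quality, parent_visits, child_visits, gamma):
    # quality exploits known reward; the sqrt term favors rarely-visited children
    return child_quality + gamma * math.sqrt(2 * math.log(parent_visits) / child_visits)

print(ucb1(0.05, parent_visits=10, child_visits=2, gamma=1.0))  # ~1.57
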
/dpro/parameter.py:
--------------------------------------------------------------------------------
1 | ''' Manage the parameter info of a DNN model
2 | '''
3 | import re
4 |
5 | from .trace_utils import *
6 |
7 | class ParameterDict:
8 | def __init__(self, _pm, platform, metadata_path=None):
9 | ### collect metadata
10 |         if metadata_path is None:
11 |             meta_file = _pm.search(FileName.METADATA)
12 |             metadata_path = os.path.dirname(meta_file) if meta_file else None
13 |         if metadata_path is None:
14 |             SingleLogger().error("{} not found. Fail to load metadata".format(FileName.METADATA.value))
15 |
16 | if platform == "MXNET":
17 | from .ml_platform.mxnet.metadata import MetaInfo
18 | SingleLogger().info("Use MXNET metadata")
19 | elif platform == "TENSORFLOW":
20 | from .ml_platform.tensorflow.metadata import MetaInfo
21 | SingleLogger().info("Use TENSORFLOW metadata")
22 | else:
23 | raise NotImplementedError()
24 |
25 | self.metainfo = MetaInfo(metadata_path)
26 | self.cnt = len(self.metainfo.gradient_name_list)
27 |
28 | def gradient_name_list(self):
29 | return self.metainfo.gradient_name_list
30 |
31 | def gradient_num(self):
32 | return self.cnt
33 |
34 | def wrap_read_dfg(self, *args, **kwargs):
35 | return self.metainfo.wrap_read_dfg(*args, **kwargs)
36 |
37 | def standard_name(self, op_name):
38 | ''' Convert op_names in the original traces to standard names
39 | `op_cat.op_name.sub_op`
40 | '''
41 | return self.metainfo.standard_name(op_name)
42 |
43 | ### below methods are related to tensors/Communication
44 |
45 | def tensor_id_to_tensor_name(self, tensor_id):
46 | return self.metainfo.tensor_id_to_tensor_name(tensor_id)
47 |
48 | def tensor_name_to_tensor_id(self, name):
49 | return self.metainfo.tensor_name_to_tensor_id(name)
50 |
51 | def tensor_id2size(self, tensor_id):
52 | return self.metainfo.ret_tensor_size(tensor_id)
53 |
54 | def tensor_id2update_id(self, tensor_id):
55 |         ''' tensor_id may be "max", which returns the maximum update id '''
56 | return self.metainfo.tensor_id2update_id(tensor_id)
57 |
58 | def tensor_grp_size(self, op_name):
59 | total_size = 0
60 | for tensor_id_str in op_name.split("+"):
61 | tensor_id = int(tensor_id_str)
62 | total_size += self.tensor_id2size(tensor_id)
63 | return total_size
64 |
65 | ### below is related op_name
66 |
67 | def ret_metadata(self, *args, **kwargs):
68 | return self.metainfo.ret_metadata(*args, **kwargs)
69 |
70 | def ret_rawmeta(self, op_name):
71 | return self.metainfo.ret_rawmeta(op_name)
72 |
73 | def check_amp_lists(self, op_name):
74 | return self.metainfo.check_amp_lists(op_name)
75 |
76 | def parse_op_type(self, op_name):
77 | return self.metainfo.parse_op_type(op_name)
78 |
79 | def ret_op_precision(self, op_name):
80 | return self.metainfo.ret_op_precision(op_name)
81 |
82 | def in_metadata(self, op_name):
83 | return self.metainfo.in_metadata(op_name)
84 |
85 | def is_const(self, op_name):
86 | return self.metainfo.is_const(op_name)
87 |
88 | def is_variable(self, op_name):
89 | return self.metainfo.is_variable(op_name)
90 |
91 | def parse_model_name(self):
92 | return self.metainfo.parse_model_name()
--------------------------------------------------------------------------------
/dpro/xla_cm_entry.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 |
4 | from .logger_utils import SingleLogger
5 | from .cost_model._xla.xla_module_cost_model import XLAModuleCostModel
6 | from .cost_model._xla.gen_dataset_utils import XlaKernelDataset
7 |
8 | try:
9 | import byteps.tensorflow as bps
10 | except:
11 | pass
12 |
13 | try:
14 | import horovod.tensorflow as hvd
15 | except:
16 | pass
17 |
18 | parser = argparse.ArgumentParser(description="Script to launch the kernel dataset generator and train the XLA module cost model.",
19 | formatter_class=argparse.ArgumentDefaultsHelpFormatter)
20 |
21 | parser.add_argument("--mode", type=int, default=0,
22 | help="Different actions with different mode:\n"
23 | " 0: generate training data and train the cost model\n"
24 | " 1: only generate training data\n"
25 |                          " 2: only train the cost model\n"
26 | " 3: only test the cost model")
27 |
28 | parser.add_argument("--trace_dir", type=str, help="Path to the directory containing trace files for a GPU.")
29 | parser.add_argument("--output_dir", type=str, help="Directory where the generated dataset files will be dumped to.")
30 | parser.add_argument("--num_samples", type=int, help="Number of random samples to generate.")
31 | parser.add_argument("--max_cluster_samples", type=int, default=0, help="Number of max cluster samples to generate.")
32 | parser.add_argument("--min_cluster_size", type=int, default=4, help="Minimum subgraph size.")
33 | parser.add_argument("--max_cluster_size", type=int, default=800, help="Maximum subgraph size.")
34 |
35 | parser.add_argument("--batch_size", type=int, default=256,
36 |                     help="Batch size used to train the XLA module cost model.")
37 |
38 | parser.add_argument("--dataset_dir", type=str,
39 | help="Path to the directory containing generated dataset files.")
40 |
41 | args = parser.parse_args()
42 |
43 | logger = SingleLogger(args.output_dir, "xla_cm", "INFO")
44 | logger.info(args)
45 |
46 | if args.mode == 0 or args.mode == 1:
47 | SingleLogger().info("Generate Kernel dataset ...")
48 |     print("""Using configuration:
49 | \t Trace Dir: {}\n\t Output Dir: {}\n\t # Random Samples: {}
50 | \t # Max Cluster Samples: {}\n\t Min Cluster Size: {}\n\t Max Cluster Size: {}""".format(
51 | args.trace_dir, args.output_dir, args.num_samples,
52 | args.max_cluster_samples, args.min_cluster_size, args.max_cluster_size
53 | ))
54 |
55 | ### Generate Kernel dataset
56 |     XlaKernelDataset.construct_kernel_dataset(args.trace_dir, os.path.join(args.output_dir, "kernel_dataset"),
57 | num_samples=args.num_samples,
58 | num_max_cluster_samples=args.max_cluster_samples,
59 | min_subgraph_level=args.min_cluster_size,
60 | max_subgraph_level=args.max_cluster_size)
61 |
62 | if args.mode == 0 or args.mode == 2:
63 | SingleLogger().info("Train the cost model ...")
64 | ### Train the cost model
65 | assert os.path.exists(os.path.join(args.output_dir, "kernel_dataset"))
66 | XLAModuleCostModel.train_on_dataset(
67 | os.path.join(args.output_dir, "kernel_dataset"),
68 | os.path.join(args.output_dir, "cost_model"),
69 | args.batch_size)
70 |
71 | if args.mode == 3:
72 | SingleLogger().info("Test the cost model ...")
73 | module_cost_model = XLAModuleCostModel(os.path.join(args.output_dir, "cost_model"))
74 | module_cost_model.test_on_dataset(args.dataset_dir)
75 |
76 |
77 |
--------------------------------------------------------------------------------
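A hedged end-to-end invocation (directories are placeholders); with --mode 0 the script first builds <output_dir>/kernel_dataset and then trains the cost model into <output_dir>/cost_model:

python3 -m dpro.xla_cm_entry --mode 0 \
    --trace_dir /path/to/gpu_traces \
    --output_dir /path/to/xla_cm \
    --num_samples 2000 --batch_size 256
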
/dpro/xla_test_generate_cluster_spec.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import networkx as nx
3 | import pickle
4 | import os
5 |
6 | from google.protobuf.json_format import MessageToJson
7 | from google.protobuf.text_format import Parse
8 | import tensorflow as tf
9 | import json
10 |
11 | try:
12 | GraphDef = tf.GraphDef
13 | except:
14 | GraphDef = tf.compat.v1.GraphDef
15 |
16 | # from collect import Collector
17 | from cost_model._xla.pk_graph import PKGraph, postorder_contract_nx
18 | from trace_utils import parse_op_name, parse_pid_from_name
19 |
20 | TRACE_PATH = "/root/capture_file/run_0_dec8"
21 | OUTPUT_PATH = "/root/cluster_spec_test.txt"
22 |
23 | name2index = {}
24 | index2name = {}
25 | index2pid = {}
26 | index2newname = {}
27 |
28 | # logger = SingleLogger("/root", "trash_logger", "info")
29 |
30 | def tf_relabel_func(_name, update_nodes_in_dag):
31 | for prefix in ["Comm.", "Comp.", "BW.", "FW.", "UPDATE_."]:
32 | if _name.startswith(prefix):
33 | return _name
34 | if _name.startswith("^"):
35 | _name = _name[1:]
36 | last_slash_pos = _name.rfind("/")
37 | if last_slash_pos != -1 and last_slash_pos < len(_name)-1 and _name[last_slash_pos+1] == "_":
38 | _name = _name[:last_slash_pos]
39 | if "BytePSPushPull" in _name and "tensor" not in _name:
40 | _name = "Comm." + _name
41 | elif "allreduce" in _name.lower():
42 | if "." in _name:
43 | _, tensor_name = _name.split(".")
44 | if "_" in tensor_name:
45 | tensor_name = tensor_name.split("_")[0]
46 | _name = "Comm." + tensor_name
47 | else:
48 | _name = "UPDATE_." + _name
49 | else:
50 | if update_nodes_in_dag is not None and _name in update_nodes_in_dag:
51 | _name = "UPDATE_." + _name
52 | elif _name.startswith("gradients"):
53 | _name = "BW." + _name
54 | else:
55 | _name = "FW." + _name
56 | return _name
57 |
58 | def wrap_read_graphdef(graphdef_path):
59 | if graphdef_path.endswith("pbtxt"):
60 | with open(graphdef_path, "r") as f:
61 | pb = f.read()
62 | graph_def = Parse(pb, GraphDef())
63 | json_string = MessageToJson(graph_def)
64 | graph_def = json.loads(json_string)
65 | else:
66 | with open(graphdef_path, "r") as f:
67 | graph_def = json.load(f)
68 | graph = nx.DiGraph()
69 | for node in graph_def["node"]:
70 | if "input" in node:
71 | for input_tensor_name in node["input"]:
72 | input_node_name = input_tensor_name.split(":")[0]
73 | graph.add_edge(input_node_name, node["name"])
74 | update_nodes_in_dag = set()
75 | def recursive_add_succs(_node):
76 | for succ_ in graph.successors(_node):
77 | update_nodes_in_dag.add(succ_)
78 | recursive_add_succs(succ_)
79 | for node in graph.nodes:
80 | if "allreduce" in node.lower() or "bytepspushpull" in node.lower():
81 | recursive_add_succs(node)
82 | new_graph = nx.DiGraph()
83 | for u, v in graph.edges:
84 | new_graph.add_edge(tf_relabel_func(u, update_nodes_in_dag), tf_relabel_func(v, update_nodes_in_dag))
85 | return new_graph, update_nodes_in_dag
86 |
87 | def relabel_dag_node(_dag) -> nx.DiGraph:
88 | def relabel_func(old_label):
89 | if ("BW" in old_label or "FW" in old_label or "Comm" in old_label or "UPDATE" in old_label) and "^" not in old_label:
90 | layer_name = parse_op_name(old_label)
91 | layer_pid = parse_pid_from_name(old_label)
92 | # if layer_pid not in self.cost_models or layer_name not in self.cost_models[layer_pid].graph_def_util.operation_names:
93 | # return "DEL~"+old_label
94 | # TODO (huhanpeng): different pids share the same index
95 | # if "Comm" in old_label and layer_name in name2index and layer_pid in name2index[layer_name]:
96 | # layer_index = name2index[layer_name][layer_pid]
97 | # new_name = ("[%d]"%layer_index).join(old_label.split(layer_name))
98 | # return new_name
99 |
100 | layer_index = len(index2name)
101 | new_name = ("[%d]"%layer_index).join(old_label.split(layer_name))
102 | index2name[layer_index] = layer_name
103 | index2pid[layer_index] = layer_pid
104 | if layer_name not in name2index:
105 | name2index[layer_name] = {}
106 | name2index[layer_name][layer_pid] = layer_index
107 | new_label = ("[%d]"%layer_index).join(old_label.split(layer_name))
108 | index2newname[layer_index] = new_label
109 | return new_label
110 | else:
111 | return old_label
112 | return nx.relabel_nodes(_dag, relabel_func)
113 |
114 |
115 | # remove dependency from FW to UPDATE
116 | # for (u, v) in list(dag.edges):
117 | # dag.remove_edge(u, v)
118 | xla_candidates = set()
119 | with open("/root/xla_candidates.txt", "r") as f:
120 | for line in f:
121 | xla_candidates.add(line.strip())
122 |
123 | dag, update_nodes_in_dag = wrap_read_graphdef("/root/bert/traces/before_mark_for_compilation_5.pbtxt")
124 |
125 | dag = relabel_dag_node(dag)
126 |
127 | pkg = PKGraph(dag, dag)
128 |
129 | fw_nodes = []
130 | bw_nodes = []
131 | comm_nodes = []
132 | update_nodes = []
133 |
134 | for node in dag.nodes:
135 | if "FW" in node:
136 | fw_nodes.append(node)
137 | elif "BW" in node:
138 | bw_nodes.append(node)
139 | elif "Comm" in node:
140 | comm_nodes.append(node)
141 | elif "UPDATE" in node:
142 | update_nodes.append(node)
143 |
144 | print("Len FW nodes: {}, Len BW nodes: {}, Len COMM nodes: {}, Len UPDATE nodes: {}" \
145 | .format(len(fw_nodes), len(bw_nodes), len(comm_nodes), len(update_nodes)))
146 |
147 | BW_graph = dag.subgraph(bw_nodes)
148 | BW_sequence = list(nx.topological_sort(BW_graph))
149 |
150 | num_forbidden = int(len(BW_sequence) / 2)
151 | forbidden_bw = BW_sequence[num_forbidden:]
152 |
153 |
154 |
155 | filtered_nodes = []
156 | for node in dag.nodes:
157 | index = int(node.split("[")[1].split("]")[0])
158 | orig_name = index2name[index]
159 | if orig_name.split(".")[1] not in xla_candidates:
160 | filtered_nodes.append(node)
161 |
162 | if not os.path.exists("/root/alter_cluster_spec.pickle"):
163 | # Cluster all FW
164 | source_nodes = sorted(list(dag.nodes), key=lambda x: dag.in_degree(x))
165 |
166 | # Run post order traversal on G
167 | print("Finding maximal clusters in FW...")
168 | visited_nodes = set()
169 | for source in tqdm(source_nodes, total=len(source_nodes)):
170 | if source not in visited_nodes and source in dag.nodes:
171 | _, _, dag = postorder_contract_nx(dag, pkg, source, visited_nodes, forbidden_list= filtered_nodes + comm_nodes + bw_nodes)
172 |
173 | with open("/root/alter_cluster_spec.pickle", "wb") as f:
174 | pickle.dump([fw_nodes, bw_nodes, comm_nodes, update_nodes,
175 | filtered_nodes, index2name, index2pid, dag, pkg], f)
176 | else:
177 | with open("/root/alter_cluster_spec.pickle", "rb") as f:
178 | ( fw_nodes, bw_nodes, comm_nodes, update_nodes, filtered_nodes,
179 | index2name, index2pid, dag, pkg )= pickle.load(f)
180 |
181 | # new_fw_nodes = [node for node in dag.nodes if "FW" in node]
182 |
183 | # # all BW
184 | # print("Finding maximal clusters in all BW...")
185 | # source_nodes = sorted(list(dag.nodes), key=lambda x: dag.in_degree(x))
186 | # visited_nodes = set()
187 | # for source in tqdm(source_nodes, total=len(source_nodes)):
188 | # if source not in visited_nodes and source in dag.nodes:
189 | # _, _, dag = postorder_contract_nx(dag, pkg, source, visited_nodes, forbidden_list= filtered_nodes + comm_nodes + new_fw_nodes)
190 |
191 | # # all BW, size limit 1/2
192 | # print("Finding maximal clusters in all BW...")
193 | # source_nodes = sorted(list(dag.nodes), key=lambda x: dag.in_degree(x))
194 | # visited_nodes = set()
195 | # for source in tqdm(source_nodes, total=len(source_nodes)):
196 | # if source not in visited_nodes and source in dag.nodes:
197 | # _, _, dag = postorder_contract_nx(dag, pkg, source, visited_nodes, forbidden_list= filtered_nodes + comm_nodes + new_fw_nodes, size_limit=int(len(bw_nodes)/2))
198 |
199 | def _get_original_name_pid_from_index(name_):
200 | try:
201 | index = int(name_.split("[")[1].split("]")[0])
202 | except:
203 | print(name_)
204 | input()
205 | return index2name[index], index2pid[index]
206 |
207 | def _get_original_name_pid_from_fused_node(u_):
208 | single_pid = None
209 | orig_names = []
210 | for node_name in u_.split("+"):
211 | orig_name, pid = _get_original_name_pid_from_index(node_name)
212 | orig_names.append(orig_name)
213 | if single_pid is None:
214 | single_pid = pid
215 | else:
216 | if single_pid != pid:
217 | raise RuntimeError("Fused DAG node {} contains ops from different machines.".format(u_))
218 | return orig_names, single_pid
219 |
220 | bw_cluster_sizes = []
221 | bw_cluster_nodes = []
222 | single_pid = -1
223 | for node in dag.nodes:
224 | if "+" in node and "BW" in node:
225 | orig_names, pid = _get_original_name_pid_from_fused_node(node)
226 | if single_pid == -1:
227 | single_pid = pid
228 | else:
229 | if single_pid != pid:
230 | continue
231 | bw_cluster_sizes.append(len(node.split("+")))
232 | bw_cluster_nodes.append(node)
233 |
234 | for idx, node_size in enumerate(bw_cluster_sizes):
235 | if node_size > 10:
236 | print("idx: {}, size: {}".format(idx, node_size))
237 |
238 | clusters_to_ignore = []
239 | while True:
240 |     s = input("Choose a cluster to discard: ")
241 | try:
242 | discard_id = int(s.strip())
243 | clusters_to_ignore.append(discard_id)
244 | print("Remaining clusters:")
245 | for idx, node_size in enumerate(bw_cluster_sizes):
246 | if node_size > 10 and idx not in clusters_to_ignore:
247 | print("idx: {}, size: {}".format(idx, node_size))
248 | except:
249 | break
250 |
251 | nodes_to_ignore = set()
252 | for idx in clusters_to_ignore:
253 | nodes_to_ignore.add(bw_cluster_nodes[idx])
254 |
255 | # dump cluster mapping
256 | cluster_index = 0
257 | with open("/root/partitions_spec.txt", "w") as f:
258 | for node in dag.nodes():
259 | if "+" in node:
260 | orig_names, pid = _get_original_name_pid_from_fused_node(node)
261 | if pid != single_pid:
262 | continue
263 | if node not in nodes_to_ignore:
264 | for orig_node_name in orig_names:
265 | f.write("{} {}\n".format(orig_node_name, cluster_index))
266 | cluster_index += 1
267 | else:
268 | for orig_node_name in orig_names:
269 | f.write("{} {}\n".format(orig_node_name, cluster_index))
270 | cluster_index += 1
--------------------------------------------------------------------------------
/dpro_cli:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | '''
3 | ****************************************
4 | * _______________________________ *
5 | * ______ /__ __ \__ __ \_ __ \ *
6 | * _ __ /__ /_/ /_ /_/ / / / / *
7 | * / /_/ / _ ____/_ _, _// /_/ / *
8 | * \__,_/ /_/ /_/ |_| \____/ *
9 | * *
10 | ****************************************
11 | '''
12 | import os, sys
13 | import yaml
14 | from jinja2 import Environment, FileSystemLoader
15 | from dpro.base import bcolors, dpro_dir
16 |
17 | usage_prompt = "usage: dpro