├── .gitignore ├── .gitmodules ├── README.md ├── data ├── TF1.15 │ ├── xla_candidates_bert.txt │ ├── xla_candidates_inceptionv3.txt │ ├── xla_candidates_resnet.txt │ ├── xla_candidates_vgg.txt │ └── xla_candidates_vgg19.txt ├── TF2.4 │ └── xla_candidates_resnet50.txt ├── mx_20200824_resnet50.py ├── tf_20200720.py ├── tf_20200731_resnet50.py └── tf_20200811_pred_error.py ├── docker └── tensorflow.Dockerfile ├── docs ├── backup.md ├── dependency.md ├── format.md ├── nvprof.md ├── profile.md ├── sample_config.yaml └── usage.md ├── dpro ├── __init__.py ├── analyze.py ├── arg_utils.py ├── base.py ├── bps_helper │ ├── __init__.py │ ├── graph.py │ └── preprocess.py ├── collect.py ├── cost_model │ ├── __init__.py │ ├── _gpu_predict │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── dim_reduce.py │ │ ├── gpu_cost_model.py │ │ ├── gpu_pred.py │ │ └── grouper.py │ ├── _mixed_precision │ │ ├── .cost_model │ │ │ ├── CastToFp16.txt │ │ │ ├── CastToFp32.txt │ │ │ ├── Conv2D.txt │ │ │ └── MatMul.txt │ │ ├── __init__.py │ │ ├── amp_cost_model.py │ │ ├── amp_pred.py │ │ ├── dataloader.py │ │ ├── dim_reduce.py │ │ ├── grouper.py │ │ └── test_rst.py │ ├── _tsfs │ │ ├── __init__.py │ │ └── cost_model.py │ ├── _xla │ │ ├── __init__.py │ │ ├── execute_graph.py │ │ ├── gen_dataset_utils.py │ │ ├── gen_samples.py │ │ ├── p_dispersion.py │ │ ├── pk_graph.py │ │ ├── process_trace.py │ │ ├── utils.py │ │ ├── xla_module_cost_model.py │ │ ├── xla_run_generate_kernel_dataset.sh │ │ ├── xla_run_test_module_cm.sh │ │ ├── xla_run_train_module_cm.sh │ │ └── xlatools.py │ ├── base.py │ ├── gpu_models_info.py │ ├── mixed_precision.py │ ├── op_fusion.py │ ├── tensor_fusion.py │ ├── trace_clct.sh │ └── trace_filter.py ├── dag_utils.py ├── debug_utils.py ├── helper │ ├── combine_json.py │ ├── compare_graph.py │ ├── get_iter_time_from_trace.py │ ├── tf_flops_profile.py │ ├── tf_helper.py │ └── visualize.py ├── hvd │ ├── __init__.py │ └── graph.py ├── logger_utils.py ├── memory │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── cost_model.py │ ├── estimator.py │ ├── gradient_accumulation.py │ ├── node.py │ ├── recomputation.py │ ├── schedule.py │ └── utils.py ├── mg_generate_dataset.py ├── ml_platform │ ├── __init__.py │ ├── mxnet │ │ ├── __init__.py │ │ └── metadata.py │ └── tensorflow │ │ ├── __init__.py │ │ ├── amp_lists.py │ │ ├── memory_lists.py │ │ ├── metadata.py │ │ └── util.py ├── nvprof │ └── analyze.py ├── optimizer │ ├── __init__.py │ ├── base.py │ ├── dp.py │ ├── mcmc.py │ └── mcts.py ├── parameter.py ├── replay.py ├── trace_utils.py ├── xla_cm_entry.py └── xla_test_generate_cluster_spec.py ├── dpro_cli ├── requirements.txt ├── setup.py ├── setup.sh └── sleep.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .DS_Store 3 | *.pyc 4 | build/ 5 | .vscode/ 6 | .env 7 | data/data_* 8 | dist/dist/byteprofile_analysis-0.1-py3.8.egg 9 | byteprofile_analysis.egg-info 10 | dist/ 11 | */cost_model/_xla/.cost_model/ 12 | */cost_model/_gpu_predict/.cost_model/ 13 | .idea/ 14 | *egg-info -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/mixbench"] 2 | path = 3rdparty/mixbench 3 | url = https://github.com/ekondis/mixbench.git 4 | [submodule "3rdparty/nvprof2json"] 5 | path = 3rdparty/nvprof2json 6 | url = https://github.com/joapolarbear/nvprof2json.git 7 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This project is used to analyze the trace results profiled via [byteprofile](https://github.com/joapolarbear/byteps), an extended version of [BytePS](https://github.com/bytedance/byteps). 4 | 5 | # Usage 6 | By choosing a different `--option`, this project supports the functionalities shown below. 7 | 8 | ## Statistic 9 | Set arg `--option statistic` to show the statistical results; arg `--path` must be set to the exact trace file path (ending with `.json`). 10 | 11 | ## Visualize the DAG 12 | Set arg `--option graph` to visualize the dependency DAG; arg `--path` must be set to the exact DAG path (ending with `.gml`). 13 | 14 | ## Combine Trace Files 15 | Set arg `--option combine` to combine several trace files into one file. For example, one worker may have two GPUs, each of which generates a trace file; you can use this option and list the paths of these two files using `--path`. 16 | 17 | There are two ways to define the trace paths. 18 | 19 | 1. Use file paths. In this case, `--path` should be a list of file paths, each of which denotes a trace file. The combined trace file will be stored under the same directory as the first trace file. 20 | 2. Use directory paths. In this case, `--path` is a list of directory paths, each of which denotes one worker and contains trace directories of GPUs on this worker. By default, the combined trace file will be stored under the first directory. 21 | 22 | **Note: please ensure that the paths are either all file paths or all directory paths.** 23 | 24 | 25 | If you do not want to combine all the traces, you can use `--filter` to give a comma-separated list of communication operations; then only these communication operations will appear in the combined trace file. For now, the filter only supports communication nodes. An example is shown below. 26 | 27 | ```bash 28 | python3 analyze.py --option combine --path ... --filter Comm.gradient_1,Comm.gradient_2 29 | ``` 30 | 31 | 32 | An example of the combined timeline of 2 GPUs visualized by the [chrome trace tool](chrome://tracing/) is shown below, which uses MNIST as the dataset, running on 2 workers, each with 2 V100 GPUs. In the prefix `Process 0`, the `0` denotes the local rank of this GPU. 33 | 34 | 35 | 36 | ## Compare two trace files 37 | Set arg `--option compare`. Similar to option `combine`, the argument `--path` can be a list of worker trace directories or a list of trace files. When a list of directories is given, traces on one worker will automatically be merged. 38 | 39 | Besides, you can 40 | * set `--xlsx` to export the comparison results to an XLSX file. 41 | * set `--sort` to sort the comparison results. 42 | * set `--head <num>` to display the first `<num>` comparison results. 43 | 44 | 45 | ## Calculate the Critical Path of the DAG 46 | Set arg `--option critical`; here `--path` should be the root trace directory, which defaults to `BYTEPS_TRACE_DIR`. 47 | 48 | **Note that you must use the latest version of byteprofile to run this option.** 49 | 50 | ## Replay based on the traces 51 | Set arg `--option replay` to replay the traces for one worker. 52 | * Use `--path` to specify the path where the worker traces are stored. 53 | * Set `--del_queue` to include each partition and QueueType for communication traces. 54 | * Use `--step_num` to give the number of steps to replay. 
55 | * Set `--pretty` to output necessary info. 56 | 57 | ## Update final traces 58 | Set arg `--option collect` to update the final traces. In the meanwhile, the average iteration time would be outputed. `--path` should be the root directory of a worker or a GPU. 59 | * `--sub_option iter_time`, only calculate the iteration time and FW+BW time 60 | * `--sub_option operator`, update operator traces based on the source files. 61 | * others, re-combine all traces based on the source files. 62 | 63 | ## `--option 3dcompare` 64 | Ignore partition id 65 | 66 | # Requirements 67 | pip3 packet: intervaltree, networkx, ujson, xlsxwriter, scapy 68 | -------------------------------------------------------------------------------- /data/tf_20200811_pred_error.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from matplotlib.ticker import (AutoMinorLocator, MultipleLocator) 4 | 5 | ax = plt.subplot(111) 6 | 7 | # all_data[label id][test error for 100 times of repeated test] 8 | all_data = [ 9 | [13.956024, 14.433242, 36.365207, 10.380671, 13.353506, 50.151516, 11.076908, 12.229296, 13.197254, 44.824227, 35.116051, 64.394852, 10.480341, 13.720852, 88.557334, 37.497621, 11.776002, 11.423172, 55.909996, 16.600878, 14.260358, 13.863314, 48.258129, 15.427159, 9.433073, 11.831115, 10.212262, 12.354979, 43.799844, 12.839599, 56.183875, 11.828298, 12.073554, 85.360527, 49.737677, 11.457030, 9.551553, 12.788189, 14.407476, 36.279821, 37.727431, 53.597884, 10.865854, 49.453092, 11.744117, 16.065677, 49.642372, 60.080919, 10.576669, 56.671448, 11.937124, 59.463291, 12.169872, 9.903194, 68.125121, 9.870735, 12.528896, 12.528278, 13.134355, 92.268916, 10.767118, 10.133862, 12.381869, 11.146730, 12.500672, 11.041856, 46.560807, 11.221893, 15.538326, 11.204281, 53.683452, 11.305852, 13.934167, 44.761666, 11.418703, 39.687794, 39.263057, 14.782713, 56.810139, 57.797041, 13.545492, 63.703707, 13.209352, 12.424941, 14.129963, 56.139439, 14.960313, 58.006742, 52.780320, 10.053853, 11.335702, 13.900809, 13.569368, 8.642230, 12.184775, 44.865501, 49.213126, 11.585204, 13.603950, 47.714876], 10 | [57.166523, 6.982943, 7.753159, 6.544062, 6.370150, 58.934236, 5.821519, 7.581446, 31.132802, 7.275806, 7.342372, 6.154925, 4.845664, 5.375899, 5.250347, 5.409458, 47.462595, 7.730537, 8.586070, 5.298019, 42.381557, 9.279349, 67.919465, 3.912002, 5.851241, 7.560528, 5.731278, 6.970734, 5.871561, 60.336510, 6.118150, 7.916642, 6.013825, 7.946192, 5.281653, 6.026306, 4.679249, 5.957760, 4.761488, 6.515127, 88.822437, 6.525346, 6.549746, 74.405183, 51.872324, 68.615514, 61.264099, 6.917063, 5.913056, 6.204503, 7.289733, 6.271309, 78.464292, 6.500042, 7.200059, 58.205104, 6.920534, 5.753805, 5.271359, 46.357106, 68.670515, 83.747194, 58.410471, 5.499968, 54.382253, 62.618250, 9.220569, 37.116831, 6.494756, 7.962453, 7.138617, 90.676043, 7.098806, 82.908148, 8.753192, 53.015609, 6.137569, 61.078094, 6.131989, 55.992310, 54.107132, 5.951071, 41.104941, 89.356646, 51.611572, 6.954993, 56.234897, 7.386588, 48.450425, 5.957338, 49.784691, 70.344788, 6.639423, 7.197308, 6.826844, 70.026041, 6.429870, 6.276725, 46.879217, 5.949097], 11 | 12 | [10.515195, 45.434328, 51.414588, 15.402890, 9.034116, 55.602559, 69.823075, 9.650470, 10.743006, 9.906061, 13.349366, 12.244130, 10.966927, 10.924287, 13.885874, 12.222774, 65.770478, 10.316878, 39.384284, 14.644657, 11.249421, 41.947391, 15.741479, 8.898329, 12.156129, 14.583042, 9.001215, 11.812949, 
9.541513, 12.339056, 11.040557, 11.452073, 12.472323, 9.783736, 57.167140, 40.489246, 43.608511, 61.649401, 12.363932, 94.788833, 8.643454, 44.381189, 9.933129, 60.882352, 14.526221, 6.390452, 13.461222, 11.654321, 52.283489, 11.649394, 11.015001, 7.754916, 10.947251, 11.494571, 9.674303, 12.610179, 42.955850, 9.670831, 13.405886, 9.058590, 11.744207, 13.501221, 43.761783, 12.304239, 10.588361, 12.464460, 69.354573, 9.949217, 43.991506, 13.569916, 11.333769, 9.802779, 10.881983, 11.404215, 62.910407, 12.273455, 8.903062, 10.182866, 12.851778, 10.193168, 42.471023, 37.978502, 74.606463, 12.347102, 10.987125, 13.633108, 15.169739, 12.006797, 13.912672, 12.773112, 17.140734, 12.890699, 9.391158, 54.936686, 12.315391, 9.843667, 12.330516, 11.492315, 17.803154, 13.282663], 13 | [6.957377, 64.313606, 56.916500, 45.012399, 5.190539, 6.840786, 57.126490, 53.727270, 7.356256, 6.667304, 6.476066, 5.256859, 6.906330, 70.462762, 7.660466, 7.012304, 7.090306, 6.105001, 6.178831, 66.952585, 61.379096, 7.649414, 7.092564, 5.626575, 5.648277, 64.937245, 7.898103, 5.554414, 7.115182, 6.907596, 9.395951, 5.095690, 43.717707, 6.670211, 39.997334, 5.986490, 3.661167, 69.984995, 4.068519, 5.391135, 5.804065, 66.306257, 34.389040, 5.428332, 6.674271, 58.584086, 5.627205, 36.384092, 88.157494, 6.184002, 5.053334, 6.944832, 6.302980, 7.521219, 7.880967, 53.725881, 44.218278, 6.332340, 7.813679, 6.771026, 6.560170, 7.177000, 4.961530, 9.494012, 66.770047, 4.043301, 4.791953, 6.563170, 5.435724, 5.899271, 5.110108, 7.188095, 5.768912, 65.809342, 7.354027, 6.562866, 4.582166, 5.480870, 6.541070, 36.262630, 6.259378, 6.305899, 51.754131, 6.250982, 6.641401, 35.641656, 9.512910, 4.657524, 7.552413, 61.726023, 8.057250, 53.622812, 7.819604, 5.348360, 5.155510, 75.192289, 59.524181, 69.975642, 5.468333, 5.369140], 14 | 15 | [15.341629, 14.678412, 14.665693, 13.633613, 13.982519, 11.178406, 13.697694, 10.461185, 7.996832, 58.611549, 7.431059, 11.076401, 13.597231, 13.139524, 13.495966, 10.273490, 11.452367, 10.167614, 11.316935, 12.724643, 11.802032, 12.646366, 18.280768, 12.672976, 63.965102, 10.045398, 14.369920, 9.424832, 9.887030, 15.149035, 50.458433, 11.725453, 68.129204, 11.349208, 8.817826, 12.674762, 11.424591, 9.955086, 15.008888, 13.119296, 53.527772, 12.351472, 11.709062, 10.334177, 14.370902, 11.150850, 49.790847, 11.347105, 12.541186, 11.364660, 8.964185, 13.050379, 13.894001, 12.638281, 12.705963, 10.696781, 12.950993, 12.468188, 10.024792, 11.244612, 8.306938, 14.698039, 52.269719, 11.344778, 10.537424, 12.616367, 11.841470, 16.306481, 9.616387, 6.608985, 31.705475, 51.275058, 51.768775, 6.900935, 5.397402, 35.518681, 10.463089, 50.515297, 10.941296, 15.312960, 9.595655, 10.641053, 12.715098, 51.306539, 11.187088, 12.041390, 12.267792, 9.327741, 13.728392, 9.170088, 15.267901, 46.450641, 9.717752, 9.947808, 11.213984, 10.757838, 13.362415, 11.037630, 12.577730, 12.897338], 16 | [7.313773, 7.248264, 5.067444, 8.842977, 7.608721, 8.653894, 62.134383, 5.102347, 5.519440, 81.937181, 5.452010, 8.626065, 8.645136, 7.088186, 5.837918, 54.474982, 5.189909, 6.546465, 4.629266, 70.000192, 9.395889, 4.155231, 5.089076, 4.218533, 74.919280, 6.717917, 53.014032, 3.969286, 8.605599, 4.849044, 60.630324, 65.410434, 6.596810, 62.020646, 71.857941, 5.895819, 68.392247, 65.461754, 3.318386, 5.016175, 8.190272, 57.106412, 4.494780, 7.278212, 6.401423, 7.194343, 6.375702, 4.365551, 64.258294, 7.937419, 6.915316, 7.233481, 4.928482, 6.972867, 5.505200, 6.332903, 6.686514, 8.309472, 4.569678, 4.482371, 6.715820, 57.047777, 
7.832438, 6.657865, 4.348972, 70.160085, 41.008560, 3.262324, 7.509626, 4.000181, 5.597471, 6.570225, 9.739884, 5.706723, 7.667952, 6.558006, 65.735783, 9.167421, 55.175922, 7.146803, 5.051127, 7.391683, 3.779323, 97.722961, 51.246519, 3.141432, 4.814731, 5.119214, 7.471215, 69.127114, 56.380045, 6.883611, 6.874813, 4.792079, 3.996554, 7.456756, 7.623711, 34.324562, 6.483892, 6.905238], 17 | 18 | [7.588937, 9.964588, 5.913335, 14.336912, 13.438318, 64.998050, 14.631312, 11.699449, 12.018943, 17.035852, 33.823215, 14.634302, 10.590887, 6.548417, 6.542256, 52.161803, 11.575169, 4.272291, 10.748003, 13.584780, 10.553002, 7.793081, 8.634919, 10.588455, 10.426559, 25.592627, 6.528304, 6.643721, 12.551807, 6.212011, 12.035708, 11.079625, 11.048117, 6.524898, 9.812931, 11.548364, 13.711194, 17.094585, 10.433940, 9.024216, 13.412596, 14.670297, 52.645683, 18.038336, 16.261581, 18.772533, 9.714485, 8.775804, 10.928841, 12.796661, 14.090557, 6.190334, 11.486405, 12.780759, 6.899154, 14.491948, 13.350100, 13.927349, 10.942886, 13.221856, 55.177635, 8.480024, 13.393132, 5.715504, 40.622648, 11.076411, 13.566425, 9.729145, 17.015641, 10.556499, 16.986986, 7.519732, 5.639839, 15.001418, 7.281235, 10.256605, 12.889487, 4.466114, 72.670157, 12.998940, 12.355639, 63.626501, 9.878510, 11.465503, 8.883998, 12.721210, 17.134859, 64.329050, 13.715173, 7.411055, 10.270259, 13.054703, 42.645570, 11.476296, 13.285420, 7.498828, 13.339854, 10.922121, 7.470640, 13.386343], 19 | [7.070935, 5.232053, 4.053272, 62.656732, 5.062426, 9.154989, 7.826296, 75.240219, 59.411407, 4.992232, 76.335673, 10.298458, 5.990009, 6.480765, 56.082957, 7.455960, 75.049727, 5.104337, 10.104645, 8.023306, 5.274468, 5.660575, 41.476173, 4.449734, 4.155806, 54.748757, 79.477174, 4.416540, 5.932159, 6.119858, 6.648595, 7.458493, 6.435249, 4.800734, 14.987605, 10.404192, 6.928261, 6.226757, 55.388482, 8.294353, 5.280141, 11.410110, 63.670893, 53.467722, 5.892295, 11.209773, 5.881369, 3.327046, 3.848187, 7.901931, 8.173404, 2.997336, 2.130338, 10.497846, 10.303912, 5.364018, 47.910314, 44.345466, 4.803757, 3.830554, 8.968367, 3.300144, 51.769722, 6.027482, 5.917627, 12.123815, 9.938891, 7.154330, 56.591873, 77.392856, 52.516691, 8.520153, 7.513137, 8.637736, 5.819892, 1.602784, 66.231463, 4.927559, 7.980261, 4.327471, 7.357223, 6.400759, 2.846838, 4.951876, 11.456527, 9.133238, 7.623498, 5.560661, 2.278189, 32.366328, 5.101368, 7.557821, 51.718406, 8.412207, 7.562392, 4.207905, 85.316843, 2.467279, 9.525035, 3.514291], 20 | 21 | [5.802008, 8.326228, 8.165158, 16.077765, 11.404562, 20.139854, 15.238530, 10.588925, 12.392165, 11.837725, 7.154748, 5.875821, 10.407793, 11.690149, 8.048689, 16.255622, 15.526473, 11.459678, 12.052782, 14.200299, 4.264560, 88.325601, 14.532483, 16.565511, 17.438027, 8.135357, 3.824198, 3.467835, 10.596049, 18.631508, 14.997960, 31.864815, 14.841692, 8.964419, 16.878293, 6.764996, 14.238110, 25.446969, 10.016874, 12.221086, 12.377311, 9.950374, 18.485064, 16.869409, 19.702920, 3.454878, 12.833186, 5.692901, 11.024399, 13.564892, 19.688321, 10.989238, 41.383812, 62.427356, 4.397385, 19.694828, 21.198388, 13.547000, 14.133338, 10.734672, 25.351968, 13.517704, 21.242162, 11.340485, 5.907095, 12.182404, 11.363229, 14.015253, 60.685415, 17.550834, 13.884081, 14.959678, 2.485946, 14.537648, 9.575146, 15.711994, 8.527022, 13.957632, 13.892153, 2.227685, 33.477207, 9.593510, 5.948852, 6.847351, 10.919294, 4.699008, 40.898343, 8.204610, 16.438709, 3.779228, 4.759139, 10.056058, 8.071077, 8.019649, 15.380003, 2.677506, 
14.841307, 10.080636, 15.473311, 4.767401], 22 | [43.318922, 6.468859, 9.913257, 55.903135, 7.152742, 6.173400, 2.891720, 4.458493, 27.375968, 10.632708, 5.867643, 64.593774, 4.494313, 7.137831, 1.818833, 5.329690, 8.931935, 8.405413, 1.048821, 65.639816, 4.528360, 8.039385, 6.093105, 11.571390, 62.971126, 5.784095, 51.295974, 25.464415, 6.198689, 74.464807, 57.764465, 1.976959, 2.721981, 6.619558, 2.583216, 5.255425, 2.634106, 36.394218, 6.374182, 74.863759, 75.432708, 59.118688, 1.349001, 51.663599, 94.537953, 4.214529, 41.511808, 6.025560, 2.026080, 4.719750, 2.235685, 54.522440, 3.547660, 6.277774, 6.484477, 69.144590, 91.105996, 26.620359, 6.041705, 8.359846, 4.649439, 11.478467, 33.835395, 49.054471, 62.870999, 0.866062, 8.030661, 1.979249, 18.575960, 5.919261, 7.368157, 5.224024, 52.748799, 10.431567, 5.649330, 6.407153, 18.211424, 5.390051, 4.327642, 6.499954, 6.056569, 0.409378, 13.968299, 13.279833, 28.487376, 74.997135, 63.198839, 38.402265, 0.731686, 56.470007, 1.779763, 3.175988, 4.355366, 13.662878, 10.410555, 62.581137, 2.014510, 79.898078, 3.744049, 1.151870], 23 | 24 | # with threshold B > 4 25 | ] 26 | 27 | labels = [ 28 | 'no threshold\nTrain:Test=6:4', 'B>4\nTrain:Test=6:4', 29 | 'no threshold\nTrain:Test=7:3', 'B>4\nTrain:Test=7:3', 30 | 'no threshold\nTrain:Test=8:2', 'B>4\nTrain:Test=8:2', 31 | 'no threshold\nTrain:Test=9:1', 'B>4\nTrain:Test=9:1', 32 | 'no threshold\nTrain:Test=95:5', 'B>4\nTrain:Test=95:5' 33 | ] 34 | 35 | bplot = ax.boxplot(all_data[:2*4], patch_artist=True, labels=labels[:2*4]) 36 | plt.title('Evaluate the cost model for AMP with Conv2D') 37 | 38 | # colors = ['pink', 'lightblue', 'lightgreen'] 39 | # for patch, color in zip(bplot['boxes'], colors): 40 | # patch.set_facecolor(color) 41 | 42 | ax.yaxis.set_major_locator(MultipleLocator(10)) 43 | ax.yaxis.grid(True, which="both") 44 | # plt.xlabel('Three separate samples') 45 | plt.ylabel('Prediction Error (%)') 46 | plt.show() -------------------------------------------------------------------------------- /docker/tensorflow.Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================= 15 | 16 | FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 17 | 18 | # RUN rm -f /tmp/pip.conf &&\ 19 | # echo -e '[global]\nindex-url = https://pypi.douban.com/simple' >> /tmp/pip.conf 20 | 21 | ENV USE_CUDA_PATH=/usr/local/cuda:/usr/local/cudnn/lib64 \ 22 | PATH=/usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} \ 23 | OLD_LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH \ 24 | LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$OLD_LD_LIBRARY_PATH \ 25 | LIBRARY_PATH=/usr/local/lib:/usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/nccl/lib/:$LIBRARY_PATH 26 | 27 | ENV BYTEPS_SERVER_MXNET_LINK=https://github.com/joapolarbear/incubator-mxnet.git \ 28 | MXNET_BUILD_OPTS="USE_OPENCV=1 \ 29 | USE_BLAS=openblas \ 30 | USE_CUDNN=1 \ 31 | USE_CUDA=1 \ 32 | USE_CUDA_PATH=/usr/local/cuda \ 33 | USE_MKLDNN=0 \ 34 | USE_DIST_KVSTORE=1 \ 35 | USE_NCCL=1 \ 36 | USE_NCCL_PATH=/usr/local/nccl" \ 37 | BYTEPS_BASE_PATH=/usr/local \ 38 | BYTEPS_PATH=${BYTEPS_BASE_PATH}/byteps 39 | 40 | # ----------------------------- Install dependencies ----------------------------- 41 | RUN apt-get update && \ 42 | apt-get install -y software-properties-common && \ 43 | add-apt-repository ppa:ubuntu-toolchain-r/test && \ 44 | add-apt-repository ppa:deadsnakes/ppa && \ 45 | apt-get update && \ 46 | apt-get install -y --allow-unauthenticated --allow-downgrades --allow-change-held-packages --no-install-recommends --fix-missing \ 47 | build-essential \ 48 | ca-certificates \ 49 | git \ 50 | curl \ 51 | wget \ 52 | vim \ 53 | libopenblas-dev \ 54 | liblapack-dev \ 55 | libopencv-dev \ 56 | python \ 57 | python-pip \ 58 | python-dev \ 59 | python-setuptools \ 60 | libjemalloc-dev \ 61 | graphviz \ 62 | cmake \ 63 | libjpeg-dev \ 64 | libpng-dev \ 65 | iftop \ 66 | lsb-release \ 67 | libnuma-dev \ 68 | gcc-4.9 \ 69 | g++-4.9 \ 70 | gcc-4.9-base \ 71 | gcc-7 \ 72 | g++-7 \ 73 | python3.7 \ 74 | python3.7-dev \ 75 | python3-pip \ 76 | python3-setuptools \ 77 | ssh \ 78 | librdmacm-dev \ 79 | zip unzip 80 | 81 | ### pin python3 version to 3.7 82 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 10 83 | 84 | RUN python -m pip install --upgrade pip && \ 85 | pip --no-cache-dir install \ 86 | matplotlib \ 87 | numpy==1.15.2 \ 88 | scipy \ 89 | sklearn \ 90 | pandas \ 91 | graphviz==0.9.0 \ 92 | mxboard \ 93 | tensorboard==1.0.0a6 \ 94 | networkx 95 | 96 | RUN python3 -m pip install --upgrade pip && \ 97 | python3 -m pip install Cython && \ 98 | python3 -m pip install --upgrade --force-reinstall setuptools && \ 99 | python3 -m pip --no-cache-dir install \ 100 | wheel \ 101 | matplotlib \ 102 | numpy==1.17.2 \ 103 | pandas \ 104 | mxboard \ 105 | XlsxWriter \ 106 | cvxopt \ 107 | cvxpy \ 108 | intervaltree \ 109 | networkx==2.5 \ 110 | protobuf \ 111 | scapy \ 112 | scipy \ 113 | scikit-learn \ 114 | tqdm \ 115 | ujson \ 116 | setuptools 117 | 118 | WORKDIR /root/ 119 | 120 | RUN git clone https://github.com/NVIDIA/cuda-samples.git 121 | 122 | # ----------------------------- Install OpenMPI 4.0.3 ----------------------------- 123 | RUN wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz && \ 124 | tar -xvf openmpi-* && cd openmpi-* && \ 125 | ./configure --prefix="/usr" && \ 126 | make -j && make all install && \ 127 | ln -sf /home/$USER/.openmpi/bin/* 
/usr/bin/ 128 | 129 | # ----------------------------- Install NCCL ----------------------------- 130 | RUN git clone --recurse-submodules -b byteprofile https://github.com/joapolarbear/nccl.git && \ 131 | cd /root/nccl && make -j src.build && make pkg.txz.build && \ 132 | mkdir -p /usr/local/nccl && \ 133 | tar -Jxf ./build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 && \ 134 | echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 135 | ldconfig && ln -sf /usr/local/nccl/include/* /usr/include/ 136 | 137 | # ----------------------------- Install MXNet ----------------------------- 138 | 139 | ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH 140 | 141 | RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 142 | 143 | RUN git clone --single-branch --branch byteprofile --recurse-submodules $BYTEPS_SERVER_MXNET_LINK customized-mxnet && \ 144 | cd /root/customized-mxnet && \ 145 | make clean_all && make -j16 $MXNET_BUILD_OPTS 146 | 147 | #! python3 required 148 | RUN python3 -m pip --no-cache-dir install numpy==1.17.2 && \ 149 | cd /root/customized-mxnet/python && \ 150 | python3 setup.py build && \ 151 | python3 setup.py install && \ 152 | python3 setup.py bdist_wheel && \ 153 | cd && MX_PATH=`python3 -c "import mxnet; path=str(mxnet.__path__); print(path.split(\"'\")[1])"` && \ 154 | ln -sf /root/customized-mxnet/include $MX_PATH/include && echo $MX_PATH 155 | 156 | # ----------------------------- Install Tensorflow ----------------------------- 157 | ### install bazel 158 | RUN python3 -m pip --no-cache-dir install keras_applications --no-deps \ 159 | keras_preprocessing --no-deps \ 160 | h5py --no-deps 161 | 162 | RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.7.4/bazelisk-linux-amd64 && \ 163 | chmod +x bazelisk-linux-amd64 && \ 164 | mv bazelisk-linux-amd64 /usr/local/bin/bazel 165 | 166 | # RUN ln -sf /usr/local/cuda/lib64/libcupti.so /usr/local/cuda/lib64/libcupti.so.10.0 && \ 167 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudart.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudart.so.10.0 && \ 168 | # ln -sf /usr/lib/x86_64-linux-gnu/libcublas.so /usr/lib/x86_64-linux-gnu/libcublas.so.10.0 && \ 169 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcufft.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcufft.so.10.0 && \ 170 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcurand.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcurand.so.10.0 && \ 171 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcusolver.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcusolver.so.10.0 && \ 172 | # ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcusparse.so /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcusparse.so.10.0 173 | 174 | ### pin gcc and g++ version to 7 175 | # RUN update-alternatives --remove-all gcc && \ 176 | # update-alternatives --remove-all g++ && \ 177 | # update-alternatives --remove-all x86_64-linux-gnu-gcc && \ 178 | # update-alternatives --remove-all x86_64-linux-gnu-g++ 179 | 180 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 10 && \ 181 | update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 20 && \ 182 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 10 && \ 183 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-7 20 && \ 184 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 10 && \ 185 | 
update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-7 20 && \ 186 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 10 && \ 187 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-7 20 188 | 189 | RUN update-alternatives --set gcc /usr/bin/gcc-7 && \ 190 | update-alternatives --set g++ /usr/bin/g++-7 && \ 191 | update-alternatives --set x86_64-linux-gnu-gcc /usr/bin/gcc-7 && \ 192 | update-alternatives --set x86_64-linux-gnu-g++ /usr/bin/g++-7 193 | 194 | ENV BPF_TENSORFLOW_LINK=https://github.com/chenyu-jiang/tensorflow.git 195 | 196 | ENV PYTHON_BIN_PATH="/usr/bin/python3" 197 | ENV USE_DEFAULT_PYTHON_LIB_PATH=1 198 | ENV TF_ENABLE_XLA=1 199 | ENV TF_NEED_HDFS=0 200 | 201 | ### Download and build tensorflow 202 | RUN ln -sf /usr/bin/python3 /usr/bin/python && \ 203 | git clone --single-branch --branch r1.15 --recurse-submodules ${BPF_TENSORFLOW_LINK} && \ 204 | cd tensorflow && /usr/local/bin/bazel build -j auto --config=cuda //tensorflow/tools/pip_package:build_pip_package && \ 205 | ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg && \ 206 | pip3 --no-cache-dir install $(ls /tmp/tensorflow_pkg/tensorflow-1.15.4*) && \ 207 | chmod +x build_bpf_tf_modules.sh && \ 208 | ./build_bpf_tf_modules.sh 209 | 210 | ENV BPF_TF_PATH=$(pwd) 211 | 212 | # ----------------------------- Install BytePS ----------------------------- 213 | 214 | RUN cd /usr/lib/python3/dist-packages && ln -s $(ls apt_pkg.cpython-*-linux-gnu.so) apt_pkg.so && \ 215 | cd ${WORKDIR} 216 | 217 | RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null \ 218 | | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ 219 | apt-add-repository 'deb https://apt.kitware.com/ubuntu/ xenial main' && \ 220 | apt-get update && \ 221 | apt-get install cmake -y 222 | 223 | RUN git clone https://github.com/gabime/spdlog.git && \ 224 | cd spdlog && mkdir build && cd build && \ 225 | cmake .. && make -j && make install 226 | 227 | ENV BPF_BYTEPS_LINK=https://github.com/chenyu-jiang/byteps.git 228 | 229 | #! Install BytePS 230 | RUN cd /usr/local && \ 231 | git clone --single-branch --branch byteprofile --recurse-submodules ${BPF_BYTEPS_LINK} && \ 232 | cd byteps && \ 233 | BYTEPS_WITHOUT_PYTORCH=1 python3 setup.py install 234 | 235 | # ----------------------------- Install Horovod ----------------------------- 236 | RUN cd /usr/local && \ 237 | git clone --recurse-submodules -b byteprofile https://github.com/joapolarbear/horovod && \ 238 | cd /usr/local/horovod && python3 setup.py sdist && \ 239 | HOROVOD_NCCL_HOME=/usr/local/nccl \ 240 | HOROVOD_GPU_ALLREDUCE=NCCL \ 241 | HOROVOD_GPU_BROADCAST=NCCL \ 242 | HOROVOD_WITH_MPI=1 \ 243 | HOROVOD_WITH_TENSORFLOW=1 \ 244 | HOROVOD_WITHOUT_PYTORCH=1 \ 245 | HOROVOD_WITH_MXNET=1 pip3 install --no-cache-dir dist/horovod* && \ 246 | cp -r /usr/local/horovod/examples /root/horovod_examples 247 | 248 | # ----------------------------- Install gluon-nlp ----------------------------- 249 | RUN git clone -b bert-byteprofile https://github.com/joapolarbear/gluon-nlp.git && \ 250 | cd gluon-nlp && python3 setup.py install && \ 251 | mkdir -p /root/.mxnet/models && \ 252 | cd /root/.mxnet/models && \ 253 | wget https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip && unzip -o *.zip 254 | 255 | ### Set the environment for developing. 
256 | ENV LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH \ 257 | BYTEPS_TRACE_ON=1 \ 258 | BYTEPS_TRACE_END_STEP=30 \ 259 | BYTEPS_TRACE_START_STEP=10 \ 260 | BYTEPS_TRACE_DIR=/root/traces \ 261 | MXNET_GPU_WORKER_NTHREADS=1 \ 262 | MXNET_EXEC_BULK_EXEC_TRAIN=0 263 | 264 | # # -------- install byteprofile analysis 265 | # RUN git clone --recurse-submodules https://github.com/joapolarbear/byteprofile-analysis.git && \ 266 | # cd byteprofile-analysis && python3 setup.py install 267 | 268 | ### Sample command to start the docker -------------------------------------------------------------------------------- /docs/backup.md: -------------------------------------------------------------------------------- 1 | # Commands for dPRO 2 | dPRO can now be installed with `bash setup.sh`; the following commands are kept for archival purposes. 3 | 4 | ## Install dPRO 5 | ``` 6 | cd ${HOME}/ 7 | rm -rf dpro 8 | git clone https://github.com/joapolarbear/dpro.git 9 | cd dpro && sudo bash setup.sh 10 | ``` 11 | ## Debug mode 12 | ``` 13 | pip3 install -e $HOME/ws/git/dpro 14 | ``` 15 | --- 16 | 17 | ## Reinstall customized TF 18 | ``` 19 | wget https://github.com/joapolarbear/tensorflow/releases/download/v2.4.1-dev.2.0.2/tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl 20 | pip3 --no-cache-dir install --force-reinstall tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl 21 | ``` 22 | 23 | ## RUN 24 | ``` 25 | export HOROVOD_FUSION_THRESHOLD="${HOROVOD_FUSION_THRESHOLD:-67108864}" 26 | export HOROVOD_CYCLE_TIME="${HOROVOD_CYCLE_TIME:-0}" 27 | export HOROVOD_LOG_LEVEL="${HOROVOD_LOG_LEVEL:-warning}" 28 | export NCCL_DEBUG="${NCCL_DEBUG:-INFO}" 29 | export NCCL_DEBUG_SUBSYS="${NCCL_DEBUG_SUBSYS:-INIT}" 30 | export NCCL_ALGO="${NCCL_ALGO:-Ring}" 31 | 32 | export HOROVOD_FUSION_THRESHOLD=0 33 | export HOROVOD_CYCLE_TIME=5 34 | 35 | bash mpirun.sh python3 $HOME/horovod_examples/tensorflow/tensorflow_synthetic_benchmark.py --model VGG16 --num-iters 5 36 | 37 | bash mpirun.sh nsys profile -o 1ib_overlap_xlaoff_gpu%q{OMPI_COMM_WORLD_RANK}.qdrep python3 $HOME/horovod_examples/tensorflow/tensorflow_synthetic_benchmark.py --model VGG16 --num-iters 5 38 | 39 | TF_XLA_FLAGS=--tf_xla_auto_jit=2 40 | 41 | for (( id=0; id < 8; id++ )); do 42 | python3 $HOME/nvprof2json/nvprof2json.py --filename $HOME/global_traces/host0/simple.${id}.nvprof --filter CUPTI_ACTIVITY_KIND_MEMCPY,CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL > $HOME/global_traces/rank${id}.json 43 | done 44 | for (( id=8; id < 16; id++ )); do 45 | python3 $HOME/nvprof2json/nvprof2json.py --filename $HOME/global_traces/host1/simple.${id}.nvprof --filter CUPTI_ACTIVITY_KIND_MEMCPY,CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL > $HOME/global_traces/rank${id}.json 46 | done 47 | ``` 48 | 49 | --- 50 | ## Train the XLA cost model 51 | ``` 52 | cd ${HOME}/ 53 | rm -rf dpro 54 | git clone https://github.com/joapolarbear/dpro.git 55 | cd dpro && sudo bash setup.sh 56 | 57 | cd ${HOME}/ 58 | wget https://github.com/joapolarbear/tensorflow/releases/download/v2.4.1-dev.2.0.1/dpro_xla_tools.zip 59 | unzip dpro_xla_tools.zip 60 | export BPF_TF_PATH=${HOME}/dpro_xla_tools 61 | sudo ln -sf /usr/local/lib/python3.7/dist-packages/tensorflow/libtensorflow_framework.so.2 /usr/lib/ 62 | ``` 63 | 64 | ## The GPU id to run profiling on (specify one GPU only) 65 | ``` 66 | export BPF_COST_MODEL_PROFILE_GPU="0" 67 | export CUDA_VISIBLE_DEVICES=0 68 | 69 | cd ${HOME} 70 | COMM_BACKEND_LAUNCHER="python3 /usr/local/byteps/launcher/launch.py python3 test.py --comm_backend bps" 71 | COMM_BACKEND_LAUNCHER="horovod -np 1 python3 test.py" 72 | ``` 73 | ### RUN 
74 | ``` 75 | $COMM_BACKEND_LAUNCHER 76 | ALL_TRACE_DIR=${HOME}/trace_dirs_vgg16 77 | mv $HOME/traces $ALL_TRACE_DIR 78 | 79 | export XLA_DUMP_DIR=${HOME}/xla_dump 80 | mkdir -p $XLA_DUMP_DIR 81 | TF_DUMP_GRAPH_PREFIX=${XLA_DUMP_DIR} TF_XLA_FLAGS="--tf_xla_clustering_debug --tf_xla_auto_jit=2" $COMM_BACKEND_LAUNCHER 82 | 83 | export DPRO_GRAPHDEF_DFG_PATH=${XLA_DUMP_DIR}/graphdef_dag.gml 84 | export TRACE_DIR=$ALL_TRACE_DIR/0 85 | export OUTPUT_DIR="${HOME}/xla_vgg16" 86 | mkdir -p $OUTPUT_DIR 87 | 88 | NUM_RANDOM_SAMPLES=5000 89 | MAX_CLUSTER_SAMPLES=5 90 | MIN_CLUSTER_SIZE=4 91 | MAX_CLUSTER_SIZE=800 92 | 93 | cd ${HOME}/dpro 94 | python3 xla_cm_entry.py --mode 0 \ 95 | --trace_dir ${TRACE_DIR} \ 96 | --output_dir ${OUTPUT_DIR} \ 97 | --num_samples ${NUM_RANDOM_SAMPLES} \ 98 | --max_cluster_samples ${MAX_CLUSTER_SAMPLES} \ 99 | --min_cluster_size ${MIN_CLUSTER_SIZE} \ 100 | --max_cluster_size ${MAX_CLUSTER_SIZE} \ 101 | --batch_size 256 102 | 103 | ``` 104 | ## Test the searched results 105 | ``` 106 | hdfs dfs -rm -r /usr/hphu/search_rst && hdfs dfs -mkdir /usr/hphu/search_rst 107 | 108 | function put_spec_to_hdfs { 109 | hdfs dfs -put $1/spec /usr/hphu/search_rst/$1_spec 110 | } 111 | 112 | put_spec_to_hdfs 20210929_01_bps_tf_resnet50_tcp_2w8g2s_tsfs_tspart_optws 113 | 114 | hdfs dfs -ls /usr/hphu/search_rst 115 | ``` 116 | 117 | 118 | # BytePS 119 | 120 | ## Reinstall BytePS 121 | ``` 122 | cd /usr/local/byteps && git pull && git submodule update 123 | cd /usr/local/byteps/3rdparty/ps-lite && make clean && make -j USE_RDMA=1 && \ 124 | cd /usr/local/byteps/ && rm -rf build && \ 125 | BYTEPS_USE_RDMA=1 BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_MXNET=1 python3 setup.py install 126 | ``` 127 | ### Test BytePS 128 | ``` 129 | export DMLC_ROLE=scheduler 130 | export DMLC_ROLE=worker 131 | export DMLC_WORKER_ID=0 132 | 133 | export DMLC_NUM_WORKER=2 134 | export DMLC_NUM_SERVER=1 135 | export DMLC_PS_ROOT_URI=10.129.120.196 136 | export DMLC_PS_ROOT_PORT=8008 137 | 138 | unset NCCL_ALGO 139 | unset NCCL_DEBUG_SUBSYS 140 | unset NCCL_DEBUG 141 | unset NCCL_TRACE_START_STEP 142 | unset NCCL_TRACE_DIR 143 | unset NCCL_TRACE_END_STEP 144 | unset NCCL_ENABLE_TIMELINE 145 | export BYTEPS_LOG_LEVEL=INFO 146 | 147 | cd $HOME/bert && sudo git checkout b_tf2_4 148 | python3 /usr/local/byteps/launcher/launch.py python3 $HOME/bert/run_pretraining.py 149 | ``` 150 | 151 | # NCCL Contention Test 152 | ``` 153 | rm -rf /usr/local/nccl 154 | pip3 uninstall -y horovod 155 | 156 | cd /usr/local && git clone https://github.com/NVIDIA/nccl.git 157 | cd /usr/local/nccl && git checkout v2.10.3-1 158 | rm -rf /usr/include/nccl.h 159 | 160 | make -j src.build && make pkg.txz.build 161 | tar -Jxf ./build/pkg/txz/nccl*.txz -C /usr/local/nccl/ --strip-components 1 162 | echo "/usr/local/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 163 | ldconfig && ln -sf /usr/local/nccl/include/* /usr/include/ 164 | 165 | HOROVOD_NCCL_HOME=/usr/local/nccl \ 166 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL \ 167 | HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 \ 168 | pip3 install --no-cache-dir horovod==0.21.0 169 | ``` 170 | 171 | -------------------------------------------------------------------------------- /docs/dependency.md: -------------------------------------------------------------------------------- 1 | 2 | # 3rdparty version 3 | 4 | ## Frameworks 5 | * [MXNet](https://github.com/joapolarbear/incubator-mxnet/tree/mlsys2022) 6 | * 
[TensorFlow](https://github.com/joapolarbear/tensorflow/tree/mlsys2022) 7 | * [BytePS](https://github.com/joapolarbear/byteps/tree/mlsys2022) 8 | * [pslite](https://github.com/joapolarbear/ps-lite/tree/mlsys2022) 9 | * [ZMQ](https://github.com/chenyu-jiang/libzmq/commit/5ed25589f000dc613e1a8575ba193eb78eb9b86e) 10 | * [Horovod](https://github.com/joapolarbear/horovod/tree/mlsys2022) 11 | * [NCCL](https://github.com/joapolarbear/nccl/tree/mlsys2022) 12 | 13 | 14 | ## Benchmarks 15 | * [BERT](https://github.com/joapolarbear/bert/tree/mlsys2022) 16 | * [gluon-nlp](https://github.com/joapolarbear/gluon-nlp/tree/mlsys2022) 17 | 18 | ## Tools 19 | * [spdlog](https://github.com/gabime/spdlog/commit/6aafa89d20eef25ec75462ffb7eedc328f135638) 20 | * [nvprof2json](https://github.com/joapolarbear/nvprof2json): convert nvprof results to JSON format 21 | * [catapult](https://github.com/joapolarbear/catapult): convert JSON files to an HTML page in the chrome://tracing format. 22 | 23 | 24 | # Installation 25 | 26 | ## TensorFlow 27 | 28 | You can install our pre-compiled version of TensorFlow if you are using Python 3.7: 29 | ``` 30 | wget https://github.com/joapolarbear/tensorflow/releases/download/v2.4.1-dev.2.0.2/tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl && \ 31 | pip3 --no-cache-dir install --force-reinstall tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl && \ 32 | rm tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl 33 | ``` 34 | 35 | Or you can build our customized TensorFlow yourself using bazel. First, clone our customized TensorFlow: 36 | ``` 37 | git clone --recurse-submodules -b r2.4_dev https://github.com/joapolarbear/tensorflow.git 38 | cd tensorflow 39 | ``` 40 | Then you need to configure the build process; if you are using Python 3.7 and CUDA 11, you can use our configuration file: 41 | ``` 42 | cp tools/sample_config/cuda11.3_python3.7 .tf_configure.bazelrc 43 | ``` 44 | Install dependencies: 45 | ``` 46 | pip3 install -U --user keras_applications --no-deps 47 | pip3 install -U --user keras_preprocessing --no-deps 48 | ``` 49 | Pin the default python to python3.7: 50 | ``` 51 | ln -sf /usr/bin/python3 /usr/bin/python 52 | ``` 53 | 54 | Then, follow the commands below to build and install TensorFlow. 
55 | ``` 56 | cd /root/tensorflow && bazel build -j 32 --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package 57 | ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg 58 | ls -lh /tmp/tensorflow_pkg 59 | pip3 --no-cache-dir install --force-reinstall /tmp/tensorflow_pkg/tensorflow-2.4.1-cp37-cp37m-linux_x86_64.whl 60 | bazel clean && ln -sf /usr/bin/python2.7 /usr/bin/python && rm -rf /tmp/tensorflow_pkg/* 61 | rm -rf tensorflow && rm -rf /var/lib/apt/lists/* 62 | ``` 63 | 64 | ## MXNet 65 | ``` 66 | cd customized-mxnet 67 | make clean_all && make -j16 USE_OPENCV=1 \ 68 | USE_BLAS=openblas \ 69 | USE_CUDNN=1 \ 70 | USE_CUDA=1 \ 71 | USE_CUDA_PATH=/usr/local/cuda \ 72 | USE_MKLDNN=0 \ 73 | USE_DIST_KVSTORE=1 \ 74 | USE_NCCL=1 \ 75 | USE_NCCL_PATH=/usr/local/nccl 76 | cd python 77 | python3 setup.py build 78 | python3 setup.py install 79 | python3 setup.py bdist_wheel 80 | ln -sf /usr/local/cuda-10.2/targets/x86_64-linux/lib/stubs/libcuda.so /usr/local/cuda/lib64/libcuda.so.1 81 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 82 | ln -sf /root/customized-mxnet/include $MX_PATH/include && echo $MX_PATH 83 | ``` 84 | 85 | ## BytePS + pslite + ZMQ 86 | ``` 87 | cd $HOME && git clone https://github.com/gabime/spdlog.git 88 | cd $HOME/spdlog && mkdir build && cd build && cmake .. && make -j && make install 89 | cd $HOME && git clone --single-branch --branch byteprofile_rdma --recurse-submodules https://github.com/joapolarbear/byteps.git 90 | cd $HOME/byteps/3rdparty/ps-lite && make -j USE_RDMA=1 91 | cd $HOME/byteps/ 92 | BYTEPS_USE_RDMA=1 BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_MXNET=1 python3 setup.py install 93 | BYTEPS_WITHOUT_PYTORCH=1 BYTEPS_WITHOUT_MXNET=1 python3 setup.py bdist_wheel 94 | ``` 95 | 96 | ## Horovod + NCCL 97 | Install OpenMPI first: 98 | ``` 99 | cd $HOME 100 | wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz 101 | rm -rf /usr/lib/x86_64-linux-gnu/openmpi 102 | tar -xvf openmpi-4.0.3.tar.gz && cd openmpi-4.0.3 103 | ./configure --prefix="/usr" 104 | make -j && make all install 105 | ``` 106 | 107 | Then install NCCL: 108 | ``` 109 | cd $HOME && git clone --recurse-submodules -b byteprofile https://github.com/joapolarbear/nccl.git 110 | rm -rf /usr/include/nccl.h 111 | cd $HOME/nccl && make -j src.build && make pkg.txz.build 112 | mkdir -p $HOME/nccl 113 | tar -Jxf ./build/pkg/txz/nccl*.txz -C $HOME/nccl/ --strip-components 1 114 | echo "$HOME/nccl/lib" >> /etc/ld.so.conf.d/nvidia.conf 115 | ldconfig && ln -sf $HOME/nccl/include/* /usr/include/ 116 | ``` 117 | 118 | And install Horovod: 119 | ``` 120 | cd $HOME && git clone --recurse-submodules -b b_v0.21.0 https://github.com/joapolarbear/horovod 121 | cd $HOME/horovod && python3 setup.py sdist 122 | pip3 install cloudpickle psutil pyyaml cffi==1.4.0 pycparser 123 | HOROVOD_NCCL_HOME=$HOME/nccl \ 124 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL \ 125 | HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 \ 126 | pip3 install --no-cache-dir dist/horovod* 127 | cp -r $HOME/horovod/examples $HOME/horovod_examples 128 | ``` -------------------------------------------------------------------------------- /docs/format.md: -------------------------------------------------------------------------------- 1 | # Format Specification 2 | 3 | ## Trace Format 4 | 5 | ```python 6 | ### Uniform template 7 | { 8 | "name": op_cat.op_name.sub_op, 9 | "ts": time_in_us, 10 | "dur": time_in_us, 11 | "pid": process_id, # e.g., host0.rank0 12 | "args": { 13 | "name": op_cat.op_name.sub_op~>suffix, 14 | ... 15 | "cnt": 2, # how many times the op occurs 16 | "step": 5, # which step the op is in, may be larger than "cnt" 17 | } 18 | } 19 | ``` 20 | 21 | 22 | - `op_cat` is one of `BW`, `FW`, `Comm`, `UPDATE_`; specially, `trace["name"] = UPDATE_...`, e.g., `UPDATE_CAL`, `UPDATE_0`, `UPDATE_1`, ..., and `trace["name"] = I/O_...` 23 | - `op_name` is the raw name profiled by the built-in profiler of each ML framework. 24 | - For `Comm`, `sub_op` can be `SEND` or `RECV`, and `suffix` can be `0_1_6_0` (for NCCL), denoting `loopId`, `channelId`, `chunkId`, and `sliceId` respectively. 25 | 26 | We call names that follow the format `op_cat.op_name.sub_op` `standard name`s, e.g., `Comm.gradient_1.SEND`. 27 | 28 | ### For communication traces 29 | The name should be a tensor index `tensor_id` or `tensor_id_1+tensor_id_2+...+tensor_id_n`, and the corresponding `tensor_name` should be stored in the `gradient_name_list` field in `/metadata.json`. 30 | 31 | ### Detailed communication traces 32 | `"comm_detail"` in `trace["tid"]` 33 | 34 | 35 | ## Trace Statistic Format 36 | ``` python 37 | name2sta = { 38 | op_long_name: { 39 | "avg": ... 40 | "cnt": 41 | "time": 42 | "min_t": 43 | "max_t": 44 | "id": , # the order the op is created in the dict 45 | "step_ids": [] # a list of indexes, denoting where this operator appears in the traces 46 | } 47 | 48 | op_long_name = event["pid"]->event["name"] 49 | or event["pid"]->event["name"]~>suffix 50 | ``` 51 | 52 | ## Dependency Graph 53 | Nodes: 54 | ```python 55 | op_long_name: { 56 | "avg": time_in_us, 57 | gap_string: time_in_us 58 | } 59 | ``` 60 | `gap_string` denotes different kinds of gaps. 61 | 62 | Special node `END`: the end node. 63 | 64 | ## NCCL Graph 65 | - During trace collection, the NCCL graph needs to parse at least one GPU's NCCL traces to get `chunkNum`, `sliceNum`, `channelNum`, `loopNum` for each `raw_name` (`op_cat.op_name`, without `sub_op`) 66 | - During trace collection, we need to parse `nccl_rank_graph.json` to get the connection information of this GPU. 67 | 68 | ## ParameterDict 69 | Manages the parameter info of a DNN model. We seek to implement a unified `ParameterDict`, but for now it only supports MXNet. 70 | 71 | ### MXNet 72 | Contains: 73 | - `gradient_name_list`, which maps `tensor_id` to `tensor_name`; 74 | - `tensor2update`, which maps `tensor_id` to `update_id` 75 | 76 | 77 | ## Rules of converting Framework traces 78 | ### Tensorflow 79 | #### UPDATE operators 80 | 1. Take all downstream operators of `Comm` as UPDATE operators 81 | 2. There may be dependencies between UPDATE operators 82 | #### FW and BW operators 83 | 1. **Assumption**: in TensorFlow, some operators may have multiple traces with the same name in one step, which we call sub-traces; we assume they are contiguous and combine them into one single operator. 84 | 85 | #### Counting the number of steps 86 | 1. If `pre_cat` is not in `[io, fw]` and `cur_cat` is in `[io, fw]`, increment the step count by 1. 87 | 88 | ### MXNET 89 | #### UPDATE operators 90 | 1. We assume there is no dependency between UPDATE operators, except for `UPDATE_CAL`->`UPDATE_ID` 91 | 
-------------------------------------------------------------------------------- /docs/nvprof.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This tutorial introduces how to compare ByteProfile traces and NvProf traces. 4 | 5 | # How to use 6 | 7 | ## Traces Collection 8 | 9 | Use `nvprof` to collect NvProf traces. 10 | ```nvprof -o foo.nvvp your_program``` 11 | 12 | Meanwhile, you should customize the source code and enable the BPF-related environment variables accordingly to collect ByteProfile traces. 13 | 14 | Then you can run `python3 nvprof2json.py --filename <trace.nvvp> > <output.json>` to convert NvProf traces from `.nvvp` to JSON format. 15 | 16 | ## Comparison 17 | 18 | For comparison, you can simply run 19 | `python3 analyze.py --option mapping --path <byteprofile_trace>,<nvprof_trace>` 20 | Then the statistical results will be stored in `mapfrom_op2kernels.xlsx` under the same folder as `<byteprofile_trace>`. 21 | 22 | The rationale is: 23 | 1. for the first iteration, check the bias: examine those relatively large kernel-level traces that overlap with an op-level trace but are not covered by it 24 | 2. for the second iteration, generate the mapping -------------------------------------------------------------------------------- /docs/profile.md: -------------------------------------------------------------------------------- 1 | # Profiler 2 | ### Horovod + Tensorflow 3 | Please follow the instructions in [docs/dependency.md](./dependency.md) to install our customized Horovod and TensorFlow. 4 | To enable profiling, add the following code to your script for a training job using Horovod. 5 | 6 | ``` 7 | recorder = hvd.Recorder() 8 | 9 | @hvd.profile(recorder) 10 | @tf.function 11 | def benchmark_step(first_batch): 12 | ... 13 | with tf.GradientTape() as tape: 14 | ... 15 | tape = hvd.DistributedGradientTape(tape) 16 | ... 17 | ``` 18 | 19 | Besides, the following environment variables need to be set: 20 | ``` 21 | export BYTEPS_TRACE_ON=1 22 | export BYTEPS_TRACE_DIR=path/to/store/traces 23 | export BYTEPS_TRACE_START_STEP=<start_step> 24 | export BYTEPS_TRACE_END_STEP=<end_step> 25 | ``` 26 | Then, launch the distributed training job on a cluster. 27 | 28 | 29 | Before analyzing traces using the dPRO toolkit, you need to collect traces from different workers onto one device and organize them in the following manner. 30 | ``` 31 | global_traces/ 32 | | 33 | - host0/ # traces of device 0 34 | | 35 | - 0/ # traces of GPU 0 on device 0 36 | | 37 | - 1/ # traces of GPU 1 on device 0 38 | ... 39 | | 40 | - host1/ 41 | | 42 | ... 43 | ``` -------------------------------------------------------------------------------- /docs/sample_config.yaml: -------------------------------------------------------------------------------- 1 | # Normal arguments 2 | platform: TENSORFLOW 3 | comm_backend: NCCL 4 | nccl_algo: RING 5 | optimizer: MCMC 6 | xla_candidate_path: {{path}}/.xla_dump/unsafe_resource_deps.txt 7 | 8 | # Store-true arguments 9 | store_true: 10 | pretty: 1 11 | layer_by_layer: 1 12 | 13 | # environment variables 14 | env: 15 | DPRO_GRAPHDEF_DFG_PATH: {{path}}/.xla_dump/graphdef_dag.gml -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | 2 | This document introduces how to analyze the traces of a distributed training job using the dPRO toolkit. 3 | Suppose `DPRO_PATH` denotes the path where dpro is installed (check by running `python3 -c "import dpro; print(dpro.__path__)"`), and `GLOBAL_TRACE_PATH` represents the path where the final traces are stored. 4 | 5 | ### Basic Usage 6 | You can check the help information using the following commands: 7 | ``` 8 | dpro_cli -h 9 | dpro_cli --help 10 | ``` 11 | 12 | It will output all arguments of `DPRO_PATH/analyze.py`. So you can also use dPRO as a python script. 
13 | ``` 14 | python3 DPRO_PATH/analyze.py 15 | ``` 16 | 17 | It is tedious to set so many arguments every time we want to analyze the traces of a job, so dPRO provides a command line tool, namely `dpro_cli`, which automatically searches for a configuration file in YAML format (file extension .yaml). 18 | * The first argument of `dpro_cli` must be `option`, which can be `collect`, `replay` or `optimize`. 19 | * The second argument `path` can be set to `GLOBAL_TRACE_PATH`. 20 | * Users can write a configuration file for each job. [sample_config.yaml](./sample_config.yaml) shows an example with three fields: 1) normal arguments; 2) the `store_true` field, which corresponds to the arguments set with `action=store_true` (see the help info for more details); 3) the `env` field, which allows users to set some environment variables of dPRO. 21 | * `dpro_cli` will automatically substitute `{{path}}` in the config file with the second argument `path`. 22 | 23 | Writing a configuration file helps when you want to fix some arguments for a job, e.g., platform (TENSORFLOW or MXNET), comm_backend (BYTEPS or NCCL), and so on. You can also add additional arguments for different analysis methods, e.g., sub_option, optimizer (DP or MCMC). 24 | 25 | 26 | # Statistic 27 | Users can compute trace statistics using the following commands. 28 | ``` 29 | python3 /home/tiger/byteprofile-analysis/analyze.py \ 30 | --option collect \ 31 | --platform TENSORFLOW \ 32 | --comm_backend NCCL --nccl_algo RING --pretty \ 33 | --path $GLOBAL_TRACE_PATH 34 | ``` 35 | or 36 | ``` 37 | dpro_cli collect $GLOBAL_TRACE_PATH 38 | ``` 39 | 40 | # Replay 41 | Users can simulate a training job using the following commands. 42 | ``` 43 | python3 /home/tiger/byteprofile-analysis/analyze.py \ 44 | --option replay \ 45 | --platform TENSORFLOW \ 46 | --comm_backend NCCL --nccl_algo RING --pretty \ 47 | --path $GLOBAL_TRACE_PATH 48 | ``` 49 | or 50 | ``` 51 | dpro_cli replay $GLOBAL_TRACE_PATH 52 | ``` 53 | 54 | --- 55 | # Optimizer 56 | 57 | ## Operator Fusion 58 | ### Search operator fusion strategies 59 | Sample commands; put the XLA cost model at `./cost_model/_xla/.cost_model`. 60 | ``` 61 | python3 analyze.py --option optimize --sub_option xla,^memory \ 62 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \ 63 | --path $GLOBAL_TRACE_PATH --layer_by_layer --mcmc_beta 10 \ 64 | --xla_candidate_path data/xla_candidates_resnet.txt 65 | ``` 66 | If you do not have an XLA cost model, run the following command to search with estimated fusion time: 67 | ``` 68 | python3 analyze.py --option optimize --sub_option xla \ 69 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty --simulate \ 70 | --path $GLOBAL_TRACE_PATH \ 71 | --workspace $GLOBAL_TRACE_PATH \ 72 | --xla_candidate_path data/xla_candidates_resnet.txt \ 73 | --update_infi_para --layer_by_layer 74 | ``` 75 | 
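The search result is a clustering specification, `cluster_mapping.txt` (see "Generate tensor fusion strategies according to operator fusion strategies" and "Apply strategies" below). Assuming it uses the row format `<operator name> <cluster_id>` described for `XLA_CLUSTER_SPEC` in the "Apply strategies" section, here is a minimal, illustrative sketch for inspecting such a file:

```python
from collections import defaultdict

def summarize_cluster_spec(spec_path):
    """Group operators by cluster id from a clustering specification file.

    Assumes each non-empty row has the form "<operator name> <cluster_id>",
    as described for XLA_CLUSTER_SPEC in the "Apply strategies" section.
    """
    clusters = defaultdict(list)
    with open(spec_path) as f:
        for row in f:
            row = row.strip()
            if not row:
                continue
            op_name, cluster_id = row.rsplit(" ", 1)  # cluster id is the last field
            clusters[int(cluster_id)].append(op_name)
    for cluster_id, ops in sorted(clusters.items()):
        print("cluster {}: {} ops".format(cluster_id, len(ops)))
    return clusters

# e.g., summarize_cluster_spec("cluster_mapping.txt")
```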
76 | ### Sample some example strategies 77 | Fuse operators layer by layer; below is an example where every two layers' operators are fused. 78 | ``` 79 | python3 analyze.py --option optimize --sub_option xla,^memory \ 80 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \ 81 | --path path/to/trace/directory \ 82 | --xla_candidate_path path/to/candidate/file/ \ 83 | --update_infi_para --simulate --layer_num_limit 2 84 | ``` 85 | 86 | ## Tensor Fusion 87 | ### Search tensor fusion strategies 88 | Sample commands: 89 | ``` 90 | python3 analyze.py --option optimize --sub_option tensor_fusion \ 91 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \ 92 | --path $GLOBAL_TRACE_PATH \ 93 | --workspace $GLOBAL_TRACE_PATH 94 | ``` 95 | or 96 | ``` 97 | dpro_cli optimize $GLOBAL_TRACE_PATH --sub_option tsfs 98 | ``` 99 | 100 | ## Combine Tensor Fusion and Operator Fusion 101 | ### Search both tensor fusion and operator fusion strategies 102 | Sample commands: 103 | ``` 104 | python3 analyze.py --option optimize --sub_option tensor_fusion,xla \ 105 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \ 106 | --path $GLOBAL_TRACE_PATH \ 107 | --workspace $GLOBAL_TRACE_PATH \ 108 | --xla_candidate_path /root/byteprofile-analysis/data/xla_candidates_resnet.txt 109 | ``` 110 | 111 | ### Generate tensor fusion strategies according to operator fusion strategies 112 | Sample commands: 113 | ``` 114 | python3 analyze.py --option optimize --sub_option from_opfs2tsfs \ 115 | --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --pretty \ 116 | --path $GLOBAL_TRACE_PATH,<cluster_mapping_path> 117 | ``` 118 | where `<cluster_mapping_path>` denotes the path to cluster_mapping.txt (the operator fusion search result). 119 | 120 | 121 | ## Mixed Precision Training 122 | `TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_PRIORLIST_FILE`: a file containing ops to force quantize, separated by `\n` 123 | `TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_PRIORLIST_ADD`: ops to force quantize, separated by commas 124 | `TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_FORCE`: clear the CLEARLIST and BLACKLIST if set 125 | 126 | --- 127 | # Train Cost Model 128 | ## Cost Model for MultiGPU 129 | 130 | ``` 131 | python3 mg_generate_dataset.py --option optimize --sub_option train_gpu --platform TENSORFLOW --comm_backend NCCL --nccl_algo RING --path $GLOBAL_TRACE_PATH 132 | ``` 133 | `--path` specifies where the traces are stored, organized by GPU model name and ML model name. 134 | 135 | --- 136 | 137 | ### Heat-based Search Algorithm for Operator Fusion 138 | #### Requirements of Weights 139 | 1. Initially, each strategy has a weight of 1 140 | 2. If fuse(a, b) generating c brings speedup, 141 | 1. the weights of fusion strategies involving a, b, c > 1 142 | 2. the weights of de-fusion strategies involving a, b, c < 1 143 | 3. If fuse(a, b) generating c leads to worse performance, 144 | 1. the weights of fusion strategies involving a, b, c < 1 145 | 2. the weights of de-fusion strategies involving a, b, c > 1 146 | 4. If defuse(c) generating a, b is better, the same as item 3 147 | 5. If defuse(c) generating a, b is worse, the same as item 2 148 | 149 | #### Solution 150 | - The heat is directional, i.e., a large heat means an operator is expected to participate in operator fusion, but not in operator partition 151 | - After applying a strategy, if it is a fusion strategy, record ΔT in the heat history list; otherwise, record -ΔT in the heat history list. 152 | - To calculate the final heat H of one operator: if the heat history list is empty, return 0; otherwise, return the value below, where k > 1 so that H > -1 153 | $$H = \frac{1}{n}\sum_i^{n}\frac{e^{\Delta T_i} - 1}{k \Delta t_i}$$ 154 | - With the heat H, calculate the final weight W as follows 155 | $$W = \left\{ 156 | \begin{array}{rl} 157 | 1 + H & , \text{fusion strategy}\\ 158 | \frac{1}{H+1} & , \text{partition strategy}\\ 159 | \end{array} \right.$$ 
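To make the heat and weight formulas concrete, here is a minimal, illustrative sketch (the history-list representation and the constant `k` are assumptions; the actual implementation in the optimizer module may differ):

```python
import math

def calc_heat(heat_history, k=2.0):
    """Final heat H = (1/n) * sum_i (e^{dT_i} - 1) / (k * dt_i).

    heat_history holds (delta_T, delta_t) records as described above: delta_T
    is the recorded speedup (negated for partition strategies) and delta_t the
    associated interval. Returns 0 for an empty history; k > 1 keeps H > -1.
    """
    if not heat_history:
        return 0.0
    n = len(heat_history)
    return sum((math.exp(dT) - 1.0) / (k * dt) for dT, dt in heat_history) / n

def strategy_weight(heat, is_fusion):
    """Weight W = 1 + H for a fusion strategy, 1 / (H + 1) for a partition strategy."""
    return 1.0 + heat if is_fusion else 1.0 / (heat + 1.0)
```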
160 | 
161 | 
162 | ## Apply strategies
163 | 
164 | ### Operator Fusion
165 | XLA is enabled by setting `TF_XLA_FLAGS="--tf_xla_auto_jit=2"`. To apply customized XLA clustering strategies, set `XLA_CLUSTER_SPEC` to the path of the clustering specification file, where each row is in the format of `<operator_name> <cluster_id>`.
166 | 
167 | Besides, we can set `XLA_DUMP_DIR` to the path to store the intermediate information, which can be used to train the XLA cost model:
168 | * `xla_candidates.txt`: candidates
169 | * `unsafe_resource_deps.txt`: unsafe_resource_deps, __currently this file also contains xla_candidates.__
170 | * `xla_clustering.txt`: the clustering strategies being applied; exists only when using default XLA (`XLA_CLUSTER_SPEC` is not set)
171 | 
172 | We can further set `TF_DUMP_GRAPH_PREFIX=${XLA_DUMP_DIR} TF_XLA_FLAGS="--tf_xla_clustering_debug --tf_xla_auto_jit=2"` to dump the graph_def.
173 | 
174 | 
175 | ### Tensor Fusion
176 | 
177 | #### BytePS
178 | Use https://github.com/joapolarbear/byteps.git:byteprofile_rdma
179 | 
180 | You can also configure to fuse tensors into multiple tensor groups, by
181 | 1. specifying the number of tensor groups by setting `BYTEPS_TENSOR_GROUP_NUM=x`
182 | 2. using a specification file by setting `BYTEPS_TENSOR_GROUP_FILE=/path/to/spec`. The file should be a JSON file in the following format, where `0, 1, ...` denote the indexes of tensors.
183 | ```
184 | {
185 |     "mapping": [
186 |         "0+1+2",
187 |         "3+4"
188 |     ]
189 | }
190 | ```
191 | 
192 | 
193 | You can also configure the tensor partition size. A smaller size improves BytePS pipelining, but may incur higher overhead elsewhere, e.g., NCCL coordination, ZMQ message headers, etc. The default and recommended value is 4096000 (in bytes).
194 | 
195 | ```
196 | export BYTEPS_PARTITION_BYTES=y
197 | ```
198 | 
199 | You can also configure the tensor partition size for each tensor using a specification file. Each line of the specification file should follow the format `<tensor_name> <partition_size>`.
200 | ```
201 | export BYTEPS_PARTITION_SPEC_FILE=/path/to/spec
202 | ```
203 | Another way to specify the tensor partition size for a tensor is to use `BYTEPS_PARTITION_SPEC=<tensor_name>=<partition_size>`. You can also specify the partition size for multiple tensors by separating their specifications with commas.
204 | 
205 | #### Horovod
206 | Use https://github.com/joapolarbear/horovod:b_v0.21.0
207 | 
208 | You can also configure to fuse tensors into multiple tensor groups, by
209 | 1. specifying the number of tensor groups by setting `HOROVOD_TENSOR_GROUP_NUM=x`
210 | 2. using a specification file by setting `HOROVOD_TENSOR_GROUP_FILE=/path/to/spec`. The file should be a JSON file in the following format, where `0, 1, ...` denote the indexes of tensors. Note that all tensor indexes should be specified, even if a tensor is not fused with any other tensor.
211 | ```
212 | {
213 |     "mapping": [
214 |         "0+1+2",
215 |         "3+4"
216 |     ]
217 | }
218 | ```
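For instance, a hypothetical Horovod launch that applies the grouping above could look like this (the process count and script name are placeholders):
```
export HOROVOD_TENSOR_GROUP_FILE=/path/to/spec.json
# or simply pick the number of groups and let the framework split the tensors:
# export HOROVOD_TENSOR_GROUP_NUM=2
horovodrun -np 8 python3 train.py
```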
219 | 
--------------------------------------------------------------------------------
/dpro/__init__.py:
--------------------------------------------------------------------------------
1 | from . import base
2 | from . import logger_utils
3 | from . import collect
4 | from . import trace_utils
5 | from . import replay
6 | 
7 | def init(workspace, name, **kwargs):
8 |     from dpro.logger_utils import SingleLogger
9 |     logger = SingleLogger(workspace, name, **kwargs)
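A minimal usage sketch of this entry point (the workspace path and log name below are illustrative):
```
import dpro

# Instantiate the global SingleLogger once before running any analysis pass.
dpro.init("/path/to/workspace", "dpro_log")
```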
--------------------------------------------------------------------------------
/dpro/arg_utils.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from .base import Singleton
3 | 
4 | parser = argparse.ArgumentParser(description="dPRO Arguments",
5 |     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
6 | # parser.add_argument("-s", action="store_true", help="sort the output result")
7 | parser.add_argument("--option", type=str,
8 |     choices=["statistic", "graph", "combine", "mapping", "compare", "critical", "timeline", "replay", "topo_sort", "collect", "3dcompare", "optimize"],
9 |     help="The type of analysis to process, including:\n" +
10 |     "* statistic: show the statistic results\n" +
11 |     "* graph: show the dependency graph\n")
12 | parser.add_argument("--sub_option", type=str, default=None, help="Sub options for each option")
13 | parser.add_argument("--path", type=str, required=True, help="The paths of traces you want to analyze; multiple paths are supported, separated with commas.")
14 | parser.add_argument("--del_queue", action="store_true", help="If set, delete the queue time in communication traces.")
15 | parser.add_argument("--logging_level", type=str, default="INFO", help="Logging level")
16 | parser.add_argument("--clean", action="store_true", help="Flush the log file")
17 | parser.add_argument("--pretty", action="store_true", help="Output necessary info if set")
18 | parser.add_argument("--filter", type=str, default=None, help="Used to show part of the communication operations, separated with commas.")
19 | parser.add_argument("--progress", action="store_true", help="Show the progress bar if set and disable the std output")
20 | parser.add_argument("--debug_traces", action="store_true", help="If set, output traces profiled for the analysis process")
21 | 
22 | ### collect
23 | group_clct = parser.add_argument_group('Trace Collection')
24 | group_clct.add_argument("--comm_backend", type=str, default="NCCL", choices=["NCCL", "BYTEPS", "NONE"], help="Communication backend")
25 | group_clct.add_argument("--platform", type=str, default="TENSORFLOW", choices=["TENSORFLOW", "MXNET"], help="Platform used to run the model")
26 | group_clct.add_argument("--nccl_algo", type=str, default=None, help="NCCL algorithm, Tree or Ring")
27 | group_clct.add_argument("--trace_level", type=str, choices=["debug", "info"], default="info", help="If set to debug, show some trivial traces")
28 | group_clct.add_argument("--disable_revise", action="store_true", help="By default, traces are revised according to SEND-RECV dependencies; set this argument to disable the revision")
29 | group_clct.add_argument("--force", action="store_true", help="Force to re-generate traces, graphs")
30 | group_clct.add_argument("--update_infi_para", action="store_true", help="TensorFlow timelines display UPDATE traces in parallel; set `update_infi_para` to keep all UPDATE traces")
31 | 
32 | ### Used for BytePS trace collection
33 | group_clct_bps = parser.add_argument_group('Trace Collection for BytePS')
34 | group_clct_bps.add_argument("--pcap_file_path", type=str, default=None, help="Path to the directory containing BytePS communication pcap files.")
35 | group_clct_bps.add_argument("--zmq_log_path", type=str, default=None, help="Path to the directory containing BytePS communication zmq log files.")
36 | group_clct_bps.add_argument("--server_log_path", type=str, default=None, help="Path to the directory containing BytePS server log files.")
37 | group_clct_bps.add_argument("--profile_start_step", type=int, default=None, help="The start step of computation profiling. Used for truncating BytePS comm trace.")
38 | group_clct_bps.add_argument("--profile_duration", type=int, default=None, help="The duration (in steps) of computation profiling. Used for truncating BytePS comm trace.")
39 | group_clct_bps.add_argument("--van_type", type=str, choices=["ZMQ", "RDMA"], default=None, help="Type of protocol used in BytePS.")
40 | 
41 | ### statistic
42 | group_stat = parser.add_argument_group('Statistic')
43 | group_stat.add_argument("--sort", action="store_true", help="Sorted in descending order")
44 | group_stat.add_argument("--head", type=int, default=None, help="Print the first few lines")
45 | group_stat.add_argument("--xlsx", action="store_true", help="Output XLSX file of the statistic results")
46 | 
47 | ### replay
48 | group_replay = parser.add_argument_group('Replayer')
49 | group_replay.add_argument("--update_barrier", action="store_true", default=False, help="If true, add a barrier before all UPDATE ops.")
50 | group_replay.add_argument("--update_clip_overlapping", action="store_true", help="If true, clip overlapping UPDATE nodes in the timeline.")
51 | group_replay.add_argument("--step_num", type=int, default=1, help="Default step numbers to replay.")
52 | group_replay.add_argument("--delay_ratio", type=float, default=1.1, help="Delay ratio")
53 | group_replay.add_argument("--full_trace", action="store_true", help="If this arg is set, simulate traces with detailed dependency info.")
54 | group_replay.add_argument("--show_queue", action="store_true", help="If this arg is set, record the queue status of each device during replaying.")
55 | 
56 | ### Optimize
57 | group_opt = parser.add_argument_group('Optimal Strategies Search')
58 | group_opt.add_argument("--optimizer", type=str, default="MCMC", choices=["MCTS", "MCMC", "DP"], help="The algorithm used to search the optimal optimization strategy")
59 | group_opt.add_argument("--ucb_type", type=str, default="AVG", choices=["MAX", "AVG"], help="The type of quality value used in the UCB equation")
60 | group_opt.add_argument("--no_mutation", action="store_true", help="If this arg is set, the default policy of MCTS will not rollout")
61 | group_opt.add_argument("--ucb_gamma", type=float, default=0.1, help="Hyper-parameter used in UCB to control the exploration rate.")
62 | group_opt.add_argument("--ucb_visual", action="store_true", help="If this arg is set, visualize the MCTS search process")
63 | group_opt.add_argument("--no_crit", action="store_true", help="If this arg is set, relax the critical path constraint")
64 | 
65 | group_opt.add_argument("--mcmc_beta", type=float, default=10, help="Hyper-parameter used in MCMC/SA to control the exploration rate")
66 | group_opt.add_argument("--step_size", type=int, default=1, help="Step size used in MCMC optimizer.")
67 | 
68 | group_opt.add_argument("--heat_window_size", type=int, default=5, help="Window size for the heat-based search heuristic.")
69 | group_opt.add_argument("--relabel", action="store_true", help="If this arg is set, relabel the dag with indexes.")
70 | group_opt.add_argument("--ckpt", action="store_true", help="If this arg is set, start search from checkpoint")
71 | group_opt.add_argument("--workspace", type=str, default=None, help="Workspace of the optimizer")
72 | group_opt.add_argument("--memory_budget", type=float, default=16, help="GPU memory budget")
73 | 
74 | group_opt.add_argument("--search_ts_group_num", action="store_true", help="Search the optimal tensor group numbers if set")
75 | group_opt.add_argument("--fit_data_save_dir", type=str, default=None, help="Dump the data used to fit the tensor fusion cost model")
76 | group_opt.add_argument("--test_ts_group_num", type=int, default=None, help="Test the simulation result of fusing "
77 |     "tensors into a specific number of tensor groups; default tensor partition size ~ 4 MB")
78 | 
79 | ### Operator fusion
80 | group_xla = parser.add_argument_group('Operator Fusion')
81 | group_xla.add_argument("--simulate", action="store_true", help="If this arg is set, simulate the XLA cost model,"
82 |     " but still use its rule to determine which operators to fuse.")
83 | group_xla.add_argument("--xla_candidate_path", type=str, default=None, help="XLA candidate path")
84 | group_xla.add_argument("--layer_num_limit", type=str, default=None, help="Sample some operator fusion strategies, "
85 |     "where BW operators are fused layer by layer. "
86 |     "This argument specifies the maximum number of layers that can be fused. "
87 |     "Test multiple values by separating them with commas")
88 | group_xla.add_argument("--layer_by_layer", action="store_true", help="Fuse operators layer by layer, if set true")
89 | group_xla.add_argument("--fusion_once", action="store_true",
90 |     help="If set, one op can be fused only once")
91 | group_xla.add_argument("--disable_estimate", action="store_true",
92 |     help="If set, disable estimating the fused time on failure and raise an error instead")
93 | 
94 | args = parser.parse_args()
95 | 
96 | 
97 | @Singleton
98 | class SingleArg:
99 |     def __init__(self):
100 |         self.args = args
101 | 
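Because `SingleArg` is wrapped by the `Singleton` decorator defined in `dpro/base.py` (the next file), every caller shares one parsed-argument object. A minimal usage sketch:
```
from dpro.arg_utils import SingleArg

args = SingleArg().args  # argv is parsed once, at import time
if args.option == "replay":
    print("Replaying traces from", args.path)
```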
--------------------------------------------------------------------------------
/dpro/base.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | dpro_dir = os.path.dirname(__file__)
4 | 
5 | #!
define a singleton class 6 | def Singleton(cls): 7 | _instance = {} 8 | 9 | def _singleton(*args, **kargs): 10 | if cls not in _instance: 11 | _instance[cls] = cls(*args, **kargs) 12 | return _instance[cls] 13 | 14 | return _singleton 15 | 16 | class bcolors: 17 | ENDC = '\033[0m' 18 | BOLD = '\033[1m' 19 | UNDERLINE = '\033[4m' 20 | CBLINK = '\33[5m' 21 | CBLINK2 = '\33[6m' 22 | CSELECTED = '\33[7m' 23 | 24 | CBLACK = '\33[30m' 25 | CRED = '\33[31m' 26 | CGREEN = '\33[32m' 27 | CYELLOW = '\33[33m' 28 | CBLUE = '\33[34m' 29 | CVIOLET = '\33[35m' 30 | CBEIGE = '\33[36m' 31 | CWHITE = '\33[37m' 32 | 33 | CBLACKBG = '\33[40m' 34 | CREDBG = '\33[41m' 35 | CGREENBG = '\33[42m' 36 | CYELLOWBG = '\33[43m' 37 | CBLUEBG = '\33[44m' 38 | CVIOLETBG = '\33[45m' 39 | CBEIGEBG = '\33[46m' 40 | CWHITEBG = '\33[47m' 41 | 42 | FAIL = '\33[31m' 43 | WARNING = '\33[33m' 44 | -------------------------------------------------------------------------------- /dpro/bps_helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/bps_helper/__init__.py -------------------------------------------------------------------------------- /dpro/cost_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/__init__.py -------------------------------------------------------------------------------- /dpro/cost_model/_gpu_predict/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_gpu_predict/__init__.py -------------------------------------------------------------------------------- /dpro/cost_model/_gpu_predict/dim_reduce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import matplotlib.pyplot as plt 4 | 5 | from mpl_toolkits.mplot3d import Axes3D 6 | from matplotlib.ticker import NullFormatter 7 | from sklearn import manifold 8 | from sklearn.utils import check_random_state 9 | 10 | def init_fig_base(cnt): 11 | h = math.ceil(math.sqrt(cnt)) 12 | w = math.ceil(cnt / h) 13 | fig_base = w * 100 + h * 10 + 1 14 | return fig_base, 0 15 | 16 | class DimReducer: 17 | def __init__(self, xdata, ydata): 18 | ''' 19 | xdata: numpy.ndarray 20 | data needed to reduce dimension, shape = (n_samples, n_dims) 21 | ydata: numpy.ndarray 22 | label data, shape = (n_samples, 1) 23 | ''' 24 | assert xdata.shape[0] > xdata.shape[1], \ 25 | "n_samples should be larger than the dimension: (%d, %d) is given"%(xdata.shape[0], xdata.shape[1]) 26 | assert len(ydata.shape) == 1 or ydata.shape[1] == 1, \ 27 | "label should be a 1 x ndims vector: {} is given".format(ydata.shape) 28 | self.xdata = xdata 29 | self.ydata = ydata.flatten() 30 | 31 | max_y = max(self.ydata) 32 | min_y = min(self.ydata) 33 | N_CLASS = 10 34 | class_step = (max_y - min_y) / N_CLASS 35 | self.ydata_class = np.floor((self.ydata - min_y) / class_step) 36 | 37 | def do_LLE(self, n_comp=2, n_neib=20, show=None): 38 | from sklearn.manifold import LocallyLinearEmbedding 39 | lle = LocallyLinearEmbedding(n_components=n_comp, n_neighbors=n_neib) 40 | X_reduced = lle.fit_transform(self.xdata) 41 | 42 | if show is not None: 43 | ax = self.fig.add_subplot(self.fig_base+show, 
projection='3d') 44 | plt.title('LLE with k = {}'.format(n_neib), size=12) 45 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class) 46 | ax.view_init(20, -19) 47 | 48 | return X_reduced 49 | 50 | def do_MDS(self, n_comp=2, show=None): 51 | from sklearn.manifold import MDS 52 | model = MDS(n_components=n_comp) 53 | X_reduced = model.fit_transform(self.xdata) 54 | if show is not None: 55 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d') 56 | plt.title('MDS', size=12) 57 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class) 58 | ax.view_init(20, -19) 59 | return X_reduced 60 | 61 | def do_LDA(self, n_comp=2, show=None): 62 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 63 | lda = LDA(n_components=n_comp) 64 | X_reduced = lda.fit_transform(self.xdata, self.ydata_class) 65 | if show is not None: 66 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d') 67 | plt.title('LDA', size=12) 68 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class) 69 | ax.view_init(20, -19) 70 | return X_reduced 71 | 72 | def do_reduction(self, n_comp=2, algo='LLE', show=True): 73 | if show: 74 | self.fig = plt.figure(figsize = (9, 8)) 75 | plt.style.use('default') 76 | 77 | if isinstance(algo, str): 78 | if show: 79 | self.fig_base, _ = init_fig_base(1) 80 | if algo == 'LLE': 81 | X_reduced = self.do_LLE(n_comp=n_comp, show=0) 82 | elif algo == 'MDS': 83 | X_reduced = self.do_MDS(n_comp=n_comp, show=0) 84 | elif algo == 'LDA': 85 | X_reduced = self.do_LDA(n_comp=n_comp, show=0) 86 | else: 87 | raise ValueError("Invalid algorithm: {}".format(algo)) 88 | if show: 89 | plt.show() 90 | return X_reduced 91 | elif isinstance(algo, list): 92 | if show: 93 | self.fig_base, _ = init_fig_base(len(algo)) 94 | ret = [] 95 | for idx, _algo in enumerate(algo): 96 | if _algo == 'LLE': 97 | X_reduced = self.do_LLE(n_comp=n_comp, show=idx) 98 | elif _algo == 'MDS': 99 | X_reduced = self.do_MDS(n_comp=n_comp, show=idx) 100 | elif _algo == 'LDA': 101 | X_reduced = self.do_LDA(n_comp=n_comp, show=idx) 102 | else: 103 | raise ValueError("Invalid algorithm: {}".format(_algo)) 104 | ret.append(X_reduced) 105 | if show: 106 | plt.show() 107 | return ret 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /dpro/cost_model/_mixed_precision/.cost_model/CastToFp16.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/.cost_model/CastToFp16.txt -------------------------------------------------------------------------------- /dpro/cost_model/_mixed_precision/.cost_model/CastToFp32.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/.cost_model/CastToFp32.txt -------------------------------------------------------------------------------- /dpro/cost_model/_mixed_precision/.cost_model/Conv2D.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/.cost_model/Conv2D.txt -------------------------------------------------------------------------------- 
/dpro/cost_model/_mixed_precision/.cost_model/MatMul.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/.cost_model/MatMul.txt -------------------------------------------------------------------------------- /dpro/cost_model/_mixed_precision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_mixed_precision/__init__.py -------------------------------------------------------------------------------- /dpro/cost_model/_mixed_precision/dim_reduce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import matplotlib.pyplot as plt 4 | 5 | from mpl_toolkits.mplot3d import Axes3D 6 | from matplotlib.ticker import NullFormatter 7 | from sklearn import manifold 8 | from sklearn.utils import check_random_state 9 | 10 | def init_fig_base(cnt): 11 | h = math.ceil(math.sqrt(cnt)) 12 | w = math.ceil(cnt / h) 13 | fig_base = w * 100 + h * 10 + 1 14 | return fig_base, 0 15 | 16 | class DimReducer: 17 | def __init__(self, xdata, ydata): 18 | ''' 19 | xdata: numpy.ndarray 20 | data needed to reduce dimension, shape = (n_samples, n_dims) 21 | ydata: numpy.ndarray 22 | label data, shape = (n_samples, 1) 23 | ''' 24 | assert xdata.shape[0] > xdata.shape[1], \ 25 | "n_samples should be larger than the dimension: (%d, %d) is given"%(xdata.shape[0], xdata.shape[1]) 26 | assert len(ydata.shape) == 1 or ydata.shape[1] == 1, \ 27 | "label should be a 1 x ndims vector: {} is given".format(ydata.shape) 28 | self.xdata = xdata 29 | self.ydata = ydata.flatten() 30 | 31 | max_y = max(self.ydata) 32 | min_y = min(self.ydata) 33 | N_CLASS = 10 34 | class_step = (max_y - min_y) / N_CLASS 35 | self.ydata_class = np.floor((self.ydata - min_y) / class_step) 36 | 37 | def do_LLE(self, n_comp=2, n_neib=20, show=None): 38 | from sklearn.manifold import LocallyLinearEmbedding 39 | lle = LocallyLinearEmbedding(n_components=n_comp, n_neighbors=n_neib) 40 | X_reduced = lle.fit_transform(self.xdata) 41 | 42 | if show is not None: 43 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d') 44 | plt.title('LLE with k = {}'.format(n_neib), size=12) 45 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class) 46 | ax.view_init(20, -19) 47 | 48 | return X_reduced 49 | 50 | def do_MDS(self, n_comp=2, show=None): 51 | from sklearn.manifold import MDS 52 | model = MDS(n_components=n_comp) 53 | X_reduced = model.fit_transform(self.xdata) 54 | if show is not None: 55 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d') 56 | plt.title('MDS', size=12) 57 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class) 58 | ax.view_init(20, -19) 59 | return X_reduced 60 | 61 | def do_LDA(self, n_comp=2, show=None): 62 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 63 | lda = LDA(n_components=n_comp) 64 | X_reduced = lda.fit_transform(self.xdata, self.ydata_class) 65 | if show is not None: 66 | ax = self.fig.add_subplot(self.fig_base+show, projection='3d') 67 | plt.title('LDA', size=12) 68 | ax.scatter(X_reduced[:, 0], X_reduced[:, 1], self.ydata, c=self.ydata_class) 69 | ax.view_init(20, -19) 70 | return X_reduced 71 | 72 | def do_reduction(self, n_comp=2, algo='LLE', show=True): 
73 | if show: 74 | self.fig = plt.figure(figsize = (9, 8)) 75 | plt.style.use('default') 76 | 77 | if isinstance(algo, str): 78 | if show: 79 | self.fig_base, _ = init_fig_base(1) 80 | if algo == 'LLE': 81 | X_reduced = self.do_LLE(n_comp=n_comp, show=0) 82 | elif algo == 'MDS': 83 | X_reduced = self.do_MDS(n_comp=n_comp, show=0) 84 | elif algo == 'LDA': 85 | X_reduced = self.do_LDA(n_comp=n_comp, show=0) 86 | else: 87 | raise ValueError("Invalid algorithm: {}".format(algo)) 88 | if show: 89 | plt.show() 90 | return X_reduced 91 | elif isinstance(algo, list): 92 | if show: 93 | self.fig_base, _ = init_fig_base(len(algo)) 94 | ret = [] 95 | for idx, _algo in enumerate(algo): 96 | if _algo == 'LLE': 97 | X_reduced = self.do_LLE(n_comp=n_comp, show=idx) 98 | elif _algo == 'MDS': 99 | X_reduced = self.do_MDS(n_comp=n_comp, show=idx) 100 | elif _algo == 'LDA': 101 | X_reduced = self.do_LDA(n_comp=n_comp, show=idx) 102 | else: 103 | raise ValueError("Invalid algorithm: {}".format(_algo)) 104 | ret.append(X_reduced) 105 | if show: 106 | plt.show() 107 | return ret 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /dpro/cost_model/_mixed_precision/test_rst.py: -------------------------------------------------------------------------------- 1 | ''' This script is used to collect TF AMP strategy and test the mixed precision search resutls''' 2 | import re 3 | import os 4 | import json 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description="AMP Parser", 8 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 9 | parser.add_argument("--option", type=str, required=True, help="option.") 10 | parser.add_argument("--env", type=str, default="", help="environment.") 11 | parser.add_argument("--cmd", type=str, default=None, help="command.") 12 | 13 | parser.add_argument("--amp_rst_path", type=str, default=None, help="amp_rst_path.") 14 | parser.add_argument("--search_rst_path", type=str, default=None, help="amp_rst_path.") 15 | parser.add_argument("--timeline_path", type=str, default=None, help="timeline_path.") 16 | parser.add_argument("--target_path", type=str, default=None, help="target_path.") 17 | 18 | args = parser.parse_args() 19 | 20 | def read_amp_fp16_ops(_path): 21 | with open(_path, "r") as f: 22 | fp16_ops = json.load(f) 23 | fp16_ops = fp16_ops['names'] 24 | if "DT_HALF" in fp16_ops: 25 | fp16_ops = [l.split("node ")[1].split(" to DT_HALF")[0] for l in fp16_ops] 26 | return fp16_ops 27 | 28 | def read_search_fp16_ops(_path): 29 | with open(_path, "r") as f: 30 | fp16_ops = json.load(f) 31 | fp16_ops = fp16_ops['best_strategy'] 32 | fp16_ops = [n[1].split("->")[1].split(".")[1] for n in fp16_ops] 33 | return fp16_ops 34 | 35 | if args.option == "parse": 36 | os.system("rm nohup.out") 37 | env = "" 38 | if len(args.env) > 0: 39 | env = " ".join(args.env.split(",")) 40 | if args.cmd is None: 41 | cmd = "python3 /opt/tiger/horovod_examples/tensorflow_synthetic_benchmark.py --num-warmup-batches 1 --num-batches-per-iter 1 --num-iters 1 --amp" 42 | else: 43 | cmd = args.cmd 44 | 45 | print(env + " TF_CPP_MIN_LOG_LEVEL=0 TF_CPP_MIN_VLOG_LEVEL=2 nohup {}".format(cmd)) 46 | os.system(env + " TF_CPP_MIN_LOG_LEVEL=0 TF_CPP_MIN_VLOG_LEVEL=2 nohup {}".format(cmd)) 47 | 48 | with open("nohup.out", 'r') as f: 49 | result = f.read() 50 | 51 | ret = {} 52 | lines = re.findall("Converted [0-9]+/[0-9]+ nodes to " 53 | "float16 precision using [0-9]+ cast\(s\) to " 54 | "float16 \(excluding Const and Variable casts\)", result) 
55 | if len(lines) > 0: 56 | print(lines[0]) 57 | ret["info"] = lines[0] 58 | 59 | lines = re.findall("Changing type .+ of " 60 | ".+ node .+ to DT_HALF", result) 61 | print("check change {} nodes type".format(len(lines))) 62 | 63 | ret["names"] = [l.split("node ")[1].split(" to DT_HALF")[0] for l in lines] 64 | with open("amp_result.json", "w") as f: 65 | json.dump(ret, f) 66 | os.system("rm nohup.out") 67 | elif args.option == "paint": 68 | with open(args.timeline_path, "r") as f: 69 | traces = json.load(f) 70 | 71 | fp16_ops_list = [read_amp_fp16_ops(args.amp_rst_path), read_search_fp16_ops(args.search_rst_path)] 72 | 73 | rst_traces = [] 74 | one_pid = None 75 | for trace in traces: 76 | if "Comm" in trace["name"]: 77 | continue 78 | if one_pid is None: 79 | one_pid = trace["pid"] 80 | elif one_pid != trace["pid"]: 81 | continue 82 | 83 | if trace["args"]["step"] != 0: 84 | continue 85 | 86 | raw_name = trace["name"].split(".")[1] 87 | 88 | is_fp16 = [False, False] 89 | 90 | new_trace = trace.copy() 91 | if raw_name in fp16_ops_list[0]: 92 | new_trace["name"] = "Single float16" 93 | is_fp16[0] = True 94 | else: 95 | new_trace["name"] = "Double float32" 96 | new_trace["pid"] = "TF AMP" 97 | new_trace["tid"] ="default" 98 | rst_traces.append(new_trace) 99 | 100 | new_trace = trace.copy() 101 | if raw_name in fp16_ops_list[1]: 102 | new_trace["name"] = "Single float16" 103 | is_fp16[1] = True 104 | else: 105 | new_trace["name"] = "Double float32" 106 | new_trace["pid"] = "Search Result" 107 | new_trace["tid"] ="default" 108 | rst_traces.append(new_trace) 109 | 110 | if is_fp16[0] != is_fp16[1]: 111 | print("{} - TF_AMP:{}, Search Result:{}".format(raw_name, 112 | "float16" if is_fp16[0] else "float32", 113 | "float16" if is_fp16[1] else "float32")) 114 | 115 | with open(args.target_path, "w") as f: 116 | json.dump(rst_traces, f) 117 | 118 | elif args.option == "show": 119 | if (args.amp_rst_path is not None and args.search_rst_path is not None) or (args.amp_rst_path is None and args.search_rst_path is None): 120 | raise ValueError("Please input one and only one path") 121 | 122 | if args.amp_rst_path is not None: 123 | fp16_ops = read_amp_fp16_ops(args.amp_rst_path) 124 | print("TF AMP: ", ",".join(fp16_ops)) 125 | 126 | if args.search_rst_path is not None: 127 | fp16_ops = read_search_fp16_ops(args.search_rst_path) 128 | print("Search Results: ", ",".join(fp16_ops)) 129 | 130 | if args.target_path is not None: 131 | with open(args.target_path, 'w') as f: 132 | for op in fp16_ops: 133 | f.write(op + "\n") 134 | 135 | 136 | -------------------------------------------------------------------------------- /dpro/cost_model/_tsfs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_tsfs/__init__.py -------------------------------------------------------------------------------- /dpro/cost_model/_tsfs/cost_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import json 4 | 5 | from ...logger_utils import SingleLogger 6 | from ...base import bcolors 7 | 8 | def piecewise_linear_3seg(x, x0, y0, x1, y1, k2): 9 | return np.piecewise(x, [x <= x0, x > x1], 10 | [ 11 | lambda x: y0, 12 | lambda x: (x - x1) + y1, 13 | lambda x: k2 * (x - x0) + y0,]) 14 | p0_3seg = (1, 0, 6, 0, 1) 15 | 16 | def piecewise_linear_2seg(x, x0, y0): 17 | return np.piecewise(x, [x <= x0], 18 | [ 19 
| lambda x: y0, 20 | lambda x: (x - x0) + y0,]) 21 | p0_2seg = (6, 0) 22 | 23 | 24 | class DataRepo: 25 | def __init__(self, tensor_time): 26 | self.para_2seg = None 27 | self.para_3seg = None 28 | self.tensor_time = tensor_time 29 | 30 | def dumps(self): 31 | print("2 seg: ", self.array_str(self.para_2seg)) 32 | print("3 seg: ", self.array_str(self.para_3seg)) 33 | 34 | def array_str(self, a): 35 | return "[" + ", ".join([str(n) for n in a]) + "]" 36 | 37 | def wrap_predict(func, para, xdata): 38 | pred_ydata = func(np.log10(xdata), *para) 39 | return np.power(10, pred_ydata) 40 | # pred_ydata = func(xdata, *para) 41 | # return pred_ydata 42 | 43 | def test_accuracy(func, para, xdata, ydata): 44 | pred_ydata = wrap_predict(func, para, xdata) 45 | mape = np.average(np.abs(pred_ydata - ydata) / ydata) 46 | return mape 47 | 48 | ### TCP 49 | intra_2GPU_para = DataRepo(None) 50 | intra_2GPU_para.para_2seg = [6.478717760741668, -0.7911850258660735] 51 | intra_2GPU_para.para_3seg = [5.768569837527714, -0.8112763281978731, 7.378590861143234, 0.07736945356154445, 0.4601007391482461] 52 | inter_100Gb_para = DataRepo(None) 53 | inter_100Gb_para.para_2seg = [5.72967574893935, 0.27409744017295945] 54 | inter_100Gb_para.para_3seg = [5.481425042939888, 0.24998168803732868, 523.1069698319661, 517.6116145143503, 0.8976445312387689] 55 | 56 | ### 20210909 profile PUSH and PULL standalone in 1wk*1gpu 1server 57 | push_data = DataRepo(None) 58 | push_data.para_2seg = [4.686307490183, -1.662961882088019] 59 | push_data.para_3seg = [4.846827061369098, -1.6483260907019037, 626.2712890335985, 619.9568948850784, 1.1001192383975844] 60 | pull_data = DataRepo(None) 61 | pull_data.para_2seg = [4.803492695527605, -1.5562480802345402] 62 | pull_data.para_3seg = [4.961341192845001, -1.5523328848981286, 626.2723641952061, 619.9558183092073, 1.119712390211427] 63 | 64 | ### 20210916 profile PUSH and PULL standalone in 2wk*8gpu 2server 65 | push_data = DataRepo(None) 66 | push_data.para_2seg = [4.659495844497468, -1.7521176854189915] 67 | push_data.para_3seg = [4.70781790174102, -1.7521176854008658, 626.2991919900213, 619.9289905166377, 1.0343874361266248] 68 | pull_data = DataRepo(None) 69 | pull_data.para_2seg = [4.671509439964712, -1.6513319055098747] 70 | pull_data.para_3seg = [4.7581024691199625, -1.6513319055201428, 626.2641292852817, 619.9640547461936, 1.063909142047732] 71 | 72 | ### 20210926 profile PUSH and PULL in a completed ResNet50 model 73 | # push_data = DataRepo(None) 74 | # push_data.para_3seg = [1.394117768140235, -2.7347276537801393, 6.265639039503503, 0.12958045808905536, 0.7140396422668894] 75 | # pull_data = DataRepo(None) 76 | # pull_data.para_3seg = push_data.para_3seg 77 | # # pull_data.para_3seg = [2.866912655486207, -2.4697033402682265, 4.575557144977073, -1.5755571450053836, 1.3329606768736635] 78 | 79 | # push_data = DataRepo(None) 80 | # push_data.para_3seg = [2.403217615362119, -1.548809007772742, 9.522663714692643, 3.443890309353957, 0.5209623472565877] 81 | # pull_data = DataRepo(None) 82 | # pull_data.para_3seg = push_data.para_3seg 83 | 84 | 85 | # data_table_path = "/home/tiger/sub_op2tensor_size2avg.json" 86 | # data_table_path = "/home/tiger/sub_op2tensor_size2avg_tcp_vgg16.json" 87 | # data_table_path = "/home/tiger/sub_op2tensor_size2avg_tcp_icptv3.json" 88 | data_table_path = os.environ.get("SUBOP_TENSORSIZE_AVG_PATH", None) 89 | if data_table_path is None: 90 | data_table = None 91 | else: 92 | with open(data_table_path, 'r') as fp: 93 | data_table = json.load(fp) 94 | 
SingleLogger().info(bcolors.CGREEN + \ 95 | "Read tensor_size2avg mapping from {}".format(data_table_path) + bcolors.ENDC) 96 | 97 | def interpolation(tensor_size, tensor_size2avg): 98 | tensor_size_list = list(tensor_size2avg.keys()) 99 | available_tensor_size = sorted( 100 | enumerate([float(key) for key in tensor_size_list]), 101 | key=lambda x: x[1]) 102 | i = 0 103 | while i < len(available_tensor_size): 104 | if tensor_size < available_tensor_size[i][1]: 105 | break 106 | i += 1 107 | if i == 0: 108 | i = 1 109 | SingleLogger().warn("[TSFS CM] small tensor size {}".format(tensor_size)) 110 | elif i == len(available_tensor_size): 111 | i = len(available_tensor_size) - 1 112 | SingleLogger().warn("[TSFS CM] large tensor size {}".format(tensor_size)) 113 | x1, x2 = available_tensor_size[i-1][1], available_tensor_size[i][1] 114 | y1 = tensor_size2avg[tensor_size_list[available_tensor_size[i-1][0]]] 115 | y2 = tensor_size2avg[tensor_size_list[available_tensor_size[i][0]]] 116 | return ((y1 - y2) / (x1 - x2)) * (tensor_size - x1) + y1 117 | 118 | piecewise_linear_func = piecewise_linear_3seg 119 | 120 | def predict_ps_intra_comm_time(tensor_size): 121 | return wrap_predict(piecewise_linear_func, intra_2GPU_para.para_3seg, tensor_size) 122 | 123 | USE_INTERPOLATION=False 124 | if USE_INTERPOLATION and data_table is None: 125 | SingleLogger().error("{} must be set if interpolation is used".format("SUBOP_TENSORSIZE_AVG_PATH")) 126 | exit(-1) 127 | 128 | def predict_ps_inter_comm_time(tensor_size, is_push): 129 | if USE_INTERPOLATION: 130 | if is_push: 131 | return interpolation(tensor_size, data_table["PUSH_REQ"]) 132 | else: 133 | return interpolation(tensor_size, data_table["PULL_RES"]) 134 | else: 135 | RATIO = 1.8 136 | if is_push: 137 | return RATIO * wrap_predict(piecewise_linear_func, push_data.para_3seg, tensor_size) 138 | else: 139 | return RATIO * wrap_predict(piecewise_linear_func, pull_data.para_3seg, tensor_size) 140 | ### 20210827_01: Previous method using coarse grained profiled push_pull time 141 | # all_time = wrap_predict(piecewise_linear_3seg, inter_100Gb_para.para_3seg, tensor_size) 142 | # intra_time = predict_ps_intra_comm_time(tensor_size) 143 | # return all_time - intra_time 144 | 145 | -------------------------------------------------------------------------------- /dpro/cost_model/_xla/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/cost_model/_xla/__init__.py -------------------------------------------------------------------------------- /dpro/cost_model/_xla/execute_graph.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.client import timeline 3 | import numpy as np 4 | import os 5 | import json 6 | from tensorflow.python.client import timeline 7 | 8 | def get_shape_from_placeholder(placeholder_op): 9 | dim_protos = placeholder_op.get_attr("shape").dim 10 | return [d.size for d in dim_protos] 11 | 12 | def get_dtype_from_placeholder(placeholder_op): 13 | return str(placeholder_op.get_attr("dtype")).split("\'")[1] 14 | 15 | def get_output_tensors_from_graph(graph): 16 | output_tensors = [] 17 | for op in graph.get_operations(): 18 | output_tensors.append(op.outputs[0]) 19 | return output_tensors 20 | 21 | def execute_graph_def(graph_def, input_node_defs, fetches, profile_result_dir, tf2xla_config_path=None, num_runs=20, trace_start=10, 
trace_end=20): 22 | graph = tf.Graph() 23 | with graph.as_default(): 24 | tf.import_graph_def(graph_def, name="") 25 | input_nodes = [] 26 | for node_def in input_node_defs: 27 | node = graph.get_operation_by_name(node_def.name) 28 | input_nodes.append(node) 29 | output_tensors = [] 30 | for node_def in fetches: 31 | node = graph.get_operation_by_name(node_def.name) 32 | output_tensors.append(node.outputs[0]) 33 | 34 | feed_dict = {} 35 | for input_node in input_nodes: 36 | shape = get_shape_from_placeholder(input_node) 37 | dtype = get_dtype_from_placeholder(input_node) 38 | print(dtype) 39 | feed_dict[input_node.outputs[0]] = np.random.rand(*shape).astype(dtype) 40 | run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) 41 | run_metadata = tf.compat.v1.RunMetadata() 42 | traces = {"traceEvents":[]} 43 | fetch = output_tensors 44 | with tf.compat.v1.Session(graph=graph) as sess: 45 | for i in range(num_runs): 46 | sess.run(fetch, feed_dict, options=run_options, run_metadata=run_metadata) 47 | tl = timeline.Timeline(run_metadata.step_stats) 48 | ctf = json.loads(tl.generate_chrome_trace_format()) 49 | if trace_start < i < trace_end: 50 | traces["traceEvents"] += ctf["traceEvents"] 51 | print("{} th step trace added.".format(i)) 52 | with open(os.path.join(profile_result_dir, "temp.json"), "w") as f: 53 | json.dump(traces, f, indent=4) 54 | print("Ran to the end.") -------------------------------------------------------------------------------- /dpro/cost_model/_xla/p_dispersion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import cvxopt 4 | import os 5 | from scipy.sparse import csr_matrix, eye as speye, vstack 6 | from tqdm import tqdm, trange 7 | 8 | from multiprocessing import Pool 9 | 10 | MUL_DELTA = 10 11 | 12 | # reference: https://stackoverflow.com/a/35566620 13 | def scipy_sparse_to_spmatrix(A): 14 | coo = A.tocoo() 15 | SP = cvxopt.spmatrix(coo.data.tolist(), coo.row.tolist(), coo.col.tolist(), size=A.shape) 16 | return SP 17 | 18 | def p_dispersion_lp(G, k, eps=0.454): 19 | num_nodes = G.shape[0] 20 | diameter = np.max(G) 21 | min_dist = np.min(G[np.nonzero(G)]) 22 | delta = diameter / MUL_DELTA 23 | dists = [min_dist + delta * i for i in range(MUL_DELTA)] 24 | print("N = {}".format(num_nodes)) 25 | variables = [[None]*len(dists)] * num_nodes 26 | print("Creating vector c.") 27 | c = cvxopt.matrix([-r for r in dists] * num_nodes) 28 | print("Building 1st constraint matrix.") 29 | G1_vals = [] 30 | G1_is = [] 31 | G1_js = [] 32 | for i in range(num_nodes): 33 | for j in range(len(dists)): 34 | G1_vals.append(1) 35 | G1_is.append(0) 36 | G1_js.append(i*len(dists) + j) 37 | G1 = csr_matrix((G1_vals, (G1_is, G1_js))) 38 | G1_h = np.array([k]) 39 | print("2nd constraint.") 40 | G2_vals = [] 41 | G2_is = [] 42 | G2_js = [] 43 | for i in trange(num_nodes): 44 | for j in trange(len(dists)): 45 | for u in trange(num_nodes): 46 | dist_u_i = G[i,u] 47 | r = dists[j] 48 | if dist_u_i < r/2: 49 | G2_vals.append(1) 50 | G2_is.append(u) 51 | G2_js.append(i*len(dists) + j) 52 | G2 = csr_matrix((G2_vals, (G2_is, G2_js))) 53 | G2_h = np.ones((num_nodes,1)) 54 | print("X range constraint.") 55 | G3 = speye(num_nodes*len(dists)) 56 | G3_h = np.ones((num_nodes*len(dists), 1)) 57 | G4 = -speye(num_nodes*len(dists)) 58 | G4_h = np.zeros((num_nodes*len(dists), 1)) 59 | 60 | G_concated = vstack([G1, G2, G3, G4]) 61 | h_concated = np.vstack((G1_h, G2_h, G3_h, G4_h)) 62 | 63 | G_cvx = 
scipy_sparse_to_spmatrix(G_concated)
64 |     h_cvx = cvxopt.matrix(h_concated)
65 |     print("Start solving...")
66 |     sol = cvxopt.solvers.lp(c, G_cvx, h_cvx, solver="glpk")
67 |     print("Solution obtained.")
68 |     solution = np.array(sol["x"]).reshape((num_nodes, len(dists)))
69 | 
70 |     # rounding
71 |     print("Start rounding.")
72 |     while True:
73 |         S = set()
74 |         for i in trange(num_nodes):
75 |             for j in range(len(dists)):
76 |                 x_i_r = solution[i,j]
77 |                 r = dists[j]
78 |                 add_prob = (1-eps)*(1-np.e**(-x_i_r))
79 |                 if random.random() < add_prob:
80 |                     should_break = False
81 |                     for (i_, r_) in S:
82 |                         if r < r_ and G[i][i_] < r_/2:
83 |                             should_break = True
84 |                             break
85 |                     if should_break:
86 |                         continue
87 |                     S.add((i, r))
88 |         if len(S) <= k:
89 |             # break
90 |             indices = list(set([i for (i, r) in S]))
91 |             yield indices
92 | 
93 | def sum_min_distance(G, A):
94 |     total_dist = 0  # accumulator; renamed so the inner-loop distance no longer overwrites it
95 |     min_j = {}
96 |     for i in A:
97 |         min_dist = float("inf")
98 |         min_j_for_i = -1
99 |         for j in A:
100 |             if j == i:
101 |                 continue
102 |             dist = G[i,j]
103 |             if dist < min_dist:
104 |                 min_dist = dist
105 |                 min_j_for_i = j
106 |         total_dist += min_dist
107 |         min_j[i] = min_j_for_i
108 |     return total_dist, min_j
109 | 
110 | def sum_min_distance_edit(G, last_dist, min_j, A, idx_rm, idx_add):
111 |     new_min_j = min_j.copy()
112 |     last_dist -= G[idx_rm, min_j[idx_rm]]
113 |     new_min_j.pop(idx_rm)
114 |     for index in A:
115 |         if index != idx_rm and min_j[index] == idx_rm:
116 |             last_dist -= G[index, idx_rm]
117 |             min_j_for_index = -1
118 |             min_dist_for_index = float("inf")
119 |             for A_index in A:
120 |                 if A_index != idx_rm and A_index != index:
121 |                     if G[index, A_index] < min_dist_for_index:
122 |                         min_dist_for_index = G[index, A_index]
123 |                         min_j_for_index = A_index
124 |             new_min_j[index] = min_j_for_index
125 |             last_dist += min_dist_for_index
126 |     min_dist_for_add = float("inf")
127 |     min_j_for_add = -1
128 |     for index in A:
129 |         if index != idx_rm:
130 |             orig_min_dist = G[index, new_min_j[index]]
131 |             if G[index, idx_add] < orig_min_dist:
132 |                 new_min_j[index] = idx_add
133 |                 last_dist -= orig_min_dist
134 |                 last_dist += G[index, idx_add]
135 |             if G[index, idx_add] < min_dist_for_add:
136 |                 min_dist_for_add = G[index, idx_add]
137 |                 min_j_for_add = index
138 |     last_dist += min_dist_for_add
139 |     new_min_j[idx_add] = min_j_for_add
140 |     return last_dist, new_min_j
141 | 
142 | 
143 | def p_dispersion_local_search(G, k, sample_ratio = 1, patience = 3, l=None, tqdm_position=0):
144 |     num_nodes = G.shape[0]
145 |     A = set(random.sample(list(range(num_nodes)), k=k))
146 |     if l is None:
147 |         l = int(np.ceil(k * np.log(k)))
148 |     last_dist, min_j = sum_min_distance(G, A)
149 |     max_dist = last_dist
150 |     sample_k = int(np.ceil(sample_ratio * num_nodes))
151 |     # print("Using {} samples in each iteration, ratio: {}".format(sample_k, sample_ratio))
152 |     opt_counter = 0
153 |     tqdm_iterator = trange(l, position=tqdm_position, desc="worker {}: ".format(tqdm_position), leave=False)
154 |     for i in tqdm_iterator:
155 |         max_new_min_j = None
156 |         max_idx_rm = -1
157 |         max_idx_add = -1
158 |         for out_index in random.sample(range(num_nodes), sample_k):
159 |             if out_index not in A:
160 |                 for a_index in A:
161 |                     new_dist, new_min_j = sum_min_distance_edit(G, last_dist, min_j, A, a_index, out_index)
162 |                     if new_dist > max_dist:
163 |                         max_dist = new_dist
164 |                         max_idx_rm = a_index
165 |                         max_idx_add = out_index
166 |                         max_new_min_j = new_min_j
167 |         if max_idx_rm == -1:
168 |             opt_counter += 1
169 |             if opt_counter >= patience:
170 |                 break
171 |         else:
172 |             A.remove(max_idx_rm)
173 | A.add(max_idx_add) 174 | last_dist = max_dist 175 | min_j = max_new_min_j 176 | tqdm_iterator.close() 177 | return list(A) 178 | 179 | def worker_func(arg): 180 | max_dist = -float("inf") 181 | max_min_j = None 182 | max_a_index = -1 183 | max_out_index = -1 184 | for (G, last_dist, last_min_j, A, a_index, out_index) in arg: 185 | new_dist, new_min_j = sum_min_distance_edit(G, last_dist, last_min_j, A, a_index, out_index) 186 | if new_dist > max_dist: 187 | max_dist = new_dist 188 | max_min_j = new_min_j 189 | max_a_index = a_index 190 | max_out_index = out_index 191 | return (max_dist, max_min_j, max_a_index, max_out_index) 192 | 193 | def parallel_p_dispersion_local_search(G, k, sample_ratio = 1, patience = 3, l=None): 194 | num_nodes = G.shape[0] 195 | A = set(random.sample(list(range(num_nodes)), k=k)) 196 | if l is None: 197 | l = int(np.ceil(k * np.log(k))) 198 | last_dist, min_j = sum_min_distance(G, A) 199 | max_dist = last_dist 200 | sample_k = int(np.ceil(sample_ratio * num_nodes)) 201 | # print("Using {} samples in each iteration, ratio: {}".format(sample_k, sample_ratio)) 202 | opt_counter = 0 203 | for i in trange(l, desc="iter: ", leave=True): 204 | max_new_min_j = None 205 | max_idx_rm = -1 206 | max_idx_add = -1 207 | map_args = [] 208 | for out_index in random.sample(range(num_nodes), sample_k): 209 | if out_index not in A: 210 | for a_index in A: 211 | map_args.append( ( 212 | G, last_dist, min_j, A, a_index, out_index 213 | ) ) 214 | num_cores = min(os.cpu_count(), len(map_args)) 215 | grouped_map_args = [] 216 | chunk_size = int(np.ceil(len(map_args) / num_cores)) 217 | for i in range(num_cores): 218 | actual_chunk_size = min(chunk_size, len(map_args)-i*chunk_size) 219 | grouped_map_args.append(map_args[i*chunk_size:i*chunk_size+actual_chunk_size]) 220 | with Pool(num_cores) as p: 221 | distances = list(tqdm(p.imap_unordered(worker_func, grouped_map_args), total=len(grouped_map_args), desc="inner: ", leave=False)) 222 | for (new_dist, new_min_j, a_index, out_index) in distances: 223 | if new_dist > max_dist: 224 | max_dist = new_dist 225 | max_idx_rm = a_index 226 | max_idx_add = out_index 227 | max_new_min_j = new_min_j 228 | if max_idx_rm == -1: 229 | opt_counter += 1 230 | if opt_counter >= patience: 231 | break 232 | else: 233 | A.remove(max_idx_rm) 234 | A.add(max_idx_add) 235 | last_dist = max_dist 236 | min_j = max_new_min_j 237 | return list(A) -------------------------------------------------------------------------------- /dpro/cost_model/_xla/process_trace.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import os 4 | import time 5 | 6 | TRACE_SUFFIX = "trace.json.gz" 7 | XLA_DUMP_SUFFIX = "after_optimizations.txt" 8 | 9 | def search_for_file(profile_dir, suffix): 10 | for dir_path, dir_names, file_names in os.walk(profile_dir): 11 | for fn in file_names: 12 | if fn.endswith(suffix): 13 | return os.path.join(dir_path, fn) 14 | return None 15 | 16 | def wait_for_file(profile_dir, suffix): 17 | for i in range(20): 18 | file_path = search_for_file(profile_dir, suffix) 19 | if file_path is not None: 20 | return file_path 21 | else: 22 | # sleep 10 ms 23 | time.sleep(0.01) 24 | file_path = search_for_file(profile_dir, suffix) 25 | if file_path is None: 26 | print("[WARNING] Cannot find file with suffix {} in dir {}.".format(suffix, profile_dir)) 27 | return None 28 | else: 29 | return file_path 30 | 31 | def search_for_trace(profile_dir): 32 | return wait_for_file(profile_dir, 
TRACE_SUFFIX) 33 | 34 | def search_for_hlo(xla_dir): 35 | return wait_for_file(xla_dir, XLA_DUMP_SUFFIX) 36 | 37 | def get_execution_time_for_whole_graph(trace_path): 38 | with gzip.open(trace_path, "rb") as f: 39 | trace_data = json.loads(f.read().decode("ascii")) 40 | events = trace_data["traceEvents"] 41 | time_dict = {} 42 | for ev in events: 43 | if "args" not in ev: 44 | continue 45 | if "long_name" not in ev["args"]: 46 | continue 47 | long_name = ev["args"]["long_name"].split(":")[0] 48 | if long_name not in time_dict: 49 | time_dict[long_name] = (0, 0) 50 | time, count = time_dict[long_name] 51 | time_dict[long_name] = (time + ev["dur"], count + 1) 52 | for name, (time, count) in time_dict.items(): 53 | time_dict[name] = (time / count, count) 54 | return time_dict 55 | 56 | def get_execution_time_from_temp_trace(trace_path): 57 | ### TODO (huhanpeng): delete 58 | # one_pid = None 59 | # kernel_pid = None 60 | # kernel_time_dict = {} 61 | with open(trace_path, "r") as f: 62 | trace = json.load(f) 63 | if isinstance(trace, dict): 64 | trace = trace["traceEvents"] 65 | 66 | ### TODO (huhanpeng): delete following 67 | # for tr in trace: 68 | # if tr["ph"] == "M" and tr["name"] == "process_name": 69 | # if "args" in tr and "name" in tr["args"]: 70 | # if "device:GPU" in tr["args"]["name"] and "Compute" in tr["args"]["name"] and "replica" in tr["args"]["name"]: 71 | # one_pid = tr["pid"] 72 | # elif "device:GPU" in tr["args"]["name"] and "stream:all Compute" in tr["args"]["name"]: 73 | # kernel_pid = tr["pid"] 74 | # if one_pid and kernel_pid: 75 | # break 76 | 77 | time_dict = {} 78 | for tr in trace: 79 | if tr["ph"] == "X": 80 | op_name = tr["args"]["name"] 81 | if op_name not in time_dict: 82 | time_dict[op_name] = [] 83 | time_dict[op_name].append(tr["dur"]) 84 | 85 | ### TODO (huhanpeng): delete following try...except... 
86 | # try: 87 | # if tr["ph"] == "X" and tr["pid"] == one_pid: 88 | # op_name = tr["args"]["name"] 89 | # if op_name not in time_dict: 90 | # time_dict[op_name] = [] 91 | # time_dict[op_name].append(tr["dur"]) 92 | # elif tr["ph"] == "X" and tr["pid"] == kernel_pid: 93 | # op_name = tr["args"]["name"] 94 | # if op_name not in kernel_time_dict: 95 | # kernel_time_dict[op_name] = [] 96 | # kernel_time_dict[op_name].append(tr["dur"]) 97 | # except: 98 | # pass 99 | 100 | ### TODO (huhanpeng): delete 101 | # for key, durs in time_dict.items(): 102 | # if key in kernel_time_dict: 103 | # time_dict[key] = kernel_time_dict[key] 104 | 105 | for key, durs in time_dict.items(): 106 | time_dict[key] = (sum(durs) / len(durs), len(durs)) 107 | return time_dict 108 | 109 | def get_execution_time_from_trace(trace_path): 110 | with gzip.open(trace_path, "rb") as f: 111 | trace_data = json.loads(f.read().decode("ascii")) 112 | events = trace_data["traceEvents"] 113 | time = 0 114 | count = 0 115 | for ev in events: 116 | try: 117 | if ev["ph"] == "X" and ev["name"] == "_XlaRun": 118 | time += ev["dur"] 119 | count += 1 120 | except: 121 | pass 122 | if count == 0: 123 | # cannot compile 124 | return 0, 0 125 | return time/count, count 126 | 127 | def get_execution_time_from_uncompiled_trace(trace_path): 128 | with gzip.open(trace_path, "rb") as f: 129 | trace_data = json.loads(f.read().decode("ascii")) 130 | events = trace_data["traceEvents"] 131 | time = 0 132 | count = 0 133 | for ev in events: 134 | try: 135 | if ev["ph"] == "X" and ev["name"] == "SessionRun": 136 | time += ev["dur"] 137 | count += 1 138 | except: 139 | pass 140 | if count == 0: 141 | # cannot compile 142 | return 0, 0 143 | return time/count, count 144 | -------------------------------------------------------------------------------- /dpro/cost_model/_xla/utils.py: -------------------------------------------------------------------------------- 1 | class CMPaths: 2 | DATASET_DIR = "dataset" 3 | DEBUG_DIR = "debug" 4 | HLO_DIR = "hlos" 5 | PROFILE_DIR = "xla_profile" 6 | FEATURE_DIR = "features" 7 | MODULES_DIR = "modules" 8 | RAW_SUBGRAPH_DIR = "generated_subgraph" 9 | 10 | LABEL_FILE = "labels.txt" 11 | TF_SUPPORTED_OPS_FILE = "tf_xla_supported_ops.txt" 12 | 13 | METADATA_FILE = "metadata.json" 14 | RAW_GRAPH_DEF_FILE = "final_graph.json" 15 | CLEANED_GRAPH_DEF_FILE = "cleaned_graph.json" 16 | UNIQUE_OP_HISTORY_FILE = "unique_op_history.txt" 17 | 18 | MAX_CLUSTER_CACHE_FILE = "max_cluster.pickle" 19 | ELEMENTARY_OP_CACHE_FILE = "elementary_ops.txt" 20 | 21 | DATASET_SAVE_FILE = "dataset.pickle" 22 | ELEMENTARY_OP_CACHE_SAVE_FILE = "elem_op_cache.pickle" 23 | OVERHEAD_MODEL_SAVE_FILE = "overhead.pickle" 24 | MODEL_WEIGHT_SAVE_FILE = "model_weights.h5" 25 | MODEL_CONFIG_FILE = "model_config.pickle" 26 | MODULE_CONFIG_FILE = "module_config.txt" 27 | GRAPH_DEF_PICKLE_FILE = "graph_def.pickle" 28 | 29 | AFTER_OPT_TF_DAG_FILE = "partition_def_0.json" 30 | DEBUG_XLA_CANDIATES_FILE = "PLEASE SPECIFY CANDIDATE FILE PATH" 31 | TENSOR_SHAPE_FILE = "tensor_shapes.json" 32 | 33 | class CMEnvs: 34 | WHITE_LIST_PATH = "BPF_XLA_OP_WHITE_LIST_PATH" 35 | TF_PATH = "BPF_TF_PATH" 36 | CM_PROFILE_GPU = "BPF_COST_MODEL_PROFILE_GPU" 37 | 38 | 39 | ### TODO(huhanpeng): ResourceApplyGradientDescent should not be ignored 40 | IGNORE_OP_TYPES = ["ShapeN", "_Arg", "_Send", "_Recv", "VarIsInitializedOp", 41 | "ReadVariableOp", 42 | # "Pad", "SparseSoftmaxCrossEntropyWithLogits", 43 | "VarHandleOp", 44 | "IsVariableInitialized", "ResourceApplyGradientDescent", 
"IteratorToStringHandle", 45 | "IteratorGetNext", "MakeIterator", "IteratorV2", "NoOp", "Placeholder"] 46 | 47 | 48 | def parse_xla_candidate_ops(candidate_path): 49 | candidates = set() 50 | graph_node_id2name = {} 51 | unsafe_resource_deps_ = set() 52 | with open(candidate_path, "r") as f: 53 | lines = f.readlines() 54 | 55 | idx = 0 56 | while idx < len(lines): 57 | if lines[idx].startswith("unsafe_resource_deps_"): 58 | idx += 1 59 | break 60 | ls = lines[idx].strip().split(" ") 61 | candidates.add(ls[0]) 62 | graph_node_id2name[ls[1]] = ls[0] 63 | idx += 1 64 | while idx < len(lines): 65 | ls = lines[idx].strip().split(" ") 66 | unsafe_resource_deps_.add( 67 | (graph_node_id2name[ls[0]], graph_node_id2name[ls[1]])) 68 | idx += 1 69 | return candidates, unsafe_resource_deps_ 70 | -------------------------------------------------------------------------------- /dpro/cost_model/_xla/xla_run_generate_kernel_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo -i 4 | cd ${HOME}/ 5 | rm -rf byteprofile-analysis 6 | git clone https://github.com/joapolarbear/byteprofile-analysis.git 7 | cd byteprofile-analysis 8 | ### install requirements 9 | pip3 install -r requirements.txt 10 | 11 | ### Recompile XLA related Part or directly download it 12 | # export PATH=/usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} \ 13 | # OLD_LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/cudnn/lib64:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nccl/lib:$LD_LIBRARY_PATH \ 14 | # LD_LIBRARY_PATH=/usr/local/lib:/usr/local/cuda/lib64:/usr/local/cudnn:/usr/local/cuda:/usr/local/cuda/compat:$OLD_LD_LIBRARY_PATH \ 15 | # LIBRARY_PATH=/usr/local/lib:/usr/local/cuda/lib64:/usr/local/nccl/lib/:$LIBRARY_PATH 16 | # cd /root/tensorflow 17 | # ./build_bpf_tf_modules.sh 18 | cd ${HOME}/ 19 | wget https://github.com/joapolarbear/tensorflow/releases/download/v2.4.1-dev.2.0.1/dpro_xla_tools.zip 20 | unzip dpro_xla_tools.zip 21 | 22 | ### Config env 23 | # where the modified tensorflow locates 24 | export BPF_TF_PATH=${HOME}/dpro_xla_tools 25 | # the GPU id to run profiling on (specify one GPU only) 26 | export BPF_COST_MODEL_PROFILE_GPU="0" 27 | export CUDA_VISIBLE_DEVICES=0 28 | 29 | 30 | export PATH=/usr/local/cuda/bin:/usr/local/nvidia/bin:${PATH} 31 | export LD_LIBRARY_PATH=/usr/local/lib/python3.7/dist-packages/tensorflow/:$LD_LIBRARY_PATH 32 | DRIVER_VERSION=$(nvidia-smi | grep -Po "CUDA Version: \K([0-9]{1,}\.)+[0-9]{1,}") 33 | TOOLKIT_VERSION=$(nvcc --version | grep -Po "release \K([0-9]{1,}\.)+[0-9]{1,}") 34 | echo "CUDA driver version: $DRIVER_VERSION" 35 | echo "CUDA toolkit version: $TOOLKIT_VERSION" 36 | ### If the driver version is lower than the toolkit version, use compatability mode 37 | # export LD_LIBRARY_PATH=/usr/local/cuda/compat/:$LD_LIBRARY_PATH 38 | sudo ln -sf /usr/local/lib/python3.7/dist-packages/tensorflow/libtensorflow_framework.so.2 /usr/lib/ 39 | 40 | export DPRO_GRAPHDEF_DFG_PATH=xxx 41 | 42 | # The path where partition_def_0.json, tensor_shapes... 
are stored 43 | TRACE_DIR=xxx 44 | OUTPUT_DIR="${HOME}/xla" 45 | mkdir -p $OUTPUT_DIR 46 | 47 | 48 | ### resnet 49 | NUM_RANDOM_SAMPLES=5000 50 | MAX_CLUSTER_SAMPLES=5 51 | MIN_CLUSTER_SIZE=4 52 | MAX_CLUSTER_SIZE=800 53 | 54 | ### VGG16 55 | NUM_RANDOM_SAMPLES=5000 56 | MAX_CLUSTER_SAMPLES=5 57 | MIN_CLUSTER_SIZE=4 58 | MAX_CLUSTER_SIZE=200 59 | 60 | ### VGG19 61 | NUM_RANDOM_SAMPLES=2000 62 | MAX_CLUSTER_SAMPLES=5 63 | MIN_CLUSTER_SIZE=4 64 | MAX_CLUSTER_SIZE=200 65 | 66 | ### InceptionV3 67 | NUM_RANDOM_SAMPLES=5000 68 | MAX_CLUSTER_SAMPLES=5 69 | MIN_CLUSTER_SIZE=4 70 | MAX_CLUSTER_SIZE=800 71 | 72 | ### generate data and train 73 | cd ${HOME}/byteprofile-analysis 74 | python3 xla_cm_entry.py --mode 0 \ 75 | --trace_dir ${TRACE_DIR} \ 76 | --output_dir ${OUTPUT_DIR} \ 77 | --num_samples ${NUM_RANDOM_SAMPLES} \ 78 | --max_cluster_samples ${MAX_CLUSTER_SAMPLES} \ 79 | --min_cluster_size ${MIN_CLUSTER_SIZE} \ 80 | --max_cluster_size ${MAX_CLUSTER_SIZE} \ 81 | --batch_size 256 82 | 83 | ### exit root 84 | exit 85 | if hdfs dfs -test -e /usr/hphu/xla_model/xla ; then 86 | hdfs dfs -rmr /usr/hphu/xla_model/xla 87 | fi 88 | hdfs dfs -put ${HOME}/xla /usr/hphu/xla_model/ -------------------------------------------------------------------------------- /dpro/cost_model/_xla/xla_run_test_module_cm.sh: -------------------------------------------------------------------------------- 1 | # where the modified tensorflow locates 2 | export BPF_TF_PATH="/root/tensorflow" 3 | # this is the GPU used to compile XLA modules. Cost model will be run on another 4 | # differnt GPU (specify one GPU here only) 5 | export BPF_COST_MODEL_PROFILE_GPU="0" 6 | 7 | # modify these 8 | DATASET_DIR="/PATH/TO/DATASET/DIR" 9 | COST_MODEL_DIR="/PATH/TO/COST/MODEL" 10 | 11 | python3 xla_test_module_cm.py --dataset_dir ${DATASET_DIR} --cost_model_dir ${COST_MODEL_DIR} -------------------------------------------------------------------------------- /dpro/cost_model/_xla/xla_run_train_module_cm.sh: -------------------------------------------------------------------------------- 1 | # where the modified tensorflow locates 2 | export BPF_TF_PATH="/root/tensorflow" 3 | # this is the GPU used to compile XLA modules. Cost model will be run on another 4 | # differnt GPU (specify one GPU here only) 5 | export BPF_COST_MODEL_PROFILE_GPU="0" 6 | 7 | # modify these 8 | DATASET_DIR="/opt/tiger/xla/kernel_dataset" 9 | OUTPUT_DIR="/opt/tiger/xla/cost_model" 10 | 11 | python3 xla_train_module_cm.py --dataset_dir ${DATASET_DIR} --output_dir ${OUTPUT_DIR} 12 | -------------------------------------------------------------------------------- /dpro/cost_model/_xla/xlatools.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import subprocess 3 | import os 4 | 5 | from .utils import CMEnvs 6 | from ...logger_utils import SingleLogger 7 | from ...base import bcolors 8 | 9 | if CMEnvs.TF_PATH in os.environ: 10 | BPF_TF_PREFIX = os.environ[CMEnvs.TF_PATH] 11 | else: 12 | BPF_TF_PREFIX = None 13 | SingleLogger().warn(bcolors.CRED + "Environment {} not set. Guessing default TF location.".format(CMEnvs.TF_PATH) + bcolors.ENDC) 14 | 15 | if CMEnvs.CM_PROFILE_GPU in os.environ: 16 | try: 17 | BPF_PROFILE_GPU = int(os.environ[CMEnvs.CM_PROFILE_GPU]) 18 | except: 19 | print("[WARNING] Invalid BPF_COST_MODEL_PROFILE_GPU value (must be an integer)." 
20 | " {} is given".format(CMEnvs.CM_PROFILE_GPU)) 21 | exit(-1) 22 | else: 23 | print("[WARNING] Required environment BPF_COST_MODEL_PROFILE_GPU value not set. Set it as 0 by default") 24 | BPF_PROFILE_GPU = 0 25 | # exit(-1) 26 | 27 | def _check_file_available_for_writing(path): 28 | p = Path(path) 29 | p_dir = p.resolve().parent 30 | if not p_dir.is_dir(): 31 | p.mkdir(parents=True) 32 | 33 | def _check_file_exist_for_reading(path): 34 | p = Path(path) 35 | if not p.is_file(): 36 | raise FileNotFoundError("Cannot find file {}".format(path)) 37 | 38 | def _check_arg_types(args, types): 39 | if len(args) != len(types): 40 | raise RuntimeError("Mismatch number of arguments and types in _check_arg_types. ({} v.s. {})".format(len(args), len(types))) 41 | for index, (arg, arg_type) in enumerate(zip(args, types)): 42 | if not isinstance(arg, arg_type): 43 | raise TypeError("Inappropriate argument type for argument {}. Expected {} but got {}".format(index, arg_type, type(arg))) 44 | 45 | def compile_to_hlo(graph_path, config_path, dump_path_unopt, dump_path_opt, compile_exec=None): 46 | if compile_exec is None: 47 | if BPF_TF_PREFIX is not None: 48 | compile_exec = os.path.join(BPF_TF_PREFIX, "bazel-bin/tensorflow/compiler/byteprofile_xlatools/tfcompile_hlo") 49 | else: 50 | compile_exec = "/root/tensorflow/bazel-bin/tensorflow/compiler/byteprofile_xlatools/tfcompile_hlo" 51 | if not os.path.exists(compile_exec): 52 | print("Cannot find the path to replay_computation_gpu: {}.".format(compile_exec)) 53 | exit(-1) 54 | 55 | _check_arg_types([graph_path, config_path, dump_path_unopt, dump_path_opt], [str] * 4) 56 | _check_file_exist_for_reading(graph_path) 57 | _check_file_exist_for_reading(config_path) 58 | _check_file_available_for_writing(dump_path_unopt) 59 | _check_file_available_for_writing(dump_path_opt) 60 | cmd = "CUDA_VISIBLE_DEVICES={} {} {} {} {} {}".format(str( 61 | BPF_PROFILE_GPU), compile_exec, graph_path, config_path, dump_path_unopt, dump_path_opt) 62 | if not os.path.exists(graph_path): 63 | raise ValueError("graph_path:{} do not exists".format(graph_path)) 64 | subprocess.run(cmd, stdout=subprocess.DEVNULL, 65 | stderr=subprocess.DEVNULL, check=True, shell=True) 66 | # subprocess.run(cmd, check=True, shell=True) 67 | 68 | def replay_and_generate_kernel_sample(sample_id_start, hlo_path, tmp_dir, dataset_path, replay_exec=None): 69 | if replay_exec is None: 70 | if BPF_TF_PREFIX is not None: 71 | replay_exec = os.path.join(BPF_TF_PREFIX, "bazel-bin/tensorflow/compiler/xla/tools/replay_computation_gpu") 72 | else: 73 | replay_exec = "/root/tensorflow/bazel-bin/tensorflow/compiler/xla/tools/replay_computation_gpu" 74 | if not os.path.exists(replay_exec): 75 | print("Cannot find the path to replay_computation_gpu.") 76 | exit(-1) 77 | my_env = os.environ.copy() 78 | my_env["CUDA_VISIBLE_DEVICES"] = str(BPF_PROFILE_GPU) 79 | opt_1 = "--num_runs=50" 80 | opt_2 = "--use_fake_data=true" 81 | opt_3 = "--print_result=false" 82 | opt_4 = "--dataset_path={}".format(dataset_path) 83 | opt_5 = "--temp_dir_path={}".format(tmp_dir) 84 | opt_6 = "--profile_start=30" 85 | opt_7 = "--profile_end=50" 86 | opt_8 = "--sample_id_start={}".format(sample_id_start) 87 | subprocess.run("CUDA_VISIBLE_DEVICES={} {} {} {} {} {} {} {} {} {} {}".format( 88 | str(BPF_PROFILE_GPU), replay_exec, opt_1, opt_2, opt_3, 89 | opt_4, opt_5, opt_6, opt_7, opt_8, hlo_path), 90 | stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=my_env, shell=True, check=True) 91 | 92 | def 
92 | def extract_kernel_features_from_hlo(hlo_path, tmp_dir, extract_exec=None): 93 | if extract_exec is None: 94 | if BPF_TF_PREFIX is not None: 95 | extract_exec = os.path.join(BPF_TF_PREFIX, "bazel-bin/tensorflow/compiler/xla/tools/extract_features_from_hlo") 96 | else: 97 | extract_exec = "/root/tensorflow/bazel-bin/tensorflow/compiler/xla/tools/extract_features_from_hlo" 98 | if not os.path.exists(extract_exec): 99 | print("Cannot find the path to extract_features_from_hlo.") 100 | exit(-1) 101 | 102 | opt_1 = "--hlo_path={}".format(hlo_path) 103 | opt_2 = "--temp_dir_path={}".format(tmp_dir) 104 | subprocess.run([extract_exec, opt_1, opt_2], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True) 105 |
-------------------------------------------------------------------------------- /dpro/cost_model/base.py: -------------------------------------------------------------------------------- 1 | 2 | class OptApplyStrategyError(Exception): 3 | pass 4 | 5 | 6 | class OptNoValidStrategyError(Exception): 7 | pass 8 | 9 | 10 | class OptQueryCostModelError(Exception): 11 | pass 12 | 13 | class _BaseGraphPass: 14 | def __init__(self, opt): 15 | self.opt = opt 16 | self.dag = self.opt.dag 17 | ### token is the identifier of each optimization technique 18 | self.token = None 19 | self.meta_info = self.opt.clct.para_dict 20 | 21 | self.ckpt_dir = self.opt.ckpt_dir 22 | self.spec_dir = self.opt.spec_dir 23 | 24 | def init_search_space(self, *args, **kwargs): 25 | raise NotImplementedError() 26 | 27 | def apply(self, s, __dag, __pkg): 28 | raise NotImplementedError() 29 | 30 | def load_init_ckpt(self): 31 | ''' Load the initial states BEFORE the search process 32 | to reduce the preprocessing time, 33 | e.g., the XLA cost model needs to initialize partitions''' 34 | raise NotImplementedError() 35 | 36 | def load_ckpt(self): 37 | ''' Load checkpoints during the search process ''' 38 | raise NotImplementedError() 39 | 40 | def checkpoint(self): 41 | raise NotImplementedError() 42 | 43 | def flush(self, is_accept): 44 | ''' A strategy may be rejected, so the internal states of the 45 | * cost model should not be changed in apply(), 46 | * but only be changed when the strategy is accepted. 47 | Each cost model may cache changes to its internal states 48 | and flush them when this function is called. 49 | ''' 50 | raise NotImplementedError() 51 | 52 |
-------------------------------------------------------------------------------- /dpro/cost_model/gpu_models_info.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | ALL_GPU_MODELS = ["v100", "a100", "p100", "1080ti", "t4"] 3 | CONFIG_NAMES = ["flops_fp32", "flops_fp16"] 4 | ALL_GPU_MODELS_FILTER = [True, True, True, True, True] 5 | 6 | ### refer to https://www.microway.com/knowledge-center-articles/comparison-of-nvidia-geforce-gpus-and-nvidia-tesla-gpus/ 7 | ### in tflops 8 | GPU_CONFIG = np.array([ 9 | [7.4, 29.7], 10 | [9.7, 78], 11 | [5, 19.95], 12 | [0.355, 0.177], 13 | [0.25, 16.2] 14 | ]) 15 | class GPUConfig: 16 | def __init__(self, gpu_model, configs): 17 | self.name = gpu_model 18 | self.flops_fp32 = configs[0] 19 | self.flops_fp16 = configs[1] 20 | 21 | def ret_gpu_config(gpu_model): 22 | if gpu_model not in ALL_GPU_MODELS: 23 | raise ValueError("Invalid GPU Model name: {}".format(gpu_model)) 24 | return GPUConfig(gpu_model, GPU_CONFIG[ALL_GPU_MODELS.index(gpu_model)]) 25 | 26 | def gpu_filter(gpu_model): 27 | if gpu_model not in ALL_GPU_MODELS: 28 | raise ValueError("Invalid GPU Model name: 
{}".format(gpu_model)) 29 | return ALL_GPU_MODELS_FILTER[ALL_GPU_MODELS.index(gpu_model)] 30 | -------------------------------------------------------------------------------- /dpro/cost_model/mixed_precision.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import time 3 | import os 4 | import pickle 5 | import numpy as np 6 | import ujson as json 7 | from tqdm import tqdm 8 | 9 | from ..arg_utils import SingleArg 10 | from ..trace_utils import * 11 | from ._xla.pk_graph import PKGraph 12 | from .base import _BaseGraphPass 13 | from ._mixed_precision.amp_pred import AMPPredictor 14 | 15 | args_ = SingleArg().args 16 | 17 | class AMPGraphPass(_BaseGraphPass): 18 | def __init__(self, opt): 19 | super().__init__(opt) 20 | ### AMP predictor 21 | self.amp_predictor = AMPPredictor(self.meta_info) 22 | self.token = [">", "<"] 23 | 24 | def init_search_space(self, candidates, _dag: nx.DiGraph, _pkg: PKGraph): 25 | search_space = [] 26 | weights = [] 27 | for n, l in candidates: 28 | # node heat 29 | # heat = self.opt._get_heat_from_history(n) 30 | # ### Nodes that have never been fused 31 | # cat = parse_cat_fine_grained(n) 32 | # pid = parse_pid_from_name(n) 33 | 34 | ### check if mixed precision can be used for this node 35 | if self.amp_predictor.is_need_amp(_dag, n): 36 | search_space.append((">", n, None)) 37 | weights.append(l) 38 | 39 | # return [(">", "host1.rank0->BW.gradients/resnet50/conv2_block3_1_conv/Conv2D_grad/Conv2DBackpropFilter", None)], [1] 40 | SingleLogger().info("MP Cost Model init {} strategies.".format(len(search_space))) 41 | return search_space, weights 42 | 43 | def apply(self, s, __dag, __pkg): 44 | op, target, _ = s 45 | nodes_introduced = self.amp_predictor.quantize(__dag, target) 46 | ### apply this strategy to other GPUs' corresponding operators 47 | ### we assume data parallel, use the same model 48 | on_other_ranks = self.opt._debug_convert_to_other_machines(target) 49 | for target in on_other_ranks: 50 | nodes_introduced += self.amp_predictor.quantize(__dag, target) 51 | return True, nodes_introduced, [] 52 | 53 | def checkpoint(self): 54 | self.amp_predictor.checkpoint() 55 | 56 | def load_ckpt(self): 57 | self.amp_predictor.load_ckpt() 58 | 59 | def load_init_ckpt(self): 60 | init_ckpt_path = os.path.join(ROOT_PATH, "amp_init_ckpt.pickle") 61 | if os.path.isfile(init_ckpt_path): 62 | with open(init_ckpt_path, "rb") as f: 63 | G, PKG, trajectory, _cast_cnt, _num_nonvar_casts_to_fp16, _op_status = pickle.load(f) 64 | self.amp_predictor.cast_cnt = _cast_cnt 65 | self.amp_predictor.num_nonvar_casts_to_fp16 = _num_nonvar_casts_to_fp16 66 | self.amp_predictor.op_status = _op_status 67 | SingleLogger().info("Reading init graph from cache.") 68 | else: 69 | G = self.dag.copy() 70 | PKG = PKGraph(G) 71 | 72 | source_nodes = [n for n in G.nodes() if "host0.rank0" in n] 73 | trajectory = [] 74 | for n in tqdm(source_nodes, total=len(source_nodes)): 75 | if self.amp_predictor.is_need_amp(G, n): 76 | s = (">", n, None) 77 | trajectory.append(s) 78 | self.apply(s, G, PKG) 79 | 80 | with open(init_ckpt_path, "wb") as f: 81 | pickle.dump([G, PKG, trajectory, self.amp_predictor.cast_cnt, 82 | self.amp_predictor.num_nonvar_casts_to_fp16, self.amp_predictor.op_status], f) 83 | SingleLogger().info("Graph cache dumped to {}.".format(init_ckpt_path)) 84 | 85 | SingleLogger().info("Successfully initialized mixed precision strategy with {} cast(s).".format( 86 | self.amp_predictor.num_nonvar_casts_to_fp16)) 87 | return 
G, PKG, trajectory 88 | 89 | def flush(self, is_accept: bool): 90 | self.amp_predictor.flush(is_accept) 91 | -------------------------------------------------------------------------------- /dpro/cost_model/trace_clct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### MXNet env 4 | MXNET_CUDNN_AUTOTUNE_DEFAULT=0 5 | MXNET_GPU_WORKER_NTHREADS=1 6 | MXNET_EXEC_BULK_EXEC_TRAIN=0 7 | 8 | ############################################################################## 9 | ### Configuration 10 | 11 | # MODEL="ResNet50" 12 | MODEL="BertBase" 13 | # MODEL="InceptionV3" 14 | # MODEL="VGG16" 15 | # MODEL="Bert256" 16 | 17 | PLATFORM='TF' 18 | echo "Platform: ${PLATFORM}, Model: ${MODEL}" 19 | 20 | BPF_PATH=/home/tiger/byteprofile-analysis/analyze.py 21 | TRACE_PATH=${BYTEPS_TRACE_DIR}/bps_trace_final.json 22 | BPF_CMD="python3 ${BPF_PATH} --pretty --option collect --nccl_algo RING --path ${BYTEPS_TRACE_DIR} --platform TENSORFLOW --force" 23 | 24 | function bert_env { 25 | export BPF_BATCH_PER_GPU="${BS:-32}" 26 | export BPF_NUMSTEPS="${BPF_NUMSTEPS:-100}" 27 | export BERT_ZIP_DIR=/opt/tiger/bert/data/BERT-Base_uncase 28 | export BERT_BASE_DIR=$BERT_ZIP_DIR/uncased_L-12_H-768_A-12 29 | export MAX_SEQ_LENGTH=128 30 | export MAX_PREDICTIONS_PER_SEQ=20 31 | } 32 | 33 | function funcConfigBaseCMD { 34 | if [ "$MODEL" = "ResNet50" ] || [ "$MODEL" = "VGG16" ] || [ "$MODEL" = "InceptionV3" ]; then 35 | FILE_PATH="/home/tiger/horovod_examples/tensorflow/tensorflow_synthetic_benchmark.py" 36 | else 37 | FILE_PATH="/home/tiger/bert/run_pretraining_single_machine.py" 38 | fi 39 | 40 | if [ "$MODEL" = "ResNet50" ]; then 41 | BASE_CMD="python3 ${FILE_PATH} --batch-size ${batch_size} --classes 1000" 42 | elif [ "$MODEL" = "VGG16" ]; then 43 | BASE_CMD="python3 ${FILE_PATH} --batch-size ${batch_size} --classes 1000 --model VGG16" 44 | elif [ "$MODEL" = "InceptionV3" ]; then 45 | BASE_CMD="python3 ${FILE_PATH} --batch-size ${batch_size} --classes 1000 --model InceptionV3" 46 | elif [ "$MODEL" = "Bert256" ]; then 47 | bert_env 48 | BASE_CMD="python3 ${FILE_PATH} --train_batch_size=${batch_size} --input_file=$BERT_BASE_DIR/tf_examples.tfrecord --output_dir=$BERT_BASE_DIR/pretraining_output --do_train=True --do_eval=False --bert_config_file=$BERT_BASE_DIR/bert_config.json --max_seq_length=$MAX_SEQ_LENGTH --max_predictions_per_seq=$MAX_PREDICTIONS_PER_SEQ --num_train_steps=$BPF_NUMSTEPS --num_warmup_steps=10 --learning_rate=2e-5 --synthetic --model bert_default" 49 | elif [ "$MODEL" = "BertBase" ]; then 50 | bert_env 51 | BASE_CMD="python3 ${FILE_PATH} --train_batch_size=${batch_size} --input_file=$BERT_BASE_DIR/tf_examples.tfrecord --output_dir=$BERT_BASE_DIR/pretraining_output --do_train=True --do_eval=False --bert_config_file=$BERT_BASE_DIR/bert_config.json --max_seq_length=$MAX_SEQ_LENGTH --max_predictions_per_seq=$MAX_PREDICTIONS_PER_SEQ --num_train_steps=$BPF_NUMSTEPS --num_warmup_steps=10 --learning_rate=2e-5 --synthetic --model bert_base" 52 | else 53 | echo "Invalid model: $MODEL" 54 | exit 55 | fi 56 | } 57 | 58 | ### Start to train 59 | if [ ! 
-d "${BYTEPS_TRACE_DIR}/host0" ]; then 60 | mkdir -p "${BYTEPS_TRACE_DIR}/host0" 61 | else 62 | rm -rf ${BYTEPS_TRACE_DIR}/host0/* 63 | fi 64 | echo "Traces are stored at ${BYTEPS_TRACE_DIR}" 65 | 66 | function funcReset { 67 | rm $TRACE_PATH 68 | rm -rf $BYTEPS_TRACE_DIR/host0/* 69 | funcConfigBaseCMD 70 | } 71 | 72 | FIRST_RUN=1 73 | function funcRunAndTest { 74 | funcReset 75 | BYTEPS_TRACE_DIR=$BYTEPS_TRACE_DIR/host0 BYTEPS_TRACE_START_STEP=50 BYTEPS_TRACE_END_STEP=60 \ 76 | nohup ${BASE_CMD} 77 | echo "dPRO fp32 BS=$batch_size: " >> ${BYTEPS_TRACE_DIR}/avg.txt 78 | echo "dPRO fp32 BS=$batch_size: " 79 | echo "Run the command: ${BASE_CMD}" 80 | if [ ${FIRST_RUN} == "1" ]; then 81 | echo "${BPF_CMD} --sub_option amp_data_clct,save_names=fp32,model=resnet,platform=tf,showall=True" 82 | nohup ${BPF_CMD} --sub_option amp_data_clct,save_names=fp32,model=resnet,platform=tf,showall=True 83 | mv $BYTEPS_TRACE_DIR/host0/0 $BYTEPS_TRACE_DIR/.metadata/ 84 | nvidia-smi >> $BYTEPS_TRACE_DIR/.metadata/config.txt 85 | # echo "$bs_to_try" >> $BYTEPS_TRACE_DIR/.metadata/config.txt 86 | else 87 | nohup ${BPF_CMD} --sub_option amp_data_clct,save_names=None,model=resnet,platform=tf,showall=True 88 | fi 89 | 90 | funcReset 91 | BYTEPS_TRACE_DIR=$BYTEPS_TRACE_DIR/host0 BYTEPS_TRACE_START_STEP=50 BYTEPS_TRACE_END_STEP=60 \ 92 | nohup ${BASE_CMD} --amp 93 | echo "dPRO fp16 BS=$batch_size: " >> ${BYTEPS_TRACE_DIR}/avg.txt 94 | echo "dPRO fp16 BS=$batch_size: " 95 | echo "Run the command: ${BASE_CMD} --amp" 96 | if [ ${FIRST_RUN} == "1" ]; then 97 | nohup ${BPF_CMD} --sub_option amp_data_clct,save_names=fp16,model=resnet,platform=tf,showall=True 98 | FIRST_RUN=0 99 | else 100 | nohup ${BPF_CMD} --sub_option amp_data_clct,save_names=None,model=resnet,platform=tf,showall=True 101 | fi 102 | } 103 | 104 | ### Run with different batch size 105 | bs_to_try=(4 8 16 32 64 128 256 512 1024 2048) 106 | for(( id=0; id < "${#bs_to_try[@]}"; id++ )) 107 | do 108 | batch_size=${bs_to_try[$id]} 109 | funcRunAndTest 110 | done 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /dpro/cost_model/trace_filter.py: -------------------------------------------------------------------------------- 1 | ''' This module is used to filter operators that we focus on for AMP 2 | And write them in avg.txt file 3 | write corresponding names in name.txt 4 | ''' 5 | import sys, os 6 | 7 | class TraceFilter: 8 | def __init__(self, save_names=None, model=None, platform=None, showall=None): 9 | self.save_names = save_names 10 | self.platform = platform 11 | self.model = model 12 | 13 | if self.platform == 'tf': 14 | MNIST_CANDIDATES = ["Conv2D", "BiasAdd", "Relu", "MatMul", "Mul", "Cast", "BiasAddGrad", "ApplyAdam", "ReluGrad", "Conv2DBackpropInput", "Conv2DBackpropFilter"] 15 | RESNET50_CANDIDATES = ["Conv2D", "BiasAdd", "Relu", "MatMul", "Mul", "Cast"] 16 | else: 17 | MNIST_CANDIDATES = RESNET50_CANDIDATES = ["conv", "BiasAdd", "Relu", "MatMul", "Mul", "Cast"] 18 | 19 | if "resnet" in self.model.lower(): 20 | self._CANDIDATES = RESNET50_CANDIDATES 21 | elif "mnist" in self.model.lower(): 22 | self._CANDIDATES = MNIST_CANDIDATES 23 | elif "bert" in self.model.lower(): 24 | self._CANDIDATES = None 25 | elif "dense" in self.model.lower(): 26 | self._CANDIDATES = ["_dense", "MatMul", "Mat", "Cast"] 27 | else: 28 | self._CANDIDATES = None 29 | 30 | self.showall = showall.lower() in ["true", "t", "1"] 31 | 32 | def _is_ignore_for_sta(self, name): 33 | if self.showall: 34 | return False 35 | ### 
32 | def _is_ignore_for_sta(self, name): 33 | if self.showall: 34 | return False 35 | ### store the pid for computation 36 | if self._CANDIDATES is None: 37 | return False 38 | for target in self._CANDIDATES: 39 | if target in name: 40 | return False 41 | return True 42 | 43 | def dump_for_cost_model(self, name2sta, _dir): 44 | nameL = [] 45 | avg = [] 46 | var = [] 47 | for name, statistic in sorted(name2sta.items()): 48 | if self._is_ignore_for_sta(name): 49 | continue 50 | name = ".".join(name.split("->")[1].split(".")[1:]) 51 | nameL.append(name) 52 | avg.append(statistic["avg"]) 53 | var.append(statistic["var"]) 54 | # print(nameL, avg) 55 | if self.save_names != "None": 56 | with open(os.path.join(_dir, "name.txt"), "a") as f: 57 | f.write("{}:{}\n".format(self.save_names, str(nameL))) 58 | with open(os.path.join(_dir, "avg.txt"), "a") as f: 59 | f.write(str(avg) + "\n") 60 | f.write(str(var) + "\n") 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 |
-------------------------------------------------------------------------------- /dpro/debug_utils.py: -------------------------------------------------------------------------------- 1 | import ujson as json 2 | import os 3 | import time 4 | 5 | from .base import Singleton 6 | 7 | @Singleton 8 | class DebugRecorder: 9 | def __init__(self, path_=None, is_enable=True): 10 | self.is_enable = is_enable 11 | self.debug_traces = [] 12 | self.path_ = path_ 13 | self.base_time = self.get_time() 14 | self.ts_list = [] 15 | 16 | def get_time(self): 17 | return time.time() * 1e6 18 | 19 | def debug_record(self, name, _ts, pid, tid): 20 | ''' Used for debugging: collect traces while replaying, 21 | * to optimize the replay algorithm 22 | ''' 23 | if not self.is_enable: 24 | return 25 | self.debug_traces.append({ 26 | "name": name, 27 | "ts": _ts, # _ts is in us relative to base_time, consistent with debug_event_end 28 | "dur": ((self.get_time() - self.base_time) - _ts) , 29 | "pid": pid, 30 | "ph": "X", 31 | "tid": tid 32 | }) 33 | 34 | def debug_event_start(self): 35 | if not self.is_enable: 36 | return 37 | self.ts_list.append(self.get_time() - self.base_time) 38 | 39 | def debug_event_end(self, name, pid, tid): 40 | if not self.is_enable: 41 | return 42 | _ts = self.ts_list.pop() 43 | self.debug_traces.append({ 44 | "name": name, 45 | "ts": _ts, 46 | "dur": (self.get_time() - self.base_time - _ts) , 47 | "pid": pid, 48 | "ph": "X", 49 | "tid": tid 50 | }) 51 | 52 | def dump_traces(self, path_=None): 53 | if not self.is_enable: 54 | return 55 | if path_ is not None: 56 | trace_path = path_ 57 | elif self.path_ is not None: 58 | trace_path = self.path_ 59 | else: 60 | raise ValueError("Trace path must be given") 61 | 62 | with open(os.path.join(trace_path, "debug.json"), 'w') as f: 63 | json.dump({"traceEvents": self.debug_traces, 64 | "displayTimeUnit": "ms" 65 | }, f, indent=4) 66 | 67 | 
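# A usage sketch (the pid/tid labels here are hypothetical); assuming the
# Singleton wrapper forwards constructor arguments on first use:
#
#   recorder = DebugRecorder(path_="path/to/traces")
#   recorder.debug_event_start()
#   ...                                    # replay one operator
#   recorder.debug_event_end("FW.conv1", "host0.rank0", "replay")
#   recorder.dump_traces()                 # writes path/to/traces/debug.json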
-------------------------------------------------------------------------------- /dpro/helper/combine_json.py: -------------------------------------------------------------------------------- 1 | ''' Combine trace files 2 | * Usage 3 | python3 combine_json.py files/to/combine path/to/dump/rst pid/names bias 4 | The first argument is a comma-separated list of the JSON trace files to combine; 5 | the second is the path to store the result; 6 | the third gives the new pid label for each input file; 7 | and bias (one value per file, in ms) is used to manually align the time. 8 | An example: python3 combine_json.py path_a,path_b path_rst pid_a,pid_b 0,0 9 | ''' 10 | import ujson as json 11 | import os, sys 12 | ALIGN_TIME = True 13 | KEEP_PID = False 14 | 15 | def combine_files(files, names, bias, output_path): 16 | final_traces = [] 17 | for idx, file in enumerate(files): 18 | with open(file, 'r') as fp: 19 | traces = json.load(fp) 20 | if "traceEvents" in traces: 21 | traces = traces["traceEvents"] 22 | ts = None 23 | for trace in traces: 24 | if ALIGN_TIME and ts is None: 25 | ts = trace["ts"] 26 | if not KEEP_PID: 27 | trace["pid"] = names[idx] 28 | else: 29 | trace["pid"] = names[idx] + "." + trace["pid"] 30 | if ALIGN_TIME: 31 | trace["ts"] = trace["ts"] - ts 32 | trace["ts"] += bias[idx] 33 | final_traces += traces 34 | 35 | with open(output_path, 'w') as fp: 36 | json.dump(final_traces, fp) 37 | 38 | files = sys.argv[1].split(",") 39 | output_path = sys.argv[2] 40 | 41 | if len(files) == 1 and os.path.isdir(files[0]): 42 | names = sorted(os.listdir(files[0])) 43 | files = [os.path.join(files[0], n) for n in names] 44 | else: 45 | names = sys.argv[3].split(",") 46 | 47 | ### Compute bias after the file list is finalized, so that the default bias 48 | ### list has one entry per trace file (not one per character of sys.argv[1]) 49 | if len(sys.argv) >= 5: 50 | bias = [float(n)*1000 for n in sys.argv[4].split(",")] 51 | else: 52 | bias = [0 for _ in files] 53 | combine_files(files, names, bias, output_path) 54 |
-------------------------------------------------------------------------------- /dpro/helper/compare_graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | from google.protobuf.json_format import MessageToJson 3 | from google.protobuf.text_format import Parse 4 | import tensorflow as tf 5 | import json 6 | 7 | try: 8 | GraphDef = tf.GraphDef 9 | except: 10 | GraphDef = tf.compat.v1.GraphDef 11 | 12 | def tf_relabel_func(_name, update_nodes_in_dag): 13 | for prefix in ["Comm.", "Comp.", "BW.", "FW.", "UPDATE_."]: 14 | if _name.startswith(prefix): 15 | return _name 16 | if _name.startswith("^"): 17 | _name = _name[1:] 18 | last_slash_pos = _name.rfind("/") 19 | if last_slash_pos != -1 and last_slash_pos < len(_name)-1 and _name[last_slash_pos+1] == "_": 20 | _name = _name[:last_slash_pos] 21 | if "BytePSPushPull" in _name and "tensor" not in _name: 22 | _name = "Comm." + _name 23 | elif "allreduce" in _name.lower(): 24 | if "." in _name: 25 | _, tensor_name = _name.split(".") 26 | if "_" in tensor_name: 27 | tensor_name = tensor_name.split("_")[0] 28 | _name = "Comm." + tensor_name 29 | else: 30 | _name = "UPDATE_." + _name 31 | else: 32 | if update_nodes_in_dag is not None and _name in update_nodes_in_dag \ 33 | or _name == "GradientDescent": 34 | _name = "UPDATE_." + _name 35 | elif _name == "GradientDescent": 36 | _name = "" 37 | elif _name.startswith("gradients"): 38 | _name = "BW." + _name 39 | else: 40 | _name = "FW." 
+ _name 41 | return _name 42 | 43 | def wrap_read_graphdef(graphdef_path): 44 | if graphdef_path.endswith("pbtxt"): 45 | with open(graphdef_path, "r") as f: 46 | pb = f.read() 47 | graph_def = Parse(pb, GraphDef()) 48 | json_string = MessageToJson(graph_def) 49 | graph_def = json.loads(json_string) 50 | else: 51 | with open(graphdef_path, "r") as f: 52 | graph_def = json.load(f) 53 | graph = nx.DiGraph() 54 | for node in graph_def["node"]: 55 | if "input" in node: 56 | for input_tensor_name in node["input"]: 57 | input_node_name = input_tensor_name.split(":")[0] 58 | graph.add_edge(input_node_name, node["name"]) 59 | update_nodes_in_dag = set() 60 | def recursive_add_succs(_node): 61 | for succ_ in graph.successors(_node): 62 | update_nodes_in_dag.add(succ_) 63 | recursive_add_succs(succ_) 64 | for node in graph.nodes: 65 | if "allreduce" in node.lower() or "bytepspushpull" in node.lower(): 66 | recursive_add_succs(node) 67 | new_graph = nx.DiGraph() 68 | for u, v in graph.edges: 69 | new_graph.add_edge(tf_relabel_func(u, update_nodes_in_dag), tf_relabel_func(v, update_nodes_in_dag)) 70 | return new_graph, update_nodes_in_dag 71 | 72 | dag = nx.read_gml("/root/capture_file/run_0_dec8/simple_dag.gml") 73 | 74 | graphdef, update_nodes = wrap_read_graphdef("/root/bert/traces/before_mark_for_compilation_5.pbtxt") 75 | 76 | dag_nodes = set(dag.nodes) 77 | graphdef_nodes = set(graphdef.nodes) 78 | 79 | import code 80 | code.interact(local=locals()) 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /dpro/helper/get_iter_time_from_trace.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open("/Users/chenyu/Downloads/20210127_02_hvd_tf_vgg16_rdma_apply_xla_no_tensor_fusion/combined.json", "r") as f: 4 | trace = json.load(f) 5 | 6 | # one_pid = -1 7 | pids = set() 8 | for ev in trace["traceEvents"]: 9 | # if ev["ph"] == "M": 10 | # if ev["name"] == "process_name" and "name" in ev["args"]: 11 | # # if "/job:localhost/replica:0/task:0/device:GPU:" in ev["args"]["name"] \ 12 | # print(ev["args"]["name"]) 13 | # if "stream:all" in ev["args"]["name"] \ 14 | # and "Compute" in ev["args"]["name"]: 15 | # one_pid = ev["pid"] 16 | pids.add(ev["pid"]) 17 | 18 | evs = sorted(trace["traceEvents"], key=lambda x: x["ts"]) 19 | 20 | iter_times_pid = {} 21 | 22 | for pid in pids: 23 | source_sts = [] 24 | dep_eds = [] 25 | started = False 26 | last_assign = -1 27 | for ev in evs: 28 | if ev["ph"] == "X" and ev["pid"] == pid: 29 | # if "args" in ev and ev["args"]["name"] == "_SOURCE": 30 | if "args" in ev and "ncclAllReduceRingLLKernel" in ev["args"]["name"] and started == False: 31 | source_sts.append(ev["ts"]) 32 | if last_assign != -1: 33 | dep_eds.append(last_assign) 34 | last_assign = -1 35 | started = True 36 | # elif "args" in ev and ev["args"]["name"] == "group_deps_1": 37 | elif "args" in ev and "ncclAllReduceRingLLKernel" in ev["args"]["name"]: 38 | # started = False 39 | last_assign = ev["ts"] + ev["dur"] 40 | elif "args" in ev and "GradientDescent" == ev["args"]["name"]: 41 | started = False 42 | if last_assign != -1: 43 | dep_eds.append(last_assign) 44 | 45 | source_sts = sorted(source_sts) 46 | dep_eds = sorted(dep_eds) 47 | 48 | iter_times = [] 49 | for i in range(len(source_sts)): 50 | iter_times.append((dep_eds[i] - source_sts[i]) / 1000) 51 | iter_times_pid[pid] = iter_times 52 | 53 | avg = [] 54 | avg_per_iter = [float("inf")] * 10 55 | for pid, iter_times in iter_times_pid.items(): 56 
| print("PID {}: {}".format(pid, iter_times)) 57 | avg += iter_times 58 | for idx, time in enumerate(iter_times): 59 | avg_per_iter[idx] = min(time, avg_per_iter[idx]) 60 | 61 | print("Average: {}".format(sum(avg) / len(avg))) 62 | print("Average min per iter: {}, details: {}".format(sum(avg_per_iter)/len(avg_per_iter), avg_per_iter)) 63 | -------------------------------------------------------------------------------- /dpro/helper/tf_flops_profile.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | Refer to: https://gist.github.com/shinseung428/752f284d1c065870d7f5a7e4208f0583 4 | ''' 5 | import json 6 | import os, sys 7 | import tensorflow as tf 8 | from google.protobuf.json_format import Parse as ParseJSON 9 | from google.protobuf.text_format import Parse as ParseText 10 | from google.protobuf.json_format import MessageToJson 11 | try: 12 | GraphDef = tf.GraphDef 13 | except: 14 | GraphDef = tf.compat.v1.GraphDef 15 | 16 | try: 17 | import horovod.tensorflow as hvd 18 | except: 19 | pass 20 | 21 | 22 | def profile_flops(graph_def_path, tmp_path): 23 | with open(graph_def_path, "r") as f: 24 | if graph_def_path.endswith("pbtxt"): 25 | pb = f.read() 26 | graph_def = ParseText(pb, GraphDef()) 27 | json_string = MessageToJson(graph_def) 28 | graph_def_as_json = json.loads(json_string) 29 | else: 30 | graph_def_as_json = json.load(f) 31 | cleaned_graph_def_str = json.dumps(graph_def_as_json) 32 | graph_def = ParseJSON(cleaned_graph_def_str, GraphDef()) 33 | 34 | with tf.Graph().as_default() as graph: 35 | tf.import_graph_def(graph_def, name='') 36 | with graph.as_default(): 37 | opt = (tf.profiler.ProfileOptionBuilder( 38 | tf.profiler.ProfileOptionBuilder.float_operation()) 39 | .with_file_output(tmp_path) 40 | .build()) 41 | flops = tf.profiler.profile(graph, options=opt) 42 | total_flops = flops.total_float_ops 43 | print ("========================================================") 44 | print ('Total Flops : {}'.format(total_flops)) 45 | 46 | # opt = tf.profiler.ProfileOptionBuilder.time_and_memory() 47 | # rst = tf.profiler.profile(graph, options=opt) 48 | # print(type(rst)) 49 | 50 | def parse_flops_dict(graph_def_path, tmp_path): 51 | profile_flops(graph_def_path, tmp_path) 52 | op_name2flops = {} 53 | with open(tmp_path, 'r') as fp: 54 | lines = fp.read().split("Profile:\n")[1].split("\n")[2:] 55 | for line in lines: 56 | line_split = line.split(" ") 57 | if len(line_split) < 5: 58 | continue 59 | # print(line_split) 60 | op_name = line_split[2] 61 | flops_str = line_split[3].split("/")[1] 62 | if flops_str[-1] == "k": 63 | flops = float(flops_str[:-1]) * 1e3 64 | elif flops_str[-1] == "m": 65 | flops = float(flops_str[:-1]) * 1e6 66 | elif flops_str[-1] == "b": 67 | flops = float(flops_str[:-1]) * 1e9 68 | elif flops_str[-1] == "p": 69 | flops = float(flops_str[:-1]) * 1e12 70 | else: 71 | flops = float(flops_str) 72 | op_name2flops[op_name] = flops 73 | return op_name2flops 74 | 75 | if __name__ == "__main__": 76 | graph_def_path = sys.argv[1] 77 | parse_flops_dict(graph_def_path, "flops_log.txt") 78 | -------------------------------------------------------------------------------- /dpro/helper/visualize.py: -------------------------------------------------------------------------------- 1 | import json 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | import numpy as np 5 | import os, sys 6 | import math 7 | 8 | from ..arg_utils import SingleArg 9 | args = SingleArg().args 10 | 11 | ''' visualize the number of 
operators being queued on each device for the replayer 12 | ''' 13 | 14 | def init_fig_base(cnt): 15 | h = math.ceil(math.sqrt(cnt)) 16 | w = math.ceil(cnt / h) 17 | fig_base = w * 100 + h * 10 + 1 18 | return fig_base, 0 19 | 20 | with open(os.path.join(args.path, 'queue_status.json'), 'r') as fp: 21 | rst = json.load(fp) 22 | 23 | MAXIMUM_GROUP = 4 24 | plt.figure(num=1, figsize=(8, 6)) 25 | clrs = sns.color_palette("husl", MAXIMUM_GROUP+1) 26 | 27 | ### shape = (time+num_of_nodes_queued, num_data) 28 | data = np.array(sorted(rst['data'], key=lambda x:x[0])).T 29 | 30 | sample_num = 1000 31 | if sample_num is None: 32 | mask = np.ones(data.shape[1], dtype=bool) 33 | else: 34 | mask = np.zeros(data.shape[1], dtype=bool) 35 | sample_idx = np.random.choice(data.shape[1], sample_num, replace=False) 36 | mask[sample_idx] = True 37 | 38 | group_dict = {} 39 | for idx, n in sorted(enumerate(rst['names']), key=lambda x: x[1]): 40 | group = n.split('->')[0] 41 | if group not in group_dict: 42 | group_dict[group] = [] 43 | group_dict[group].append(idx) 44 | 45 | fig_base, _ = init_fig_base(min(MAXIMUM_GROUP, len(group_dict))) 46 | for idx, (group, name_idx_list) in enumerate(group_dict.items()): 47 | if idx >= MAXIMUM_GROUP: 48 | break 49 | ax = plt.subplot(fig_base + idx) 50 | for line_idx, name_idx in enumerate(name_idx_list): # avoid shadowing the subplot index 51 | ax.plot(data[0][mask]/1000., data[name_idx+1][mask], c=clrs[line_idx], label=rst['names'][name_idx]) 52 | plt.legend() 53 | plt.xlabel('Time (ms)') 54 | plt.ylabel('# of operators being queued') 55 | plt.title(group) 56 | plt.show() 57 | 58 | 59 | 60 | 61 | 62 | 
-------------------------------------------------------------------------------- /dpro/hvd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/hvd/__init__.py -------------------------------------------------------------------------------- /dpro/logger_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os, sys 3 | 4 | from .base import Singleton 5 | 6 | LOG_LEVEL_NAME = ["DEBUG", "INFO", "WARNING", "ERROR", "FATAL"] 7 | 8 | ## Ref: https://stackoverflow.com/questions/12980512/custom-logger-class-and-correct-line-number-function-name-in-log 9 | # This code is mainly copied from the python logging module, with minor modifications 10 | 11 | # _srcfile is used when walking the stack to check when we've got the first 12 | # caller stack frame. 13 | # 14 | if hasattr(sys, 'frozen'): #support for py2exe 15 | _srcfile = "logging%s__init__%s" % (os.sep, __file__[-4:]) 16 | elif __file__[-4:].lower() in ['.pyc', '.pyo']: 17 | _srcfile = __file__[:-4] + '.py' 18 | else: 19 | _srcfile = __file__ 20 | _srcfile = os.path.normcase(_srcfile) 21 | 22 | @Singleton 23 | class SingleLogger: 24 | def __init__(self, path, name, logging_level="INFO", is_clean=False, show_progress=False): 25 | dirname = path if os.path.isdir(path) else os.path.dirname(path) 26 | dirname = os.path.join(dirname, ".log") 27 | if not os.path.exists(dirname): 28 | os.makedirs(dirname) 29 | logfile = os.path.join(dirname, "log_option-" + name + ".txt") 30 | if is_clean and os.path.exists(logfile): 31 | os.remove(logfile) 32 | #! config logging
33 | self.logger = logging.getLogger(name) 34 | log_level = logging_level.lower() 35 | if log_level == "trace": 36 | _log_level = logging.DEBUG # the standard logging module has no TRACE level; fall back to DEBUG 37 | elif log_level == "debug": 38 | _log_level = logging.DEBUG 39 | elif log_level == "warn" or log_level == "warning": 40 | _log_level = logging.WARNING 41 | elif log_level == "error": 42 | _log_level = logging.ERROR 43 | else: 44 | _log_level = logging.INFO 45 | self.logger.setLevel(level=_log_level) 46 | 47 | formatter = logging.Formatter('[%(asctime)s] [%(filename)s:%(lineno)d] %(levelname)s - %(message)s', datefmt="%Y-%m-%d %H:%M:%S") 48 | 49 | #! bind some file stream 50 | handler = logging.FileHandler(logfile) 51 | handler.setLevel(_log_level) 52 | handler.setFormatter(formatter) 53 | self.logger.addHandler(handler) 54 | 55 | if not show_progress: 56 | #! if we want to show progress, no need to bind the output stream 57 | console = logging.StreamHandler() 58 | console.setLevel(_log_level) 59 | console.setFormatter(formatter) 60 | self.logger.addHandler(console) 61 | 62 | def info(self, msg, *args, **kwargs): 63 | self._log(logging.INFO, msg, args, **kwargs) 64 | 65 | def error(self, msg, *args, **kwargs): 66 | self._log(logging.ERROR, msg, args, **kwargs) 67 | 68 | def debug(self, msg, *args, **kwargs): 69 | self._log(logging.DEBUG, msg, args, **kwargs) 70 | 71 | def warn(self, msg, *args, **kwargs): 72 | self._log(logging.WARNING, msg, args, **kwargs) 73 | 74 | def warning(self, msg, *args, **kwargs): 75 | self._log(logging.WARNING, msg, args, **kwargs) 76 | 77 | def _log(self, level, msg, args, exc_info=None, extra=None): 78 | """ 79 | Low-level logging routine which creates a LogRecord and then calls 80 | all the handlers of this logger to handle the record. 81 | """ 82 | # Add wrapping functionality here. 83 | if _srcfile: 84 | #IronPython doesn't track Python frames, so findCaller throws an 85 | #exception on some versions of IronPython. We trap it here so that 86 | #IronPython can use logging. 87 | try: 88 | fn, lno, func = self.findCaller() 89 | except ValueError: 90 | fn, lno, func = "(unknown file)", 0, "(unknown function)" 91 | else: 92 | fn, lno, func = "(unknown file)", 0, "(unknown function)" 93 | if exc_info: 94 | if not isinstance(exc_info, tuple): 95 | exc_info = sys.exc_info() 96 | record = self.logger.makeRecord( 97 | self.logger.name, level, fn, lno, msg, args, exc_info, func, extra) 98 | self.logger.handle(record) 99 | 100 | 
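    # Typical use (a sketch): the first call constructs the singleton with its
    # arguments (assuming the Singleton wrapper forwards them), and later bare
    # calls reuse the same instance, e.g.
    #   SingleLogger("path/to/traces", "replay", logging_level="INFO")
    #   SingleLogger().info("Replay finished in {} ms".format(t))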
101 | def findCaller(self): 102 | """ 103 | Find the stack frame of the caller so that we can note the source 104 | file name, line number and function name. 105 | """ 106 | f = logging.currentframe() 107 | #On some versions of IronPython, currentframe() returns None if 108 | #IronPython isn't run with -X:Frames. 109 | if f is not None: 110 | f = f.f_back 111 | rv = "(unknown file)", 0, "(unknown function)" 112 | while hasattr(f, "f_code"): 113 | co = f.f_code 114 | filename = os.path.normcase(co.co_filename) 115 | if filename == _srcfile: 116 | f = f.f_back 117 | continue 118 | rv = (co.co_filename, f.f_lineno, co.co_name) 119 | break 120 | return rv 121 | 122 | 123 |
-------------------------------------------------------------------------------- /dpro/memory/.gitignore: -------------------------------------------------------------------------------- 1 | *_test.py 2 | scripts/ -------------------------------------------------------------------------------- /dpro/memory/README.md: -------------------------------------------------------------------------------- 1 | # Memory Estimation 2 | 3 | Usage: 4 | ```python 5 | from memory import MemoryEstimator 6 | 7 | memory_estimator = MemoryEstimator("TENSORFLOW") 8 | estimated_memory_usage = memory_estimator.estimate(dag, param_dict) 9 | ``` 10 | 11 | default unit: GB 12 | 13 | 14 | ## TODO 15 | 16 | - [ ] Workload specific: we need to know the model. 17 | 18 | e.g. select forward nodes 19 | ```py 20 | def _is_forward(name): 21 | if name.startswith(DEL.join([RANK0_PREFIX, FORWARD_CAT]) + "bert"): 22 | return True 23 | return False 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /dpro/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from .estimator import MemoryEstimator 2 | 3 | 4 | __all__ = [ 5 | "MemoryEstimator" 6 | ] -------------------------------------------------------------------------------- /dpro/memory/cost_model.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from copy import deepcopy 3 | 4 | from .gradient_accumulation import get_gradient_accumulation_edited_graph 5 | from ..logger_utils import SingleLogger 6 | from ..cost_model.base import _BaseGraphPass 7 | from .recomputation import get_recomputation_edited_graph 8 | from ..replay import Replayer 9 | from .utils import * 10 | 11 | 12 | def get_execution_time(dag, clct): 13 | replayer = Replayer(dag=dag, _step_num=1, 14 | leaf_dirs=clct.all_prefix_list(), 15 | dump_path=clct.pm.path, 16 | comm_backend=clct.comm_backend, 17 | byteps_graph=clct.byteps_graph) 18 | step_end_time_ms = [ 19 | t / 1000 for t in replayer.replayAndDelay(None).values()] 20 | return max(step_end_time_ms) 21 | 22 | def has_recomputation(schedule): 23 | for op in schedule.operators: 24 | if op.requires_grad is False: 25 | return True 26 | return False 27 | 28 | class MemoryGraphPass(_BaseGraphPass): 29 | def __init__(self, opt): 30 | super().__init__(opt) 31 | self.token = ["gradient_accumulation", "recomputation"] 32 | self.cnts = [0, 0] 33 | 34 | def init_search_space(self, candidates, dag, pkg): 35 | candidate_strategies = [] 36 | candidate_weights = [] 37 | for i, strategy in enumerate(self.token): 38 | if strategy == "recomputation": 39 | if has_recomputation(self.opt.memory_estimator.schedule): 40 | continue 41 | 42 | func = self.func_factory(strategy) 43 | estimated_time, estimated_memory = func(dag, self.opt.clct) 44 | if estimated_memory > self.opt.memory_budget: 45 | continue 46 | candidate_strategies.append((strategy, None, None)) 47 | candidate_weights.append(1./(self.cnts[i] + 1)) # weight each strategy by its own application count 48 | 49 | return candidate_strategies, candidate_weights 50 | 51 | def apply(self, s, dag, pkg): 52 | if s[0] == "gradient_accumulation": 53 | if 
self.opt.memory_estimator.batch_size > 1: 54 | self.opt.memory_estimator.batch_size /= 2 55 | get_gradient_accumulation_edited_graph(dag) 56 | self.cnts[0] += 1 57 | elif s[0] == "recomputation": 58 | get_recomputation_edited_graph( 59 | dag, self.opt.memory_estimator.schedule, "speed") 60 | self.cnts[1] += 1 61 | else: 62 | raise NotImplementedError 63 | return True, [], [] 64 | 65 | def func_factory(self, strategy): 66 | func_name = "_get_estimated_time_and_memory_of_" + strategy 67 | return getattr(self, func_name) 68 | 69 | def _get_estimated_time_and_memory_of_gradient_accumulation(self, dag, clct): 70 | dag_copy = deepcopy(dag) 71 | get_gradient_accumulation_edited_graph(dag_copy) 72 | estimated_time = get_execution_time(dag_copy, clct) 73 | 74 | self.opt.memory_estimator.batch_size /= 2 75 | estimated_memory = self.opt.memory_estimator.estimate( 76 | dag, clct.para_dict) 77 | self.opt.memory_estimator.batch_size *= 2 # restore 78 | 79 | SingleLogger().info("Estimated time and memory after applying gradient accumulation: {:.2f}ms, {:.2f}GB".format( 80 | estimated_time, estimated_memory 81 | )) 82 | return estimated_time, estimated_memory 83 | 84 | def _get_estimated_time_and_memory_of_recomputation(self, dag, clct): 85 | dag_copy = deepcopy(dag) 86 | prev_nodes = deepcopy(self.opt.memory_estimator.schedule.operators) 87 | get_recomputation_edited_graph( 88 | dag_copy, self.opt.memory_estimator.schedule, "speed") 89 | estimated_time = get_execution_time(dag_copy, clct) 90 | 91 | estimated_memory = self.opt.memory_estimator.estimate( 92 | dag, clct.para_dict) 93 | 94 | # dirty implementation ... 95 | for op, prev_op in zip(self.opt.memory_estimator.schedule.operators, prev_nodes): 96 | op.requires_grad = prev_op.requires_grad 97 | 98 | SingleLogger().info("Estimated time and memory after applying recomputation: {:.2f}ms, {:.2f}GB".format( 99 | estimated_time, estimated_memory 100 | )) 101 | return estimated_time, estimated_memory 102 | 103 | 104 | class IncreasingBatchSizeCostModel(_BaseGraphPass): 105 | def __init__(self, opt): 106 | super().__init__(opt) 107 | self.token = ["increase_batch_size"] 108 | self.cnt = 0 109 | 110 | def init_search_space(self, candidates, dag, pkg): 111 | candidate_strategies = [] 112 | candidate_weights = [] 113 | for strategy in self.token: 114 | func = self.func_factory(strategy) 115 | estimated_time, estimated_memory = func(dag, self.opt.clct) 116 | candidate_strategies.append((strategy, None, None)) 117 | candidate_weights.append(1./(self.cnt + 1)) 118 | 119 | return candidate_strategies, candidate_weights 120 | 121 | def apply(self, s, dag, pkg): 122 | # TODO(yuchen): determine batch size upper bound 123 | if self.opt.memory_estimator.batch_size < 1024: 124 | self.opt.memory_estimator.batch_size *= 2 125 | self._update_dag(dag) 126 | self.cnt += 1 127 | return True, [], [] 128 | 129 | def func_factory(self, strategy): 130 | func_name = "_get_estimated_time_and_memory_of_" + strategy 131 | return getattr(self, func_name) 132 | 133 | def _update_dag(self, dag): 134 | computation_nodes = filter_out_comm_nodes(dag) 135 | update_time_by_scale(dag.subgraph(computation_nodes), 0.8) 136 | 137 | def _get_estimated_time_and_memory_of_increase_batch_size(self, dag, clct): 138 | dag_copy = deepcopy(dag) 139 | self._update_dag(dag_copy) 140 | 141 | estimated_time = get_execution_time(dag_copy, clct) 142 | 143 | self.opt.memory_estimator.batch_size *= 2 144 | estimated_memory = self.opt.memory_estimator.estimate( 145 | dag, clct.para_dict) 146 | 
self.opt.memory_estimator.batch_size /= 2 # restore 147 | 148 | SingleLogger().info("Estimated time and memory after applying increasing batch size: {:.2f}ms, {:.2f}GB".format( 149 | estimated_time, estimated_memory 150 | )) 151 | return estimated_time, estimated_memory 152 | 153 | def load_init_ckpt(self, G_prime=None): 154 | return None, None, [] 155 | 156 | def load_ckpt(self): 157 | return 158 | 159 | def checkpoint(self): 160 | return 161 | 162 | def flush(self, is_accept): 163 | return 164 | -------------------------------------------------------------------------------- /dpro/memory/estimator.py: -------------------------------------------------------------------------------- 1 | from .node import Node 2 | from .schedule import Schedule 3 | from .utils import * 4 | 5 | import networkx as nx 6 | 7 | 8 | class MemoryEstimator: 9 | 10 | def __init__(self, platform): 11 | self.platform = platform 12 | self.default_batch_size = 32 # TODO(yuchen): should read from graph 13 | self.batch_size = self.default_batch_size 14 | self._schedule = None 15 | self._cached_result = 0 16 | 17 | @property 18 | def schedule(self): 19 | return self._schedule 20 | 21 | @schedule.setter 22 | def schedule(self, val): 23 | self._schedule = val 24 | 25 | def _compose_operator_schedule(self, dag, param_dict) -> Schedule: 26 | forward_nodes = get_forward_nodes(dag.nodes) 27 | forward_graph = dag.subgraph(forward_nodes).copy() 28 | 29 | leaf_nodes = get_leaf_nodes(forward_graph) 30 | forward_graph.remove_nodes_from(leaf_nodes) 31 | leaf_nodes = remove_nodes_prefix( 32 | leaf_nodes, DEL.join([RANK0_PREFIX, FORWARD_CAT])) 33 | 34 | sorted_forward_nodes = nx.topological_sort(forward_graph) 35 | sorted_forward_nodes = remove_nodes_prefix( 36 | sorted_forward_nodes, DEL.join([RANK0_PREFIX, FORWARD_CAT])) 37 | 38 | metadata = param_dict.metainfo.tf_meta 39 | operator_schedule = Schedule(self.platform) 40 | trace_times = nx.get_node_attributes(dag, "avg") 41 | trace_times = {remove_node_prefix(k, DEL.join( 42 | [RANK0_PREFIX, FORWARD_CAT])): v for k, v in trace_times.items()} 43 | for node in leaf_nodes: 44 | op = Node.from_metadata( 45 | node, metadata, trace_times[node]) 46 | operator_schedule.add(op) 47 | 48 | for node in sorted_forward_nodes: 49 | op = Node.from_metadata(node, metadata, trace_times[node]) 50 | operator_schedule.add(op) 51 | 52 | return operator_schedule 53 | 54 | def _simulate_memory_allocation(self, operator_schedule) -> float: 55 | peak_size = 0 56 | total_activations = 0 57 | total_param_size = 0 58 | 59 | def _get_param_size(): 60 | # including optimizer states, such as momentums 61 | nonlocal total_param_size 62 | for param in operator_schedule.parameters: 63 | total_param_size += param.get_output_size() 64 | 65 | def _simulate_forward_propagation(): 66 | nonlocal total_activations, peak_size 67 | for op in operator_schedule.operators: 68 | if op.requires_grad: 69 | total_activations += op.get_output_size() 70 | 71 | temp_size = op.get_temp_size() 72 | peak_size = max(peak_size, total_activations + temp_size) 73 | 74 | def _simulate_backward_propagation(): 75 | nonlocal total_activations, peak_size 76 | restore_list = [] 77 | for i, op in reversed(list(enumerate(operator_schedule.operators))): 78 | output_grad_size = op.get_output_size() 79 | 80 | j = i 81 | while j >= 0 and not operator_schedule.operators[j].requires_grad: 82 | total_activations += operator_schedule.operators[j].get_output_size( 83 | ) 84 | operator_schedule.operators[j].requires_grad = True 85 | 
restore_list.append(operator_schedule.operators[j]) 86 | j -= 1 87 | 88 | temp_size = op.get_temp_size() 89 | peak_size = max(peak_size, total_activations + 90 | output_grad_size + temp_size) 91 | total_activations -= output_grad_size 92 | 93 | # restore 94 | for op in restore_list: 95 | op.requires_grad = False 96 | 97 | def _byte_to_GB(size): 98 | return size / (1000**3) 99 | 100 | _get_param_size() 101 | _simulate_forward_propagation() 102 | _simulate_backward_propagation() 103 | 104 | peak_size, total_param_size = _byte_to_GB( 105 | peak_size), _byte_to_GB(total_param_size) 106 | 107 | peak_size *= self.batch_size / self.default_batch_size 108 | 109 | # TODO(yuchen): Not expandable. This is for Adam. 110 | total = peak_size + total_param_size / 3 * 8 111 | self._cached_result = total 112 | return total 113 | 114 | def estimate(self, dag, param_dict): 115 | """Estimate memory usage based on computation graph 116 | 117 | Args: 118 | dag (nx.DiGraph): computation graph 119 | param_dict (ParameterDict): operator information 120 | 121 | Returns: 122 | [float]: memory usage in GB 123 | """ 124 | if not self._schedule: 125 | self._schedule = self._compose_operator_schedule(dag, param_dict) 126 | return self._simulate_memory_allocation(self._schedule) 127 | 128 | @property 129 | def cached_memory_estimation(self): 130 | return self._cached_result 131 | -------------------------------------------------------------------------------- /dpro/memory/gradient_accumulation.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | 3 | 4 | def get_gradient_accumulation_edited_graph(dag, verbose=False): 5 | _apply_gradient_accumulation(dag, verbose) 6 | return True 7 | 8 | 9 | def _apply_gradient_accumulation(dag, verbose): 10 | _update_dag(dag, verbose) 11 | 12 | 13 | def _update_dag(dag, verbose): 14 | computation_nodes = filter_out_comm_nodes(dag) 15 | update_time_by_scale(dag.subgraph(computation_nodes), 0.8) 16 | 17 | # TODO(yuchen): deal with other ranks 18 | filtered_nodes = get_forward_backward_nodes(dag.nodes) 19 | subgraph = dag.subgraph(filtered_nodes) 20 | 21 | target = filtered_nodes[0] # first node 22 | 23 | mapping = {node: node+"_ga" for node in subgraph.nodes} 24 | subgraph = nx.relabel_nodes(subgraph, mapping) 25 | 26 | insert_nodes(dag, subgraph, target) 27 | -------------------------------------------------------------------------------- /dpro/memory/node.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Node: 5 | def __init__(self, name, op, input, dtype, shape, time): 6 | self._name = name 7 | self._op = op 8 | self._input = input 9 | self._dtype = dtype 10 | self._shape = shape 11 | self._requires_grad = True 12 | self._inplace = False 13 | self._time = time 14 | 15 | @classmethod 16 | def from_metadata(cls, name, metadata, time): 17 | """create node from metadata 18 | 19 | Args: 20 | name (str): node name 21 | metadata (dict): from metadata.json 22 | time (float): execution time based on trace 23 | 24 | Returns: 25 | [Node]: created node 26 | """ 27 | def _get_op(node_def): 28 | return node_def.get("op").lower() 29 | 30 | def _get_input(node_def): 31 | inputs = node_def.get("input") 32 | if not inputs: 33 | return None 34 | 35 | names = [] 36 | for input in inputs: 37 | full_name = input.get("name") 38 | name = full_name.rsplit(':')[0] 39 | names.append(name) 40 | return names 41 | 42 | def _get_dtype(node_def): 43 | output = node_def.get("output") 
44 | if not output: 45 | return None 46 | dtype = output[0].get("dtype") 47 | if not dtype or dtype == "string": 48 | return None 49 | 50 | if dtype.endswith("_ref"): 51 | dtype = dtype[:-len("_ref")] 52 | 53 | return np.dtype(dtype) 54 | 55 | def _get_shape(node_def): 56 | output = node_def.get("output") 57 | if not output: 58 | return None 59 | return tuple(output[0].get("shape")) 60 | 61 | if name not in metadata: 62 | return None 63 | 64 | node_def = metadata[name] 65 | 66 | return cls(name, 67 | _get_op(node_def), 68 | _get_input(node_def), 69 | _get_dtype(node_def), 70 | _get_shape(node_def), 71 | time) 72 | 73 | def is_valid(self): 74 | """check the node's validity 75 | 76 | Returns: 77 | [bool]: validity 78 | """ 79 | def _is_valid_op(op): 80 | if op in ["NoOp"]: 81 | return False 82 | return True 83 | 84 | def _is_not_none(value): 85 | if value is None: 86 | return False 87 | return True 88 | 89 | def _is_valid_shape(shape): 90 | if not isinstance(shape, tuple): 91 | return False 92 | if not shape or shape[0] == -1: 93 | return False 94 | return True 95 | 96 | return all([ 97 | _is_valid_op(self.op), 98 | _is_not_none(self.dtype), 99 | _is_not_none(self.input), 100 | _is_valid_shape(self.shape) 101 | ]) 102 | 103 | def is_parameter(self): 104 | """Whether this node is parameter node 105 | 106 | Returns: 107 | [bool]: is parameter node 108 | """ 109 | if self._op == "variablev2": 110 | return True 111 | return False 112 | 113 | def get_num_ele(self): 114 | """get number of elements 115 | 116 | Returns: 117 | [int]: number of elements 118 | """ 119 | return np.prod(self.shape) 120 | 121 | def get_output_size(self): 122 | """get output size 123 | 124 | Returns: 125 | [float]: size in Byte 126 | """ 127 | return np.prod(self.shape) * self.dtype.itemsize 128 | 129 | def get_temp_size(self): 130 | """get temporary buffer size 131 | 132 | useful for cudnn workspace size 133 | 134 | Returns: 135 | [foat]: size in Byte 136 | """ 137 | return 0 138 | 139 | @property 140 | def name(self): 141 | """get name 142 | 143 | Returns: 144 | [str]: node name 145 | """ 146 | return self._name 147 | 148 | @property 149 | def op(self): 150 | """get operator type 151 | 152 | Returns: 153 | [str]: operator type 154 | """ 155 | return self._op 156 | 157 | @property 158 | def input(self): 159 | """get input list 160 | 161 | Returns: 162 | [list]: input node name list 163 | """ 164 | return self._input 165 | 166 | @property 167 | def dtype(self): 168 | """get data type 169 | 170 | Returns: 171 | [numpy.dtype]: data type 172 | """ 173 | return self._dtype 174 | 175 | @property 176 | def shape(self): 177 | """get output shape 178 | 179 | Returns: 180 | [tuple]: output shape 181 | """ 182 | return self._shape 183 | 184 | @property 185 | def requires_grad(self): 186 | """get requires_grad 187 | 188 | Returns: 189 | [bool]: requires_grad 190 | """ 191 | return self._requires_grad 192 | 193 | @requires_grad.setter 194 | def requires_grad(self, val): 195 | self._requires_grad = val 196 | 197 | @property 198 | def inplace(self): 199 | """get inplace status 200 | 201 | Returns: 202 | [bool]: inplace 203 | """ 204 | return self._inplace 205 | 206 | @inplace.setter 207 | def inplace(self, val): 208 | self._inplace = val 209 | 210 | @property 211 | def time(self): 212 | return self._time 213 | 214 | def __repr__(self): 215 | return "Name: %s, op: %s, input: [%s], dtype: %s, shape: %s" % ( 216 | self.name, self.op, ", ".join(self.input), str( 217 | self.dtype), str(self.shape) 218 | ) 219 | 
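# Illustrative only (the metadata entry below is hypothetical): building a
# Node from a metadata.json-style dict and querying its memory footprint:
#
#   meta = {"fc/MatMul": {"op": "MatMul",
#                         "input": [{"name": "x:0"}, {"name": "fc/kernel:0"}],
#                         "output": [{"dtype": "float32", "shape": [32, 1024]}]}}
#   node = Node.from_metadata("fc/MatMul", meta, time=0.5)
#   node.is_valid()         # True
#   node.get_output_size()  # 32 * 1024 * 4 = 131072 bytes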
-------------------------------------------------------------------------------- /dpro/memory/recomputation.py: -------------------------------------------------------------------------------- 1 | import re 2 | import networkx as nx 3 | from itertools import islice 4 | 5 | from .utils import * 6 | from ..logger_utils import SingleLogger 7 | 8 | class CheckpointsSelector: 9 | @classmethod 10 | def get_checkpoint_selector(cls, mode): 11 | if mode == "speed": 12 | return SpeedCheckpointsSelector() 13 | elif mode == "memory": 14 | return MemoryCheckpointsSelector() 15 | elif mode == "topk": 16 | return TopkCheckpointsSelector() 17 | else: 18 | raise ValueError("%s is not found" % mode) 19 | 20 | @staticmethod 21 | def select_checkpoints(schedule): 22 | raise NotImplementedError 23 | 24 | 25 | class SpeedCheckpointsSelector(CheckpointsSelector): 26 | @staticmethod 27 | def select_checkpoints(schedule): 28 | return list(filter(lambda n: len(re.findall("conv2d|conv|matmul", n.op)) 29 | > 0, schedule.operators)) 30 | 31 | 32 | class MemoryCheckpointsSelector(CheckpointsSelector): 33 | @staticmethod 34 | def select_checkpoints(schedule): 35 | # TODO(yuchen): https://arxiv.org/pdf/1604.06174.pdf 36 | raise NotImplementedError 37 | 38 | 39 | class TopkCheckpointsSelector(CheckpointsSelector): 40 | k = 0.1 41 | 42 | @staticmethod 43 | def select_checkpoints(schedule): 44 | num_checkpoints = int( 45 | TopkCheckpointsSelector.k * len(schedule.operators)) 46 | sorted_ops_indices = [i for i, _ in sorted( 47 | enumerate(schedule.operators), key=lambda n:n[1].time)] 48 | topk_indices = sorted(sorted_ops_indices[-num_checkpoints:]) 49 | expensive_ops = [schedule.operators[i] for i in topk_indices] 50 | return expensive_ops 51 | 52 | 53 | def get_recomputation_edited_graph(dag, schedule, mode, verbose=False): 54 | selector = CheckpointsSelector.get_checkpoint_selector(mode) 55 | checkpoints = selector.select_checkpoints(schedule) 56 | if not checkpoints: 57 | SingleLogger().warn("No checkpoints found! Recomputation Aborted!") 58 | return False 59 | 60 | if verbose: 61 | names = [node.name for node in checkpoints] 62 | SingleLogger().info("select %d checkpoints: %s" % 63 | (len(names), ', '.join(names))) 64 | 65 | _apply_recomputation(dag, schedule, checkpoints, verbose) 66 | 67 | return True 68 | 69 | 70 | def _update_schedule(schedule, checkpoints): 71 | name_to_checkpoints = {node.name: node for node in checkpoints} 72 | for op in schedule.operators: 73 | if op.name in name_to_checkpoints: 74 | op.requires_grad = True 75 | else: 76 | op.requires_grad = False 77 | 78 | 79 | def _apply_recomputation(dag, schedule, checkpoints, verbose): 80 | _update_schedule(schedule, checkpoints) 81 | _update_dag(dag, checkpoints, verbose) 82 | 83 | 84 | def _compose_subgraph_between_two_nodes(dag, source, target): 85 | if not nx.has_path(dag, source, target): 86 | # it is possible. e.g. 
matmuls computing K, Q and V run in parallel with no path between them 87 | return None 88 | 89 | paths_between_two_nodes = list( 90 | islice(nx.shortest_simple_paths(dag, source, target), 10)) 91 | nodes_between_set = { 92 | node for path in paths_between_two_nodes for node in path} 93 | 94 | subgraph = dag.subgraph(nodes_between_set) 95 | 96 | # add a suffix to avoid duplicate node names in the graph 97 | mapping = {node: node+"_sg" for node in subgraph.nodes} 98 | return nx.relabel_nodes(subgraph, mapping) 99 | 100 | 101 | def _get_last_forward_node(dag): 102 | forward_nodes = get_forward_nodes(dag.nodes) 103 | forward_graph = dag.subgraph(forward_nodes).copy() 104 | leaf_nodes = get_leaf_nodes(forward_graph) 105 | forward_graph.remove_nodes_from(leaf_nodes) 106 | sorted_forward_nodes = list(nx.topological_sort(forward_graph)) 107 | sorted_forward_nodes = filter_out_node_by_name( 108 | sorted_forward_nodes, "read") 109 | return sorted_forward_nodes[-1] 110 | 111 | 112 | def _get_target_backward_node(dag, target): 113 | target_bwp_op_name = target.replace("->FW.", "->BW.gradients/") 114 | target_bwp_op_name += "_grad/" + target_bwp_op_name.rsplit('/')[-1] 115 | if target_bwp_op_name in dag.nodes: 116 | return target_bwp_op_name 117 | return None 118 | 119 | 120 | def _update_dag(dag, checkpoints, verbose): 121 | filtered_nodes = filter_out_comm_nodes(dag.nodes) 122 | # TODO(yuchen): deal with other ranks 123 | filtered_nodes = get_rank0_nodes(filtered_nodes) 124 | names_to_nodes = {get_node_name(node): node for node in filtered_nodes} 125 | checkpoints_to_nodes = {node.name: names_to_nodes[node.name] 126 | for node in checkpoints if node.name in names_to_nodes} 127 | 128 | target = _get_last_forward_node(dag) # last node in forward 129 | if verbose: 130 | SingleLogger().info("Get the last forward node %s." % target) 131 | 132 | for checkpoint in checkpoints[::-1]: 133 | source = checkpoints_to_nodes[checkpoint.name] 134 | if verbose: 135 | SingleLogger().info("source %s, target %s" % (source, target)) 136 | subgraph = _compose_subgraph_between_two_nodes(dag, source, target) 137 | 138 | if subgraph: 139 | if verbose: 140 | SingleLogger().info("ops to be copied: %s" % (', '.join(subgraph.nodes))) 141 | 142 | target_bwp_op = _get_target_backward_node(dag, target) 143 | if verbose: 144 | SingleLogger().info("target backward op: %s" % (str(target_bwp_op))) 145 | 146 | # rewire 147 | insert_nodes(dag, subgraph, target_bwp_op) 148 | 149 | target = source 150 | -------------------------------------------------------------------------------- /dpro/memory/schedule.py: -------------------------------------------------------------------------------- 1 | from .node import Node 2 | 3 | 4 | class Schedule: 5 | def __init__(self, platform): 6 | self._parameters = [] 7 | self._operators = [] 8 | self._node_collection = {} 9 | self.lists = self._get_platform_memory_lists(platform) 10 | 11 | def add(self, node): 12 | """add a node into the schedule and determine whether it runs in place 13 | 14 | Args: 15 | node ([Node]): operator 16 | 17 | Returns: 18 | [bool]: True if the node was added, False otherwise 19 | """ 20 | if not isinstance(node, Node): 21 | return False 22 | 23 | self._node_collection[node.name] = node 24 | 25 | if node.is_parameter(): 26 | self._parameters.append(node) 27 | elif node.is_valid() and self._is_in_whitelist(node): 28 | self._set_inplace(node) 29 | self._operators.append(node) 30 | else: 31 | return False 32 | 33 | return True 34 | 35 | @property 36 | def parameters(self): 37 | return self._parameters 38 | 39 | @property 40 | def operators(self): 41 | return self._operators 42 | 43 | 
def _is_in_whitelist(self, node): 44 | if node.op not in self.lists.WHITE_LIST: 45 | return False 46 | return True 47 | 48 | def _should_inplace(self, input_node, output_node): 49 | if output_node.op not in self.lists.CWISE_LIST: 50 | return False 51 | 52 | if input_node.inplace: 53 | return False 54 | 55 | if input_node.dtype != output_node.dtype: 56 | return False 57 | 58 | if input_node.get_num_ele() != output_node.get_num_ele(): 59 | return False 60 | 61 | return True 62 | 63 | def _set_inplace(self, node): 64 | input_names = node.input 65 | for input_name in input_names: 66 | input_node = self._node_collection.get(input_name) 67 | if input_node and self._should_inplace(input_node, node): 68 | node.inplace = True 69 | break 70 | 71 | def _get_platform_memory_lists(self, platform): 72 | try: 73 | if platform.lower() == "tensorflow": 74 | from ..ml_platform.tensorflow import memory_lists 75 | elif platform.lower() == "mxnet": 76 | from ..ml_platform.mxnet import memory_lists 77 | else: 78 | raise NotImplementedError() 79 | self.lists = memory_lists 80 | except: 81 | raise NotImplementedError( 82 | "Memory Estimator Does Not Support %s" % platform) 83 | 84 | return self.lists 85 | -------------------------------------------------------------------------------- /dpro/memory/utils.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import networkx as nx 3 | 4 | DEL = "->" 5 | RANK0_PREFIX = "host0.rank0" 6 | FORWARD_CAT = "FW." 7 | BACKWARD_CAT = "BW." 8 | 9 | 10 | def remove_prefix(text, prefix): 11 | if text.startswith(prefix): 12 | return text[len(prefix):] 13 | return text 14 | 15 | 16 | def remove_nodes_prefix(nodes, prefix): 17 | func = partial(remove_prefix, prefix=prefix) 18 | return list(map(func, nodes)) 19 | 20 | 21 | def remove_node_prefix(node, prefix): 22 | func = partial(remove_prefix, prefix=prefix) 23 | return func(node) 24 | 25 | 26 | def filter_out_comm_nodes(nodes): 27 | def _is_not_comm(name): 28 | if name.startswith("server") or name.startswith("worker"): 29 | return False 30 | return True 31 | 32 | return list(filter(_is_not_comm, nodes)) 33 | 34 | 35 | def get_node_name(name): 36 | return name.rsplit(".")[-1] 37 | 38 | 39 | def get_rank0_nodes(nodes): 40 | # TODO(yuchen): Not expandable 41 | def _is_rank0(name): 42 | if name.startswith(RANK0_PREFIX): 43 | return True 44 | return False 45 | 46 | return list(filter(_is_rank0, nodes)) 47 | 48 | 49 | def get_leaf_nodes(dag): 50 | return [node for node in dag.nodes if dag.out_degree(node) == 1 51 | and dag.in_degree(node) == 0] 52 | 53 | 54 | def filter_out_node_by_name(nodes, name): 55 | return list(filter(lambda node: name not in node, nodes)) 56 | 57 | 58 | def get_forward_nodes(nodes): 59 | # TODO(yuchen): Not expandable 60 | def _is_forward(name): 61 | if name.startswith(DEL.join([RANK0_PREFIX, FORWARD_CAT]) + "bert"): 62 | return True 63 | return False 64 | 65 | return list(filter(_is_forward, nodes)) 66 | 67 | 68 | def get_forward_backward_nodes(nodes): 69 | # TODO(yuchen): Not expandable 70 | def _is_forward(name): 71 | if name.startswith(DEL.join([RANK0_PREFIX, FORWARD_CAT]) + "bert") or \ 72 | name.startswith(DEL.join([RANK0_PREFIX, BACKWARD_CAT]) + "bert"): 73 | return True 74 | return False 75 | 76 | return list(filter(_is_forward, nodes)) 77 | 78 | 79 | def get_input_nodes(dag): 80 | return [u for u, deg in dag.in_degree() if not deg] 81 | 82 | 83 | def get_output_nodes(dag): 84 | return [u for u, deg in dag.out_degree() if not deg]
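A quick illustration of the degree-based graph helpers above on a hypothetical three-node chain:

    import networkx as nx

    g = nx.DiGraph([("a", "b"), ("b", "c")])
    print(get_input_nodes(g))    # ['a']: in-degree 0
    print(get_output_nodes(g))   # ['c']: out-degree 0
    print(get_leaf_nodes(g))     # ['a']: in-degree 0 and out-degree exactly 1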
85 | 86 | 87 | def insert_nodes(dag, subgraph, target): 88 | if not target: 89 | return 90 | # copy subgraph 91 | dag.add_nodes_from(subgraph.nodes.data()) 92 | dag.add_edges_from(subgraph.edges.data()) 93 | 94 | # remove previous nodes 95 | prev_nodes = list(dag.predecessors(target)) 96 | for prev_node in prev_nodes: 97 | dag.remove_edge(prev_node, target) 98 | 99 | # connect subgraph output to target 100 | outputs = get_output_nodes(subgraph) 101 | dag.add_edge(outputs[0], target) 102 | 103 | 104 | def update_time_by_scale(dag, scale): 105 | trace_times = nx.get_node_attributes(dag, "avg") 106 | for k, v in trace_times.items(): 107 | trace_times[k] = v * scale 108 | nx.set_node_attributes(dag, trace_times, "avg") 109 | -------------------------------------------------------------------------------- /dpro/mg_generate_dataset.py: -------------------------------------------------------------------------------- 1 | ''' Generate Dataset to train the Cost Model 2 | ''' 3 | from tqdm import tqdm 4 | import os, sys 5 | 6 | from .arg_utils import SingleArg 7 | from .logger_utils import SingleLogger 8 | 9 | args = SingleArg().args 10 | logger = SingleLogger(args.path.split(',')[0], 11 | args.option, args.logging_level, 12 | is_clean=args.clean, 13 | show_progress=args.progress) 14 | logger.info(args) 15 | 16 | if args.option == "optimize": 17 | if args.sub_option == "train_amp": 18 | from .cost_model._mixed_precision.amp_pred import AMPPredictor, train_amp_model 19 | train_amp_model() 20 | exit(0) 21 | elif args.sub_option == "train_gpu": 22 | from .cost_model._gpu_predict.gpu_pred import train_gpu_model 23 | train_gpu_model() 24 | exit(0) -------------------------------------------------------------------------------- /dpro/ml_platform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/ml_platform/__init__.py -------------------------------------------------------------------------------- /dpro/ml_platform/mxnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/ml_platform/mxnet/__init__.py -------------------------------------------------------------------------------- /dpro/ml_platform/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from . import memory_lists -------------------------------------------------------------------------------- /dpro/ml_platform/tensorflow/amp_lists.py: -------------------------------------------------------------------------------- 1 | 2 | whitelist = [ 3 | #if CUDA_VERSION >= 9010 // Fp16 BatchMatMul is slow before CUDA 9.1. 4 | "BatchMatMul", 5 | "BlockLSTM", "BlockLSTMGrad", "Conv2D", "Conv2DBackpropFilter", 6 | "Conv2DBackpropInput", 7 | 8 | # # TODO(benbarsdell): Enable these when Tensor Core kernels are 9 | # # available for 3D convolutions. 10 | # "Conv3D", 11 | # "Conv3DBackpropFilter", 12 | # "Conv3DBackpropFilterV2", 13 | # "Conv3DBackpropInput", 14 | # "Conv3DBackpropInputV2", 15 | # "CudnnRNN", "CudnnRNNBackprop", "CudnnRNNBackpropV2", 16 | # "CudnnRNNBackpropV3", "CudnnRNNV2", "CudnnRNNV3", "GRUBlockCell", 17 | # "GRUBlockCellGrad", "LSTMBlockCell", "LSTMBlockCellGrad", 18 | 19 | 20 | # # TODO(benbarsdell): Enable these when fast and safe fp16 kernels are 21 | # available for depthwise convolutions. 
22 | # "DepthwiseConv2dNative", 23 | # "DepthwiseConv2dNativeBackpropFilter", 24 | # "DepthwiseConv2dNativeBackpropInput", 25 | 26 | "MatMul", 27 | ] 28 | 29 | greylist = [ 30 | "Add", 31 | "AddN", 32 | "AddV2", 33 | "AvgPool", 34 | "AvgPool3D", 35 | "AvgPool3DGrad", 36 | "AvgPoolGrad", 37 | "BiasAdd", 38 | "BiasAddGrad", 39 | "BiasAddV1", 40 | "Elu", 41 | "EluGrad", 42 | "Erf", 43 | "Erfc", 44 | "FloorDiv", 45 | "FusedBatchNormV2", 46 | "FusedBatchNormGradV2", 47 | "FusedBatchNormV3", 48 | "FusedBatchNormGradV3", 49 | "Inv", 50 | "LeakyRelu", 51 | "LeakyReluGrad", 52 | "Mul", 53 | "Prod", 54 | "RealDiv", 55 | "Reciprocal", 56 | "Sigmoid", 57 | "SigmoidGrad", 58 | "Softplus", 59 | "SoftplusGrad", 60 | "Sqrt", 61 | "Sub", 62 | "Tanh", 63 | "TanhGrad", 64 | ] 65 | 66 | blacklist = [ 67 | "Exp", 68 | "Expm1", 69 | "L2Loss", 70 | "Log", 71 | "Log1p", 72 | "LogSoftmax", 73 | "Mean", 74 | "Pow", 75 | "SaveV2", 76 | "Softmax", 77 | "SoftmaxCrossEntropyWithLogits", 78 | "SparseSoftmaxCrossEntropyWithLogits", 79 | "Sum", 80 | ] 81 | 82 | clearlist = [ 83 | "Abs", 84 | "ArgMax", 85 | "ArgMin", 86 | "BatchToSpace", 87 | "BatchToSpaceND", 88 | "BroadcastTo", 89 | "Ceil", 90 | "CheckNumerics", 91 | "ClipByValue", 92 | "Concat", 93 | "ConcatV2", 94 | "DepthToSpace", 95 | "DynamicPartition", 96 | "DynamicStitch", 97 | "Enter", 98 | "EnsureShape", 99 | "Equal", 100 | "Exit", 101 | "ExpandDims", 102 | "Fill", 103 | "Floor", 104 | "Gather", 105 | "GatherNd", 106 | "GatherV2", 107 | "Greater", 108 | "GreaterEqual", 109 | "Identity", 110 | "IdentityN", 111 | "IsFinite", 112 | "IsInf", 113 | "IsNan", 114 | "Less", 115 | "LessEqual", 116 | "Max", 117 | "MaxPool", 118 | "MaxPool3D", 119 | "MaxPool3DGrad", 120 | "MaxPool3DGradGrad", 121 | "MaxPoolGrad", 122 | "MaxPoolGradGrad", 123 | "MaxPoolGradGradV2", 124 | "MaxPoolGradV2", 125 | "MaxPoolV2", 126 | "Maximum", 127 | "Merge", 128 | "Min", 129 | "Minimum", 130 | "MirrorPad", 131 | "MirrorPadGrad", 132 | "Neg", 133 | "NextIteration", 134 | "NotEqual", 135 | "OneHot", 136 | "OnesLike", 137 | "Pack", 138 | "Pad", 139 | "PadV2", 140 | "PreventGradient", 141 | "Rank", 142 | "Relu", 143 | "Relu6", 144 | "Relu6Grad", 145 | "ReluGrad", 146 | "Reshape", 147 | "ResizeNearestNeighbor", 148 | "ResizeNearestNeighborGrad", 149 | "Reverse", 150 | "ReverseSequence", 151 | "ReverseV2", 152 | "Round", 153 | "Select", 154 | "Shape", 155 | "ShapeN", 156 | "Sign", 157 | "Size", 158 | "Slice", 159 | "Snapshot", 160 | "SpaceToBatch", 161 | "SpaceToBatchND", 162 | "SpaceToDepth", 163 | "Split", 164 | "SplitV", 165 | "Squeeze", 166 | "StackPopV2", 167 | "StackPushV2", 168 | "StopGradient", 169 | "StridedSlice", 170 | "StridedSliceGrad", 171 | "Switch", 172 | "TensorArrayConcatV3", 173 | "TensorArrayGatherV3", 174 | "TensorArrayReadV3", 175 | "TensorArrayScatterV3", 176 | "TensorArraySplitV3", 177 | "TensorArrayWriteV3", 178 | "Tile", 179 | "TopK", 180 | "TopKV2", 181 | "Transpose", 182 | "Where", 183 | "ZerosLike", 184 | ] 185 | -------------------------------------------------------------------------------- /dpro/ml_platform/tensorflow/memory_lists.py: -------------------------------------------------------------------------------- 1 | 2 | # TODO(yuchen): support CNN 3 | # it only works for BERT now 4 | WHITE_LIST = [ 5 | 'mul', 6 | 'addv2', 7 | 'batchmatmulv2', 8 | 'square', 9 | 'l2loss', 10 | 'matmul', 11 | 'sum', 12 | 'tile', 13 | 'sqrt', 14 | 'transpose', 15 | 'neg', 16 | 'randomuniform', 17 | 'cast', 18 | 'greaterequal', 19 | 'squareddifference', 20 | 'softmax', 21 | 'pow', 22 | 
'gatherv2', 23 | 'onehot', 24 | 'unsortedsegmentsum', 25 | 'logsoftmax', 26 | 'pad', 27 | 'mean', 28 | 'sub', 29 | 'realdiv', 30 | 'stridedslice', 31 | ] 32 | 33 | # coefficient-wise operator 34 | # see https://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html 35 | CWISE_LIST = [ 36 | 'mul', 37 | 'addv2', 38 | 'square', 39 | 'sqrt', 40 | 'neg', 41 | 'squareddifference', 42 | 'pow', 43 | 'sub', 44 | 'realdiv' 45 | ] 46 | -------------------------------------------------------------------------------- /dpro/ml_platform/tensorflow/util.py: -------------------------------------------------------------------------------- 1 | from google.protobuf.json_format import MessageToJson 2 | from google.protobuf.text_format import Parse 3 | import tensorflow as tf 4 | import sys, os 5 | import json 6 | import networkx as nx 7 | 8 | def wrap_read_graphdef(graphdef_path): 9 | try: 10 | GraphDef = tf.GraphDef 11 | except: 12 | GraphDef = tf.compat.v1.GraphDef 13 | if graphdef_path.endswith("pbtxt"): 14 | with open(graphdef_path, "r") as f: 15 | pb = f.read() 16 | graph_def = Parse(pb, GraphDef()) 17 | json_string = MessageToJson(graph_def) 18 | graph_def = json.loads(json_string) 19 | else: 20 | with open(graphdef_path, "r") as f: 21 | graph_def = json.load(f) 22 | graph = nx.DiGraph() 23 | for node in graph_def["node"]: 24 | if "input" in node: 25 | for input_tensor_name in node["input"]: 26 | input_node_name = input_tensor_name.split(":")[0] 27 | graph.add_edge(input_node_name, node["name"]) 28 | gml_path = os.path.join(os.path.dirname(graphdef_path), "graphdef_dag.gml") 29 | nx.write_gml(graph, gml_path) 30 | print("Create gml file at {}".format(gml_path)) 31 | 32 | if __name__ == "__main__": 33 | wrap_read_graphdef(sys.argv[1]) -------------------------------------------------------------------------------- /dpro/nvprof/analyze.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import os 3 | import json 4 | import argparse 5 | import networkx as nx 6 | import sys 7 | 8 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 9 | 10 | from ..logger_utils import get_logger 11 | 12 | parser = argparse.ArgumentParser(description="Trace Analysis", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 14 | parser.add_argument("--option", type=str, default="gpu_trace", 15 | choices=["gpu_trace"], 16 | help="The type of analysis to process. 
including:\n" + 17 | "* statistic: show the statistic results\n" + 18 | "* graph: show the dependency graph\n") 19 | parser.add_argument("--path", type=str, required=True, help="The paths of traces you want to analyze, support multiple paths seperated with comma.") 20 | parser.add_argument("--logging_level", type=int, default="20", help="Logging level") 21 | parser.add_argument("--clean", action="store_true", help="Flush the log file") 22 | parser.add_argument("--progress", action="store_true", help="Show the progress bar if it is set, disable the std output") 23 | args = parser.parse_args() 24 | 25 | logger = get_logger(args) 26 | logger.info(args) 27 | 28 | def printIter(_iter, prefix=''): 29 | for _cmp in _iter: 30 | logger.info(prefix + _cmp) 31 | 32 | def handle(path, platform): 33 | with open(path, 'r') as fp: 34 | s = fp.readlines() 35 | i = 0 36 | sta = {} 37 | while i < len(s): 38 | if "Device Context Stream" in s[i]: 39 | i += 1 40 | break 41 | i += 1 42 | while i < len(s): 43 | if len(s[i]) < 162: 44 | break 45 | try: 46 | stream_id = int(s[i][162:168]) 47 | except: 48 | logger.info(len(s[i]), s[i-1]) 49 | raise 50 | #! delete the index of each kernel, reduce the duplication number of each kernal 51 | #! only focus on the name of each kernal 52 | name = s[i][170:].split(" [")[0].split("<")[0] 53 | if stream_id not in sta: 54 | sta[stream_id] = {"cmp": set(), "mem": set()} 55 | if "memcpy" in name or "memset" in name: 56 | sta[stream_id]["mem"].add(name) 57 | else: 58 | sta[stream_id]["cmp"].add(name) 59 | i += 1 60 | for k, v in sta.items(): 61 | logger.info("Stream ID: %-2d => cmp: %-10d : mem %-10d %s" % (k, len(v["cmp"]), len(v["mem"]), '' if len(v["mem"]) <= 2 else str(v["mem"]))) 62 | #! Used for debug 63 | sta1 = sta2 = None 64 | if platform == 'pytorch': 65 | sta1 = sta[7] 66 | sta2 = sta[21] 67 | elif platform == "tensorflow": 68 | sta1 = sta[182] 69 | sta2 = sta[214] 70 | if sta1 is not None and sta2 is not None: 71 | logger.info("platform: %s" % (platform)) 72 | logger.info(" intersection: ") 73 | printIter(sta1["cmp"].intersection(sta2["cmp"]), prefix="\t ") 74 | logger.info(" minor set: ") 75 | printIter(sta2["cmp"], prefix="\t ") 76 | logger.info(" major set: ") 77 | printIter(sta1["cmp"], prefix="\t ") 78 | 79 | if __name__ == "__main__": 80 | if args.option == "gpu_trace": 81 | cur_dir = os.path.abspath(args.path) 82 | root, dirs, files = list(os.walk(cur_dir, topdown=True))[0] 83 | for file in files: 84 | #! file name must follow the following format ___.txt 85 | #! e.g., 20191217_04_pytorch_mnist.txt, and must in lowercase. 86 | if "txt" in file and "log" not in file: 87 | #! Get the platform name, e.g. 
mxnet, tensorflow or pytorch 88 | platform = file.split("_")[2] 89 | cur_path = os.path.join(root, file) 90 | logger.info(cur_path) 91 | handle(cur_path, platform) 92 | else: 93 | raise NotImplementedError() 94 | 95 | -------------------------------------------------------------------------------- /dpro/optimizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joapolarbear/dpro/ac5e4ed57ad9c5f26d49599372c31ea36e882e99/dpro/optimizer/__init__.py -------------------------------------------------------------------------------- /dpro/optimizer/mcts.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import math 3 | from enum import Enum 4 | 5 | from .base import Optimizer, GraphState, args_ 6 | from ..logger_utils import SingleLogger 7 | 8 | MAX_LOOP = 1000 9 | MAX_TREE_DEPTH = 1000 10 | UCB_GAMMA = args_.ucb_gamma 11 | 12 | class GraphExpand(Enum): 13 | NOT = 0 14 | PARTIAL = 1 15 | FULLY = 2 16 | 17 | class GraphState: 18 | def __init__(self, depth): 19 | self.visit_cnt = 1 20 | self.quality = -1 21 | 22 | self.space = None 23 | self.childs = None 24 | self.parent = None 25 | self.depth = depth 26 | 27 | # Whether the actions have been traversed: not, partially or fully 28 | self.state = GraphExpand.NOT 29 | 30 | self.strategy = None 31 | self.iter_time = None 32 | 33 | def update_expand_state(self): 34 | if self.childs is None: 35 | self.state = GraphExpand.NOT 36 | return 37 | assert not self.space is None 38 | if len(self.childs) == len(self.space): 39 | self.state = GraphExpand.FULLY 40 | else: 41 | self.state = GraphExpand.PARTIAL 42 | 43 | class MCTSOptimizer(Optimizer): 44 | ''' Monte Carlo Tree Search ''' 45 | 46 | def __init__(self, *args, **kwargs): 47 | super(MCTSOptimizer, self).__init__(*args, **kwargs) 48 | self.loop_cnt = 0 49 | self.GS_root = None 50 | self.opt_GS = None 51 | self.ucb_type = args_.ucb_type 52 | if self.ucb_type != "MAX" and self.ucb_type != "AVG": 53 | raise ValueError( 54 | "UCB type should be MAX or AVG, but {} is given.".format(self.ucb_type)) 55 | self.no_mutation = args_.no_mutation 56 | 57 | def search(self): 58 | ### Initialize the root graph state 59 | self.GS_root = GraphState(depth=0) 60 | self.GS_root.strategy = [] 61 | 62 | while self.check_loop_time() and self.check_loop_num(): 63 | GS = self.tree_policy(self.GS_root) 64 | reward = self.default_policy(GS) 65 | SingleLogger().info("Speedup to the origin %6.4f %%" % (100 * reward)) 66 | self.backpropagation(GS, reward) 67 | if args_.ucb_visual: 68 | self.visualize_tree() 69 | self.show_opt_strategies() 70 | return 71 | 72 | def visualize_tree(self): 73 | def iter_print(GS, cnt): 74 | ### `cnt` is used to decide how many parent branches to print for current nodes 75 | LENOFNODE = 11 76 | LENOFARROW = 5 77 | node_string = " %5.4f %% " % ( 78 | GS.quality * 100) if GS.quality >= 0 else " -%5.4f %% " % (-GS.quality * 100) 79 | sys.stdout.write(node_string) 80 | assert len(node_string) == LENOFNODE 81 | if GS.childs is None: 82 | return 83 | for idx, child in enumerate(GS.childs): 84 | if idx > 0: 85 | sys.stdout.write("\n{}".format(" "*(LENOFNODE + LENOFARROW//2))) 86 | sys.stdout.write("{}".format(" "*((LENOFNODE + LENOFARROW) * (GS.depth - cnt)))) 87 | sys.stdout.write("{}".format(("|" + " " * (LENOFNODE + LENOFARROW - 1))*(cnt))) 88 | sys.stdout.write("{}".format("|" if idx < (len(GS.childs) - 1) else "\\")) 89 | sys.stdout.write("{}".format("-"*(LENOFARROW - 
LENOFARROW//2 - 1))) 90 | else: 91 | sys.stdout.write("{}".format('-'*LENOFARROW)) 92 | if idx < (len(GS.childs) - 1): 93 | next_cnt = cnt + 1 94 | else: 95 | next_cnt = cnt 96 | iter_print(child, next_cnt) 97 | 98 | iter_print(self.GS_root, 0) 99 | sys.stdout.write("\n") 100 | 101 | def show_opt_strategies(self): 102 | SingleLogger().info("Best speedup: %d-th layer, speed up to the origin: %6.4f %%" % 103 | (len(self.opt_GS.strategy), 100 * self.opt_GS.quality)) 104 | 105 | def check_loop_num(self): 106 | self.loop_cnt += 1 107 | if self.loop_cnt > MAX_LOOP: 108 | return False # End 109 | else: 110 | return True # continue 111 | 112 | def check_loop_time(self): 113 | return True # continue 114 | 115 | def tree_policy(self, GS): 116 | while self.fully_expanded(GS): 117 | GS = self.best_UCB(GS) 118 | return self.expansion(GS) 119 | 120 | def default_policy(self, GS): 121 | if not self.no_mutation: 122 | while not self.terminal(GS): 123 | action = self.pick_strategy(GS.space)[0] 124 | GS_c = GraphState(depth=(GS.depth+1)) 125 | GS_c.strategy = GS.strategy.copy() 126 | GS_c.strategy.append(action) 127 | GS = GS_c 128 | ### Evaluate the final graph 129 | if GS.iter_time is None: 130 | self.check_search_space(GS) 131 | cost = GS.iter_time 132 | SingleLogger().debug("Evaluate the strategy %s" % (str(GS.strategy))) 133 | return (self.base_cost - cost)/self.base_cost 134 | 135 | def backpropagation(self, GS, reward): 136 | if self.ucb_type == "MAX": 137 | GS.quality = max(reward, GS.quality) 138 | elif self.ucb_type == "AVG": 139 | GS.quality += reward 140 | GS.visit_cnt += 1 141 | if GS.depth == 0: 142 | return 143 | else: 144 | self.backpropagation(GS.parent, reward) 145 | 146 | def best_UCB(self, GS): 147 | GS_opt = c_opt = None 148 | for GS_c in GS.childs: 149 | if self.ucb_type == "MAX": 150 | c = GS_c.quality + UCB_GAMMA * \ 151 | math.sqrt((2 * math.log(GS.visit_cnt)) / GS_c.visit_cnt) 152 | elif self.ucb_type == "AVG": 153 | c = GS_c.quality / GS_c.visit_cnt + UCB_GAMMA * \ 154 | math.sqrt((2 * math.log(GS.visit_cnt)) / GS_c.visit_cnt) 155 | else: 156 | raise RuntimeError("Invalid UCB_type") 157 | if GS_opt is None or c > c_opt: 158 | c_opt = c 159 | GS_opt = GS_c 160 | return GS_opt 161 | 162 | def fully_expanded(self, GS): 163 | if self.terminal(GS): 164 | return False 165 | 166 | if GS.state == GraphExpand.NOT or GS.state == GraphExpand.PARTIAL: 167 | return False 168 | else: 169 | return True 170 | 171 | def expansion(self, GS): 172 | ### Pick an unvisited child to expand 173 | assert GS.state == GraphExpand.NOT or GS.state == GraphExpand.PARTIAL 174 | action = self.pick_unvisited(GS) 175 | if action is None: 176 | ### Current state is the terminal state, expansion failed 177 | return GS 178 | 179 | GS_c = GraphState(depth=(GS.depth+1)) 180 | GS_c.strategy = GS.strategy.copy() 181 | GS_c.strategy.append(action) 182 | GS_c.parent = GS 183 | if GS.childs is None: 184 | GS.childs = [] 185 | GS.childs.append(GS_c) 186 | 187 | if len(GS.space) == len(GS.childs): 188 | GS.state = GraphExpand.FULLY 189 | else: 190 | GS.state = GraphExpand.PARTIAL 191 | 192 | return GS_c 193 | 194 | def pick_unvisited(self, GS): 195 | ### TODO (huhanpeng): how to pick with some heuristic 196 | for idx in range(len(GS.space)): 197 | if GS.space[idx][1] == 0: 198 | GS.space[idx][1] += 1 199 | return GS.space[idx][0] 200 | return None 201 | 202 | def check_search_space(self, GS): 203 | ### TODO (huhanpeng): we can do some pruning here 204 | if GS.space is None: 205 | candidates, new_dag = 
self.candidate_selection(GS, topk=None) 206 | search_space, _ = self.init_search_space(candidates, new_dag) 207 | # The integer value is used as a counter 208 | GS.space = [[action, 0] for action in search_space] 209 | 210 | def terminal(self, GS): 211 | self.check_search_space(GS) 212 | if GS.depth > MAX_TREE_DEPTH or len(GS.space) == 0: 213 | return True 214 | else: 215 | return False 216 | -------------------------------------------------------------------------------- /dpro/parameter.py: -------------------------------------------------------------------------------- 1 | ''' Manage the parameter info of a DNN model 2 | ''' 3 | import re 4 | 5 | from .trace_utils import * 6 | 7 | class ParameterDict: 8 | def __init__(self, _pm, platform, metadata_path=None): 9 | ### collect metadata 10 | if metadata_path is None: 11 | metadata_path = os.path.dirname(_pm.search(FileName.METADATA)) 12 | if metadata_path is None: 13 | SingleLogger().error( 14 | "{} not found. Fail to load metadata".format(FileName.METADATA.value)) 15 | 16 | if platform == "MXNET": 17 | from .ml_platform.mxnet.metadata import MetaInfo 18 | SingleLogger().info("Use MXNET metadata") 19 | elif platform == "TENSORFLOW": 20 | from .ml_platform.tensorflow.metadata import MetaInfo 21 | SingleLogger().info("Use TENSORFLOW metadata") 22 | else: 23 | raise NotImplementedError() 24 | 25 | self.metainfo = MetaInfo(metadata_path) 26 | self.cnt = len(self.metainfo.gradient_name_list) 27 | 28 | def gradient_name_list(self): 29 | return self.metainfo.gradient_name_list 30 | 31 | def gradient_num(self): 32 | return self.cnt 33 | 34 | def wrap_read_dfg(self, *args, **kwargs): 35 | return self.metainfo.wrap_read_dfg(*args, **kwargs) 36 | 37 | def standard_name(self, op_name): 38 | ''' Convert op_names in the original traces to standard names 39 | `op_cat.op_name.sub_op` 40 | ''' 41 | return self.metainfo.standard_name(op_name) 42 | 43 | ### below methods are related to tensors/Communication 44 | 45 | def tensor_id_to_tensor_name(self, tensor_id): 46 | return self.metainfo.tensor_id_to_tensor_name(tensor_id) 47 | 48 | def tensor_name_to_tensor_id(self, name): 49 | return self.metainfo.tensor_name_to_tensor_id(name) 50 | 51 | def tensor_id2size(self, tensor_id): 52 | return self.metainfo.ret_tensor_size(tensor_id) 53 | 54 | def tensor_id2update_id(self, tensor_id): 55 | '''tensor id may be 'max' to return the maximum update id ''' 56 | return self.metainfo.tensor_id2update_id(tensor_id) 57 | 58 | def tensor_grp_size(self, op_name): 59 | total_size = 0 60 | for tensor_id_str in op_name.split("+"): 61 | tensor_id = int(tensor_id_str) 62 | total_size += self.tensor_id2size(tensor_id) 63 | return total_size 64 | 65 | ### below is related op_name 66 | 67 | def ret_metadata(self, *args, **kwargs): 68 | return self.metainfo.ret_metadata(*args, **kwargs) 69 | 70 | def ret_rawmeta(self, op_name): 71 | return self.metainfo.ret_rawmeta(op_name) 72 | 73 | def check_amp_lists(self, op_name): 74 | return self.metainfo.check_amp_lists(op_name) 75 | 76 | def parse_op_type(self, op_name): 77 | return self.metainfo.parse_op_type(op_name) 78 | 79 | def ret_op_precision(self, op_name): 80 | return self.metainfo.ret_op_precision(op_name) 81 | 82 | def in_metadata(self, op_name): 83 | return self.metainfo.in_metadata(op_name) 84 | 85 | def is_const(self, op_name): 86 | return self.metainfo.is_const(op_name) 87 | 88 | def is_variable(self, op_name): 89 | return self.metainfo.is_variable(op_name) 90 | 91 | def parse_model_name(self): 92 | return 
self.metainfo.parse_model_name() -------------------------------------------------------------------------------- /dpro/xla_cm_entry.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | from .logger_utils import SingleLogger 5 | from .cost_model._xla.xla_module_cost_model import XLAModuleCostModel 6 | from .cost_model._xla.gen_dataset_utils import XlaKernelDataset 7 | 8 | try: 9 | import byteps.tensorflow as bps 10 | except: 11 | pass 12 | 13 | try: 14 | import horovod.tensorflow as hvd 15 | except: 16 | pass 17 | 18 | parser = argparse.ArgumentParser(description="Script to launch the kernel dataset generator and train the XLA module cost model.", 19 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 20 | 21 | parser.add_argument("--mode", type=int, default=0, 22 | help="Different actions with different mode:\n" 23 | " 0: generate training data and train the cost model\n" 24 | " 1: only generate training data\n" 25 | " 2: only train the cost model\n" 26 | " 3: only test the cost model") 27 | 28 | parser.add_argument("--trace_dir", type=str, help="Path to the directory containing trace files for a GPU.") 29 | parser.add_argument("--output_dir", type=str, help="Directory where the generated dataset files will be dumped to.") 30 | parser.add_argument("--num_samples", type=int, help="Number of random samples to generate.") 31 | parser.add_argument("--max_cluster_samples", type=int, default=0, help="Number of max cluster samples to generate.") 32 | parser.add_argument("--min_cluster_size", type=int, default=4, help="Minimum subgraph size.") 33 | parser.add_argument("--max_cluster_size", type=int, default=800, help="Maximum subgraph size.") 34 | 35 | parser.add_argument("--batch_size", type=int, default=256, 36 | help="Batch size used when training the XLA module cost model.") 37 | 38 | parser.add_argument("--dataset_dir", type=str, 39 | help="Path to the directory containing generated dataset files.") 40 | 41 | args = parser.parse_args() 42 | 43 | logger = SingleLogger(args.output_dir, "xla_cm", "INFO") 44 | logger.info(args) 45 | 46 | if args.mode == 0 or args.mode == 1: 47 | SingleLogger().info("Generate Kernel dataset ...") 48 | print("""Using configuration: 49 | \t Trace Dir: {}\n\t Output Dir: {}\n\t # Random Samples: {} 50 | \t # Max Cluster Samples: {}\n\t Min Cluster Size: {}\n\t Max Cluster Size: {}""".format( 51 | args.trace_dir, args.output_dir, args.num_samples, 52 | args.max_cluster_samples, args.min_cluster_size, args.max_cluster_size 53 | )) 54 | 55 | ### Generate Kernel dataset 56 | XlaKernelDataset.construct_kernel_dataset(args.trace_dir, os.path.join(args.output_dir, "kernel_dataset"), 57 | num_samples=args.num_samples, 58 | num_max_cluster_samples=args.max_cluster_samples, 59 | min_subgraph_level=args.min_cluster_size, 60 | max_subgraph_level=args.max_cluster_size) 61 | 62 | if args.mode == 0 or args.mode == 2: 63 | SingleLogger().info("Train the cost model ...") 64 | ### Train the cost model 65 | assert os.path.exists(os.path.join(args.output_dir, "kernel_dataset")) 66 | XLAModuleCostModel.train_on_dataset( 67 | os.path.join(args.output_dir, "kernel_dataset"), 68 | os.path.join(args.output_dir, "cost_model"), 69 | args.batch_size) 70 | 71 | if args.mode == 3: 72 | SingleLogger().info("Test the cost model ...") 73 | module_cost_model = XLAModuleCostModel(os.path.join(args.output_dir, "cost_model")) 74 | module_cost_model.test_on_dataset(args.dataset_dir) 75 | 76 | 77 | 
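Since the module relies on relative imports (`from .logger_utils import ...`), it is meant to be launched with the `dpro` package on the path, e.g. `python3 -m dpro.xla_cm_entry --mode 0 --trace_dir <trace_dir> --output_dir <output_dir> --num_samples <N>` (the paths and sample count are placeholders); the generated dataset then lands in `<output_dir>/kernel_dataset` and the trained model in `<output_dir>/cost_model`, matching the paths hard-coded above.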
-------------------------------------------------------------------------------- /dpro/xla_test_generate_cluster_spec.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import networkx as nx 3 | import pickle 4 | import os 5 | 6 | from google.protobuf.json_format import MessageToJson 7 | from google.protobuf.text_format import Parse 8 | import tensorflow as tf 9 | import json 10 | 11 | try: 12 | GraphDef = tf.GraphDef 13 | except: 14 | GraphDef = tf.compat.v1.GraphDef 15 | 16 | # from collect import Collector 17 | from cost_model._xla.pk_graph import PKGraph, postorder_contract_nx 18 | from trace_utils import parse_op_name, parse_pid_from_name 19 | 20 | TRACE_PATH = "/root/capture_file/run_0_dec8" 21 | OUTPUT_PATH = "/root/cluster_spec_test.txt" 22 | 23 | name2index = {} 24 | index2name = {} 25 | index2pid = {} 26 | index2newname = {} 27 | 28 | # logger = SingleLogger("/root", "trash_logger", "info") 29 | 30 | def tf_relabel_func(_name, update_nodes_in_dag): 31 | for prefix in ["Comm.", "Comp.", "BW.", "FW.", "UPDATE_."]: 32 | if _name.startswith(prefix): 33 | return _name 34 | if _name.startswith("^"): 35 | _name = _name[1:] 36 | last_slash_pos = _name.rfind("/") 37 | if last_slash_pos != -1 and last_slash_pos < len(_name)-1 and _name[last_slash_pos+1] == "_": 38 | _name = _name[:last_slash_pos] 39 | if "BytePSPushPull" in _name and "tensor" not in _name: 40 | _name = "Comm." + _name 41 | elif "allreduce" in _name.lower(): 42 | if "." in _name: 43 | _, tensor_name = _name.split(".") 44 | if "_" in tensor_name: 45 | tensor_name = tensor_name.split("_")[0] 46 | _name = "Comm." + tensor_name 47 | else: 48 | _name = "UPDATE_." + _name 49 | else: 50 | if update_nodes_in_dag is not None and _name in update_nodes_in_dag: 51 | _name = "UPDATE_." + _name 52 | elif _name.startswith("gradients"): 53 | _name = "BW." + _name 54 | else: 55 | _name = "FW." 
+ _name 56 | return _name 57 | 58 | def wrap_read_graphdef(graphdef_path): 59 | if graphdef_path.endswith("pbtxt"): 60 | with open(graphdef_path, "r") as f: 61 | pb = f.read() 62 | graph_def = Parse(pb, GraphDef()) 63 | json_string = MessageToJson(graph_def) 64 | graph_def = json.loads(json_string) 65 | else: 66 | with open(graphdef_path, "r") as f: 67 | graph_def = json.load(f) 68 | graph = nx.DiGraph() 69 | for node in graph_def["node"]: 70 | if "input" in node: 71 | for input_tensor_name in node["input"]: 72 | input_node_name = input_tensor_name.split(":")[0] 73 | graph.add_edge(input_node_name, node["name"]) 74 | update_nodes_in_dag = set() 75 | def recursive_add_succs(_node): 76 | for succ_ in graph.successors(_node): 77 | update_nodes_in_dag.add(succ_) 78 | recursive_add_succs(succ_) 79 | for node in graph.nodes: 80 | if "allreduce" in node.lower() or "bytepspushpull" in node.lower(): 81 | recursive_add_succs(node) 82 | new_graph = nx.DiGraph() 83 | for u, v in graph.edges: 84 | new_graph.add_edge(tf_relabel_func(u, update_nodes_in_dag), tf_relabel_func(v, update_nodes_in_dag)) 85 | return new_graph, update_nodes_in_dag 86 | 87 | def relabel_dag_node(_dag) -> nx.DiGraph: 88 | def relabel_func(old_label): 89 | if ("BW" in old_label or "FW" in old_label or "Comm" in old_label or "UPDATE" in old_label) and "^" not in old_label: 90 | layer_name = parse_op_name(old_label) 91 | layer_pid = parse_pid_from_name(old_label) 92 | # if layer_pid not in self.cost_models or layer_name not in self.cost_models[layer_pid].graph_def_util.operation_names: 93 | # return "DEL~"+old_label 94 | # TODO (huhanpeng): different pids share the same index 95 | # if "Comm" in old_label and layer_name in name2index and layer_pid in name2index[layer_name]: 96 | # layer_index = name2index[layer_name][layer_pid] 97 | # new_name = ("[%d]"%layer_index).join(old_label.split(layer_name)) 98 | # return new_name 99 | 100 | layer_index = len(index2name) 101 | new_name = ("[%d]"%layer_index).join(old_label.split(layer_name)) 102 | index2name[layer_index] = layer_name 103 | index2pid[layer_index] = layer_pid 104 | if layer_name not in name2index: 105 | name2index[layer_name] = {} 106 | name2index[layer_name][layer_pid] = layer_index 107 | new_label = ("[%d]"%layer_index).join(old_label.split(layer_name)) 108 | index2newname[layer_index] = new_label 109 | return new_label 110 | else: 111 | return old_label 112 | return nx.relabel_nodes(_dag, relabel_func) 113 | 114 | 115 | # remove dependency from FW to UPDATE 116 | # for (u, v) in list(dag.edges): 117 | # dag.remove_edge(u, v) 118 | xla_candidates = set() 119 | with open("/root/xla_candidates.txt", "r") as f: 120 | for line in f: 121 | xla_candidates.add(line.strip()) 122 | 123 | # wrap_read_graphdef returns (graph, update_nodes_in_dag), so unpack both 124 | dag, update_nodes_in_dag = wrap_read_graphdef("/root/bert/traces/before_mark_for_compilation_5.pbtxt") 125 | 126 | dag = relabel_dag_node(dag) 127 | 128 | pkg = PKGraph(dag, dag) 129 | 130 | fw_nodes = [] 131 | bw_nodes = [] 132 | comm_nodes = [] 133 | update_nodes = [] 134 | 135 | for node in dag.nodes: 136 | if "FW" in node: 137 | fw_nodes.append(node) 138 | elif "BW" in node: 139 | bw_nodes.append(node) 140 | elif "Comm" in node: 141 | comm_nodes.append(node) 142 | elif "UPDATE" in node: 143 | update_nodes.append(node) 144 | 145 | print("Len FW nodes: {}, Len BW nodes: {}, Len COMM nodes: {}, Len UPDATE nodes: {}" \ 146 | .format(len(fw_nodes), len(bw_nodes), len(comm_nodes), len(update_nodes))) 147 | 148 | BW_graph = dag.subgraph(bw_nodes) 149 | BW_sequence = list(nx.topological_sort(BW_graph)) 150 | 151 | 
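# NOTE: the two statements below keep the later half of the topologically-sorted
# BW ops as `forbidden_bw`, presumably intended as a fusion-forbidden set; this
# test script computes the list but never passes it to the postorder_contract_nx
# calls further down.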
num_forbidden = int(len(BW_sequence) / 2) 151 | forbidden_bw = BW_sequence[num_forbidden:] 152 | 153 | 154 | 155 | filtered_nodes = [] 156 | for node in dag.nodes: 157 | index = int(node.split("[")[1].split("]")[0]) 158 | orig_name = index2name[index] 159 | if orig_name.split(".")[1] not in xla_candidates: 160 | filtered_nodes.append(node) 161 | 162 | if not os.path.exists("/root/alter_cluster_spec.pickle"): 163 | # Cluster all FW 164 | source_nodes = sorted(list(dag.nodes), key=lambda x: dag.in_degree(x)) 165 | 166 | # Run post order traversal on G 167 | print("Finding maximal clusters in FW...") 168 | visited_nodes = set() 169 | for source in tqdm(source_nodes, total=len(source_nodes)): 170 | if source not in visited_nodes and source in dag.nodes: 171 | _, _, dag = postorder_contract_nx(dag, pkg, source, visited_nodes, forbidden_list= filtered_nodes + comm_nodes + bw_nodes) 172 | 173 | with open("/root/alter_cluster_spec.pickle", "wb") as f: 174 | pickle.dump([fw_nodes, bw_nodes, comm_nodes, update_nodes, 175 | filtered_nodes, index2name, index2pid, dag, pkg], f) 176 | else: 177 | with open("/root/alter_cluster_spec.pickle", "rb") as f: 178 | ( fw_nodes, bw_nodes, comm_nodes, update_nodes, filtered_nodes, 179 | index2name, index2pid, dag, pkg )= pickle.load(f) 180 | 181 | # new_fw_nodes = [node for node in dag.nodes if "FW" in node] 182 | 183 | # # all BW 184 | # print("Finding maximal clusters in all BW...") 185 | # source_nodes = sorted(list(dag.nodes), key=lambda x: dag.in_degree(x)) 186 | # visited_nodes = set() 187 | # for source in tqdm(source_nodes, total=len(source_nodes)): 188 | # if source not in visited_nodes and source in dag.nodes: 189 | # _, _, dag = postorder_contract_nx(dag, pkg, source, visited_nodes, forbidden_list= filtered_nodes + comm_nodes + new_fw_nodes) 190 | 191 | # # all BW, size limit 1/2 192 | # print("Finding maximal clusters in all BW...") 193 | # source_nodes = sorted(list(dag.nodes), key=lambda x: dag.in_degree(x)) 194 | # visited_nodes = set() 195 | # for source in tqdm(source_nodes, total=len(source_nodes)): 196 | # if source not in visited_nodes and source in dag.nodes: 197 | # _, _, dag = postorder_contract_nx(dag, pkg, source, visited_nodes, forbidden_list= filtered_nodes + comm_nodes + new_fw_nodes, size_limit=int(len(bw_nodes)/2)) 198 | 199 | def _get_original_name_pid_from_index(name_): 200 | try: 201 | index = int(name_.split("[")[1].split("]")[0]) 202 | except: 203 | print(name_) 204 | input() 205 | return index2name[index], index2pid[index] 206 | 207 | def _get_original_name_pid_from_fused_node(u_): 208 | single_pid = None 209 | orig_names = [] 210 | for node_name in u_.split("+"): 211 | orig_name, pid = _get_original_name_pid_from_index(node_name) 212 | orig_names.append(orig_name) 213 | if single_pid is None: 214 | single_pid = pid 215 | else: 216 | if single_pid != pid: 217 | raise RuntimeError("Fused DAG node {} contains ops from different machines.".format(u_)) 218 | return orig_names, single_pid 219 | 220 | bw_cluster_sizes = [] 221 | bw_cluster_nodes = [] 222 | single_pid = -1 223 | for node in dag.nodes: 224 | if "+" in node and "BW" in node: 225 | orig_names, pid = _get_original_name_pid_from_fused_node(node) 226 | if single_pid == -1: 227 | single_pid = pid 228 | else: 229 | if single_pid != pid: 230 | continue 231 | bw_cluster_sizes.append(len(node.split("+"))) 232 | bw_cluster_nodes.append(node) 233 | 234 | for idx, node_size in enumerate(bw_cluster_sizes): 235 | if node_size > 10: 236 | print("idx: {}, size: {}".format(idx, 
node_size)) 237 | 238 | clusters_to_ignore = [] 239 | while True: 240 | s = input("Choose a cluster to discard: ") 241 | try: 242 | discard_id = int(s.strip()) 243 | clusters_to_ignore.append(discard_id) 244 | print("Remaining clusters:") 245 | for idx, node_size in enumerate(bw_cluster_sizes): 246 | if node_size > 10 and idx not in clusters_to_ignore: 247 | print("idx: {}, size: {}".format(idx, node_size)) 248 | except: 249 | break 250 | 251 | nodes_to_ignore = set() 252 | for idx in clusters_to_ignore: 253 | nodes_to_ignore.add(bw_cluster_nodes[idx]) 254 | 255 | # dump cluster mapping 256 | cluster_index = 0 257 | with open("/root/partitions_spec.txt", "w") as f: 258 | for node in dag.nodes(): 259 | if "+" in node: 260 | orig_names, pid = _get_original_name_pid_from_fused_node(node) 261 | if pid != single_pid: 262 | continue 263 | if node not in nodes_to_ignore: 264 | for orig_node_name in orig_names: 265 | f.write("{} {}\n".format(orig_node_name, cluster_index)) 266 | cluster_index += 1 267 | else: 268 | for orig_node_name in orig_names: 269 | f.write("{} {}\n".format(orig_node_name, cluster_index)) 270 | cluster_index += 1 -------------------------------------------------------------------------------- /dpro_cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | **************************************** 4 | * _______________________________ * 5 | * ______ /__ __ \__ __ \_ __ \ * 6 | * _ __ /__ /_/ /_ /_/ / / / / * 7 | * / /_/ / _ ____/_ _, _// /_/ / * 8 | * \__,_/ /_/ /_/ |_| \____/ * 9 | * * 10 | **************************************** 11 | ''' 12 | import os, sys 13 | import yaml 14 | from jinja2 import Environment, FileSystemLoader 15 | from dpro.base import bcolors, dpro_dir 16 | 17 | usage_prompt = "usage: dpro