├── README.md
├── dk
    ├── Dockerfile
    └── ghost.sh
├── ghost
    ├── .ipynb_checkpoints
    │   └── SberSwapInference-checkpoint.ipynb
    ├── LICENSE
    ├── README.md
    ├── SberSwapInference.ipynb
    ├── apex
    │   ├── .gitignore
    │   ├── .gitmodules
    │   ├── .nojekyll
    │   ├── LICENSE
    │   ├── README.md
    │   ├── apex
    │   │   ├── RNN
    │   │   │   ├── README.md
    │   │   │   ├── RNNBackend.py
    │   │   │   ├── __init__.py
    │   │   │   ├── cells.py
    │   │   │   └── models.py
    │   │   ├── __init__.py
    │   │   ├── amp
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── __version__.py
    │   │   │   ├── _amp_state.py
    │   │   │   ├── _initialize.py
    │   │   │   ├── _process_optimizer.py
    │   │   │   ├── amp.py
    │   │   │   ├── compat.py
    │   │   │   ├── frontend.py
    │   │   │   ├── handle.py
    │   │   │   ├── lists
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── functional_overrides.py
    │   │   │   │   ├── tensor_overrides.py
    │   │   │   │   └── torch_overrides.py
    │   │   │   ├── opt.py
    │   │   │   ├── rnn_compat.py
    │   │   │   ├── scaler.py
    │   │   │   ├── utils.py
    │   │   │   └── wrap.py
    │   │   ├── contrib
    │   │   │   ├── __init__.py
    │   │   │   ├── bottleneck
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── bottleneck.py
    │   │   │   │   └── test.py
    │   │   │   ├── csrc
    │   │   │   │   ├── bottleneck
    │   │   │   │   │   └── bottleneck.cpp
    │   │   │   │   ├── fmha
    │   │   │   │   │   ├── fmha_api.cpp
    │   │   │   │   │   └── src
    │   │   │   │   │   │   ├── fmha.h
    │   │   │   │   │   │   ├── fmha
    │   │   │   │   │   │       ├── gemm.h
    │   │   │   │   │   │       ├── gmem_tile.h
    │   │   │   │   │   │       ├── kernel_traits.h
    │   │   │   │   │   │       ├── mask.h
    │   │   │   │   │   │       ├── smem_tile.h
    │   │   │   │   │   │       ├── softmax.h
    │   │   │   │   │   │       └── utils.h
    │   │   │   │   │   │   ├── fmha_dgrad_fp16_128_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_dgrad_fp16_256_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_dgrad_fp16_384_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_dgrad_fp16_512_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_dgrad_kernel_1xN_reload.h
    │   │   │   │   │   │   ├── fmha_fprop_fp16_128_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_fprop_fp16_256_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_fprop_fp16_384_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_fprop_fp16_512_64_kernel.sm80.cu
    │   │   │   │   │   │   ├── fmha_fprop_kernel_1xN.h
    │   │   │   │   │   │   ├── fmha_fprop_kernel_1xN_reload_v.h
    │   │   │   │   │   │   ├── fmha_kernel.h
    │   │   │   │   │   │   └── fmha_utils.h
    │   │   │   │   ├── groupbn
    │   │   │   │   │   ├── batch_norm.cu
    │   │   │   │   │   ├── batch_norm.h
    │   │   │   │   │   ├── batch_norm_add_relu.cu
    │   │   │   │   │   ├── batch_norm_add_relu.h
    │   │   │   │   │   ├── cuda_utils.h
    │   │   │   │   │   ├── interface.cpp
    │   │   │   │   │   ├── ipc.cu
    │   │   │   │   │   └── nhwc_batch_norm_kernel.h
    │   │   │   │   ├── layer_norm
    │   │   │   │   │   ├── ln_api.cpp
    │   │   │   │   │   ├── ln_bwd_semi_cuda_kernel.cu
    │   │   │   │   │   ├── ln_fwd_cuda_kernel.cu
    │   │   │   │   │   ├── ln_kernel_traits.h
    │   │   │   │   │   └── utils.cuh
    │   │   │   │   ├── multihead_attn
    │   │   │   │   │   ├── additive_masked_softmax_dropout.cpp
    │   │   │   │   │   ├── additive_masked_softmax_dropout_cuda.cu
    │   │   │   │   │   ├── dropout.h
    │   │   │   │   │   ├── encdec_multihead_attn.cpp
    │   │   │   │   │   ├── encdec_multihead_attn_cuda.cu
    │   │   │   │   │   ├── encdec_multihead_attn_norm_add.cpp
    │   │   │   │   │   ├── encdec_multihead_attn_norm_add_cuda.cu
    │   │   │   │   │   ├── layer_norm.h
    │   │   │   │   │   ├── masked_softmax_dropout.cpp
    │   │   │   │   │   ├── masked_softmax_dropout_cuda.cu
    │   │   │   │   │   ├── philox.h
    │   │   │   │   │   ├── self_multihead_attn.cpp
    │   │   │   │   │   ├── self_multihead_attn_bias.cpp
    │   │   │   │   │   ├── self_multihead_attn_bias_additive_mask.cpp
    │   │   │   │   │   ├── self_multihead_attn_bias_additive_mask_cuda.cu
    │   │   │   │   │   ├── self_multihead_attn_bias_cuda.cu
    │   │   │   │   │   ├── self_multihead_attn_cuda.cu
    │   │   │   │   │   ├── self_multihead_attn_norm_add.cpp
    │   │   │   │   │   ├── self_multihead_attn_norm_add_cuda.cu
    │   │   │   │   │   ├── softmax.h
    │   │   │   │   │   └── strided_batched_gemm.h
    │   │   │   │   ├── optimizers
    │   │   │   │   │   ├── fused_adam_cuda.cpp
    │   │   │   │   │   ├── fused_adam_cuda_kernel.cu
    │   │   │   │   │   ├── fused_lamb_cuda.cpp
    │   │   │   │   │   ├── fused_lamb_cuda_kernel.cu
    │   │   │   │   │   ├── multi_tensor_distopt_adam.cpp
    │   │   │   │   │   ├── multi_tensor_distopt_adam_kernel.cu
    │   │   │   │   │   ├── multi_tensor_distopt_lamb.cpp
    │   │   │   │   │   └── multi_tensor_distopt_lamb_kernel.cu
    │   │   │   │   ├── transducer
    │   │   │   │   │   ├── transducer_joint.cpp
    │   │   │   │   │   ├── transducer_joint_kernel.cu
    │   │   │   │   │   ├── transducer_loss.cpp
    │   │   │   │   │   └── transducer_loss_kernel.cu
    │   │   │   │   └── xentropy
    │   │   │   │   │   ├── interface.cpp
    │   │   │   │   │   └── xentropy_kernel.cu
    │   │   │   ├── examples
    │   │   │   │   └── multihead_attn
    │   │   │   │   │   ├── func_test_multihead_attn.py
    │   │   │   │   │   └── perf_test_multihead_attn.py
    │   │   │   ├── fmha
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── fmha.py
    │   │   │   ├── groupbn
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── batch_norm.py
    │   │   │   ├── layer_norm
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── layer_norm.py
    │   │   │   ├── multihead_attn
    │   │   │   │   ├── MHA_bwd.png
    │   │   │   │   ├── MHA_fwd.png
    │   │   │   │   ├── README.md
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── encdec_multihead_attn.py
    │   │   │   │   ├── encdec_multihead_attn_func.py
    │   │   │   │   ├── fast_encdec_multihead_attn_func.py
    │   │   │   │   ├── fast_encdec_multihead_attn_norm_add_func.py
    │   │   │   │   ├── fast_self_multihead_attn_func.py
    │   │   │   │   ├── fast_self_multihead_attn_norm_add_func.py
    │   │   │   │   ├── mask_softmax_dropout_func.py
    │   │   │   │   ├── self_multihead_attn.py
    │   │   │   │   └── self_multihead_attn_func.py
    │   │   │   ├── optimizers
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── distributed_fused_adam.py
    │   │   │   │   ├── distributed_fused_adam_v2.py
    │   │   │   │   ├── distributed_fused_adam_v3.py
    │   │   │   │   ├── distributed_fused_lamb.py
    │   │   │   │   ├── fp16_optimizer.py
    │   │   │   │   ├── fused_adam.py
    │   │   │   │   ├── fused_lamb.py
    │   │   │   │   └── fused_sgd.py
    │   │   │   ├── sparsity
    │   │   │   │   ├── README.md
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── asp.py
    │   │   │   │   ├── sparse_masklib.py
    │   │   │   │   └── test
    │   │   │   │   │   ├── checkpointing_test_part1.py
    │   │   │   │   │   ├── checkpointing_test_part2.py
    │   │   │   │   │   ├── checkpointing_test_reference.py
    │   │   │   │   │   └── toy_problem.py
    │   │   │   ├── test
    │   │   │   │   ├── fmha
    │   │   │   │   │   └── test_fmha.py
    │   │   │   │   ├── layer_norm
    │   │   │   │   │   └── test_fast_layer_norm.py
    │   │   │   │   ├── multihead_attn
    │   │   │   │   │   ├── test_encdec_multihead_attn.py
    │   │   │   │   │   ├── test_encdec_multihead_attn_norm_add.py
    │   │   │   │   │   ├── test_fast_self_multihead_attn_bias.py
    │   │   │   │   │   ├── test_mha_fused_softmax.py
    │   │   │   │   │   ├── test_self_multihead_attn.py
    │   │   │   │   │   └── test_self_multihead_attn_norm_add.py
    │   │   │   │   ├── test_label_smoothing.py
    │   │   │   │   └── transducer
    │   │   │   │   │   ├── test_transducer_joint.py
    │   │   │   │   │   ├── test_transducer_loss.py
    │   │   │   │   │   └── transducer_ref.py
    │   │   │   ├── transducer
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── transducer.py
    │   │   │   └── xentropy
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── softmax_xentropy.py
    │   │   ├── fp16_utils
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── fp16_optimizer.py
    │   │   │   ├── fp16util.py
    │   │   │   └── loss_scaler.py
    │   │   ├── mlp
    │   │   │   ├── __init__.py
    │   │   │   └── mlp.py
    │   │   ├── multi_tensor_apply
    │   │   │   ├── __init__.py
    │   │   │   └── multi_tensor_apply.py
    │   │   ├── normalization
    │   │   │   ├── __init__.py
    │   │   │   └── fused_layer_norm.py
    │   │   ├── optimizers
    │   │   │   ├── __init__.py
    │   │   │   ├── fused_adagrad.py
    │   │   │   ├── fused_adam.py
    │   │   │   ├── fused_lamb.py
    │   │   │   ├── fused_novograd.py
    │   │   │   └── fused_sgd.py
    │   │   ├── parallel
    │   │   │   ├── LARC.py
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── distributed.py
    │   │   │   ├── multiproc.py
    │   │   │   ├── optimized_sync_batchnorm.py
    │   │   │   ├── optimized_sync_batchnorm_kernel.py
    │   │   │   ├── sync_batchnorm.py
    │   │   │   └── sync_batchnorm_kernel.py
    │   │   ├── pyprof
    │   │   │   ├── FAQs.md
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── examples
    │   │   │   │   ├── .gitignore
    │   │   │   │   ├── apex
    │   │   │   │   │   ├── README.md
    │   │   │   │   │   ├── fused_adam.py
    │   │   │   │   │   ├── fused_layer_norm.py
    │   │   │   │   │   └── test.sh
    │   │   │   │   ├── custom_func_module
    │   │   │   │   │   ├── README.md
    │   │   │   │   │   ├── custom_function.py
    │   │   │   │   │   ├── custom_module.py
    │   │   │   │   │   └── test.sh
    │   │   │   │   ├── imagenet
    │   │   │   │   │   ├── imagenet.py
    │   │   │   │   │   └── test.sh
    │   │   │   │   ├── jit
    │   │   │   │   │   ├── README.md
    │   │   │   │   │   ├── jit_script_function.py
    │   │   │   │   │   ├── jit_script_method.py
    │   │   │   │   │   ├── jit_trace_function.py
    │   │   │   │   │   ├── jit_trace_method.py
    │   │   │   │   │   └── test.sh
    │   │   │   │   ├── lenet.py
    │   │   │   │   ├── operators.py
    │   │   │   │   ├── simple.py
    │   │   │   │   └── user_annotation
    │   │   │   │   │   ├── README.md
    │   │   │   │   │   ├── resnet.py
    │   │   │   │   │   └── test.sh
    │   │   │   ├── nvtx
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── nvmarker.py
    │   │   │   ├── parse
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── __main__.py
    │   │   │   │   ├── db.py
    │   │   │   │   ├── kernel.py
    │   │   │   │   ├── nvvp.py
    │   │   │   │   └── parse.py
    │   │   │   └── prof
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── __main__.py
    │   │   │   │   ├── activation.py
    │   │   │   │   ├── base.py
    │   │   │   │   ├── blas.py
    │   │   │   │   ├── conv.py
    │   │   │   │   ├── convert.py
    │   │   │   │   ├── data.py
    │   │   │   │   ├── dropout.py
    │   │   │   │   ├── embedding.py
    │   │   │   │   ├── index_slice_join_mutate.py
    │   │   │   │   ├── linear.py
    │   │   │   │   ├── loss.py
    │   │   │   │   ├── misc.py
    │   │   │   │   ├── normalization.py
    │   │   │   │   ├── optim.py
    │   │   │   │   ├── output.py
    │   │   │   │   ├── pointwise.py
    │   │   │   │   ├── pooling.py
    │   │   │   │   ├── prof.py
    │   │   │   │   ├── randomSample.py
    │   │   │   │   ├── recurrentCell.py
    │   │   │   │   ├── reduction.py
    │   │   │   │   ├── softmax.py
    │   │   │   │   ├── usage.py
    │   │   │   │   └── utility.py
    │   │   └── reparameterization
    │   │   │   ├── README.md
    │   │   │   ├── __init__.py
    │   │   │   ├── reparameterization.py
    │   │   │   └── weight_norm.py
    │   ├── csrc
    │   │   ├── amp_C_frontend.cpp
    │   │   ├── compat.h
    │   │   ├── flatten_unflatten.cpp
    │   │   ├── layer_norm_cuda.cpp
    │   │   ├── layer_norm_cuda_kernel.cu
    │   │   ├── mlp.cpp
    │   │   ├── mlp_cuda.cu
    │   │   ├── multi_tensor_adagrad.cu
    │   │   ├── multi_tensor_adam.cu
    │   │   ├── multi_tensor_apply.cuh
    │   │   ├── multi_tensor_axpby_kernel.cu
    │   │   ├── multi_tensor_l2norm_kernel.cu
    │   │   ├── multi_tensor_lamb.cu
    │   │   ├── multi_tensor_lamb_stage_1.cu
    │   │   ├── multi_tensor_lamb_stage_2.cu
    │   │   ├── multi_tensor_novograd.cu
    │   │   ├── multi_tensor_scale_kernel.cu
    │   │   ├── multi_tensor_sgd_kernel.cu
    │   │   ├── syncbn.cpp
    │   │   ├── type_shim.h
    │   │   └── welford.cu
    │   ├── docs
    │   │   ├── Makefile
    │   │   └── source
    │   │   │   ├── _static
    │   │   │       ├── css
    │   │   │       │   └── pytorch_theme.css
    │   │   │       └── img
    │   │   │       │   └── nv-pytorch2.png
    │   │   │   ├── _templates
    │   │   │       └── layout.html
    │   │   │   ├── advanced.rst
    │   │   │   ├── amp.rst
    │   │   │   ├── conf.py
    │   │   │   ├── fp16_utils.rst
    │   │   │   ├── index.rst
    │   │   │   ├── layernorm.rst
    │   │   │   ├── optimizers.rst
    │   │   │   └── parallel.rst
    │   ├── examples
    │   │   ├── README.md
    │   │   ├── dcgan
    │   │   │   ├── README.md
    │   │   │   └── main_amp.py
    │   │   ├── docker
    │   │   │   ├── Dockerfile
    │   │   │   └── README.md
    │   │   ├── imagenet
    │   │   │   ├── README.md
    │   │   │   └── main_amp.py
    │   │   └── simple
    │   │   │   └── distributed
    │   │   │       ├── README.md
    │   │   │       ├── distributed_data_parallel.py
    │   │   │       └── run.sh
    │   ├── requirements.txt
    │   ├── requirements_dev.txt
    │   ├── setup.py
    │   └── tests
    │   │   ├── L0
    │   │       ├── run_amp
    │   │       │   ├── __init__.py
    │   │       │   ├── test_add_param_group.py
    │   │       │   ├── test_basic_casts.py
    │   │       │   ├── test_cache.py
    │   │       │   ├── test_checkpointing.py
    │   │       │   ├── test_fused_sgd.py
    │   │       │   ├── test_larc.py
    │   │       │   ├── test_multi_tensor_axpby.py
    │   │       │   ├── test_multi_tensor_l2norm.py
    │   │       │   ├── test_multi_tensor_scale.py
    │   │       │   ├── test_multiple_models_optimizers_losses.py
    │   │       │   ├── test_promotion.py
    │   │       │   ├── test_rnn.py
    │   │       │   └── utils.py
    │   │       ├── run_fp16util
    │   │       │   ├── __init__.py
    │   │       │   └── test_fp16util.py
    │   │       ├── run_fused_layer_norm
    │   │       │   └── test_fused_layer_norm.py
    │   │       ├── run_mlp
    │   │       │   └── test_mlp.py
    │   │       ├── run_optimizers
    │   │       │   ├── __init__.py
    │   │       │   ├── test_dist_adam.py
    │   │       │   ├── test_fused_novograd.py
    │   │       │   ├── test_fused_optimizer.py
    │   │       │   └── test_lamb.py
    │   │       ├── run_pyprof_data
    │   │       │   ├── __init__.py
    │   │       │   └── test_pyprof_data.py
    │   │       ├── run_pyprof_nvtx
    │   │       │   ├── __init__.py
    │   │       │   └── test_pyprof_nvtx.py
    │   │       └── run_test.py
    │   │   ├── L1
    │   │       ├── common
    │   │       │   ├── compare.py
    │   │       │   ├── main_amp.py
    │   │       │   └── run_test.sh
    │   │       ├── cross_product
    │   │       │   └── run.sh
    │   │       └── cross_product_distributed
    │   │       │   └── run.sh
    │   │   ├── distributed
    │   │       ├── DDP
    │   │       │   ├── ddp_race_condition_test.py
    │   │       │   └── run_race_test.sh
    │   │       ├── amp_master_params
    │   │       │   ├── amp_master_params.py
    │   │       │   ├── compare.py
    │   │       │   └── run.sh
    │   │       └── synced_batchnorm
    │   │       │   ├── python_single_gpu_unit_test.py
    │   │       │   ├── single_gpu_unit_test.py
    │   │       │   ├── test_batchnorm1d.py
    │   │       │   ├── test_groups.py
    │   │       │   ├── two_gpu_test_different_batch_size.py
    │   │       │   ├── two_gpu_unit_test.py
    │   │       │   └── unit_test.sh
    │   │   └── docker_extension_builds
    │   │       └── run.sh
    ├── arcface_model
    │   └── iresnet.py
    ├── coordinate_reg
    │   ├── image_infer.py
    │   └── model
    │   │   ├── 2d106det-0000.params
    │   │   ├── 2d106det-symbol.json
    │   │   ├── 2d106det.zip
    │   │   ├── 2d106det
    │   │       ├── 2d106det-0000.params
    │   │       └── 2d106det-symbol.json
    │   │   └── SCRFD
    │   │       └── model_25GF.pth
    ├── download_models.sh
    ├── examples
    │   ├── images
    │   │   ├── beckham.jpg
    │   │   ├── elon_musk.jpg
    │   │   ├── example1.png
    │   │   ├── example2.png
    │   │   ├── mark.jpg
    │   │   ├── murakami.jpg
    │   │   ├── p1.jpg
    │   │   ├── p2.jpg
    │   │   ├── tgt1.png
    │   │   ├── tgt2.png
    │   │   └── training
    │   │   │   ├── source1.png
    │   │   │   ├── source2.png
    │   │   │   ├── source3.png
    │   │   │   ├── source4.png
    │   │   │   ├── source5.png
    │   │   │   ├── source6.png
    │   │   │   ├── target1.png
    │   │   │   ├── target2.png
    │   │   │   ├── target3.png
    │   │   │   ├── target4.png
    │   │   │   ├── target5.png
    │   │   │   └── target6.png
    │   ├── results
    │   │   ├── result.mp4
    │   │   ├── result.png
    │   │   └── result_multi.mp4
    │   └── videos
    │   │   ├── dance.mp4
    │   │   ├── dirtydancing.mp4
    │   │   ├── elon.webp
    │   │   ├── inVideo1.mp4
    │   │   ├── inVideo2.mp4
    │   │   ├── inVideo3.mp4
    │   │   ├── inVideo4.mp4
    │   │   ├── inVideo5.mp4
    │   │   ├── khabenskii.webp
    │   │   ├── mark.webp
    │   │   ├── nggyup.mp4
    │   │   ├── orig.webp
    │   │   └── random_gif.gif
    ├── inference.py
    ├── inference_demo.py
    ├── insightface_func
    │   ├── __init__.py
    │   ├── face_detect_crop_multi.py
    │   └── face_detect_crop_single.py
    ├── models
    │   ├── __init__.py
    │   ├── config.py
    │   ├── config_sr.py
    │   ├── models.py
    │   ├── networks
    │   │   ├── Synchronized-BatchNorm-PyTorch
    │   │   │   ├── sync_batchnorm
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── batchnorm.py
    │   │   │   │   ├── batchnorm_reimpl.py
    │   │   │   │   ├── comm.py
    │   │   │   │   ├── replicate.py
    │   │   │   │   └── unittest.py
    │   │   │   └── tests
    │   │   │   │   ├── test_numeric_batchnorm.py
    │   │   │   │   ├── test_numeric_batchnorm_v2.py
    │   │   │   │   └── test_sync_batchnorm.py
    │   │   ├── __init__.py
    │   │   ├── architecture.py
    │   │   ├── base_network.py
    │   │   ├── discriminator.py
    │   │   ├── encoder.py
    │   │   ├── generator.py
    │   │   ├── loss.py
    │   │   ├── normalization.py
    │   │   └── sync_batchnorm
    │   │   │   ├── __init__.py
    │   │   │   ├── batchnorm.py
    │   │   │   ├── batchnorm_reimpl.py
    │   │   │   ├── comm.py
    │   │   │   ├── replicate.py
    │   │   │   └── unittest.py
    │   └── pix2pix_model.py
    ├── network
    │   ├── AADLayer.py
    │   ├── AEI_Net.py
    │   ├── MultiscaleDiscriminator.py
    │   ├── __init__.py
    │   └── resnet.py
    ├── preprocess_vgg.py
    ├── requirements.txt
    ├── train.py
    └── utils
    │   ├── inference
    │       ├── core.py
    │       ├── faceshifter_run.py
    │       ├── image_processing.py
    │       ├── masks.py
    │       ├── util.py
    │       └── video_processing.py
    │   └── training
    │       ├── Dataset.py
    │       ├── detector.py
    │       ├── image_processing.py
    │       └── losses.py
├── inference流程.txt
└── tmp.jpg


/dk/ghost.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | today=$(date -d "now" +%Y-%m-%d)
 3 | yesterday=$(date -d "yesterday" +%Y-%m-%d)
 4 | 
 5 | cd /data/wgs/face_swap/ghost
 6 | 
 7 | #Generator_PARAMS="\
 8 | #                --G_path ./weights/G_unet_3blocks.pth \
 9 | #                --num_blocks 3 \
10 | #                "
11 | #--G_path ./weights/G_unet_2blocks.pth \
12 | #--num_blocks 2 \
13 | 
14 | SOURCE_PATHA="\
15 |             --source_paths ./examples/images/p1.jpg \
16 |            "
17 | 
18 | # --batch_size 10 \
19 | 
20 | VIDEO_PATH="\
21 |           --target_video ./examples/videos/inVideo1.mp4 \
22 |           --out_video_name ./examples/results/o1_1_10.mp4 \
23 |           "
24 | 
25 | #SOURCE_PATHA="\
26 | #            --source_paths ./examples/images/p1.jpg ./examples/images/p2.jpg \
27 | #            --target_faces_paths ./examples/images/tgt1.png ./examples/images/tgt2.png \
28 | #           "
29 | #
30 | #VIDEO_PATH="\
31 | #           --target_video ./examples/videos/dirtydancing.mp4 \
32 | #          --out_video_name ./examples/results/o_multi.mp4 \
33 | #          "
34 | 
35 | options=" \
36 |         $SOURCE_PATHA \
37 |         $VIDEO_PATH \
38 |         "
39 | #$Generator_PARAMS \
40 | 
41 | docker run -d --gpus '"device=1"' \
42 |        --rm -it --name face_swap \
43 |        --shm-size 15G \
44 |        -v /data/wgs/face_swap:/home \
45 |        wgs-torch/faceswap:ghost \
46 |        sh -c "python3 /home/ghost/inference.py $options 1>>/home/log/ghost.log 2>>/home/log/ghost.err"
47 | 
48 | # nohup sh /data/wgs/face_swap/dk/ghost.sh &


--------------------------------------------------------------------------------
/ghost/apex/.gitignore:
--------------------------------------------------------------------------------
1 | apex.egg-info
2 | dist
3 | build
4 | docs/build
5 | *~
6 | __pycache__
7 | 


--------------------------------------------------------------------------------
/ghost/apex/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "apex/contrib/csrc/multihead_attn/cutlass"]
2 | 	path = apex/contrib/csrc/multihead_attn/cutlass
3 | 	url = https://github.com/NVIDIA/cutlass.git
4 | 	branch = v1.2.0
5 | [submodule "apex/contrib/csrc/cudnn-frontend"]
6 | 	path = apex/contrib/csrc/cudnn-frontend
7 | 	url = https://github.com/NVIDIA/cudnn-frontend.git
8 | 


--------------------------------------------------------------------------------
/ghost/apex/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/.nojekyll


--------------------------------------------------------------------------------
/ghost/apex/LICENSE:
--------------------------------------------------------------------------------
 1 | All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 4 | 
 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 6 | 
 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 8 | 
 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 | 
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/ghost/apex/apex/RNN/README.md:
--------------------------------------------------------------------------------
1 | Under construction...
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/RNN/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import LSTM, GRU, ReLU, Tanh, mLSTM
2 | 
3 | __all__ = ['models']
4 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/RNN/cells.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | from .RNNBackend import RNNCell
 6 | 
 7 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
 8 | 
 9 | import math 
10 | 
11 | 
12 | class mLSTMRNNCell(RNNCell):
13 |     """
14 |     mLSTMRNNCell
15 |     """
16 | 
17 |     def __init__(self, input_size, hidden_size, bias = False, output_size = None):
18 |         gate_multiplier = 4
19 |         super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size)
20 | 
21 |         self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size))
22 |         self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size))
23 | 
24 |         self.reset_parameters()
25 | 
26 |     def forward(self, input):
27 |         """
28 |         mLSTMRNNCell.forward()
29 |         """
30 |         #if not inited or bsz has changed this will create hidden states
31 |         self.init_hidden(input.size()[0])
32 | 
33 |         hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
34 | 
35 |         self.hidden = list(
36 |                            self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh,
37 |                            b_ih=self.b_ih, b_hh=self.b_hh)
38 |         )
39 |         
40 |         if self.output_size != self.hidden_size:
41 |             self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
42 |         return tuple(self.hidden)
43 | 
44 | 
45 |     def new_like(self, new_input_size=None):
46 |         if new_input_size is None:
47 |             new_input_size = self.input_size
48 |         
49 |         return type(self)(
50 |             new_input_size,
51 |             self.hidden_size,
52 |             self.bias,
53 |             self.output_size)
54 | 
55 | def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
56 |     """
57 |     mLSTMCell
58 |     """
59 | 
60 |     if input.is_cuda:
61 |         igates = F.linear(input, w_ih)
62 |         m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
63 |         hgates = F.linear(m, w_hh)
64 | 
65 |         state = fusedBackend.LSTMFused.apply
66 |         return state(igates, hgates, hidden[1], b_ih, b_hh)
67 | 
68 |     hx, cx = hidden
69 |     
70 |     m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
71 |     gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)
72 | 
73 |     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
74 | 
75 |     ingate = F.sigmoid(ingate)
76 |     forgetgate = F.sigmoid(forgetgate)
77 |     cellgate = F.tanh(cellgate)
78 |     outgate = F.sigmoid(outgate)
79 |     
80 |     cy = (forgetgate * cx) + (ingate * cellgate)
81 |     hy = outgate * F.tanh(cy)
82 |     
83 |     return hy, cy
84 |                                                                             
85 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/RNN/models.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell
 4 | 
 5 | from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
 6 | from .cells import mLSTMRNNCell, mLSTMCell
 7 | 
 8 | def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
 9 |     """
10 |     :class:`toRNNBackend`
11 |     """
12 | 
13 |     if bidirectional:
14 |         return bidirectionalRNN(inputRNN, num_layers, dropout = dropout)
15 |     else:
16 |         return stackedRNN(inputRNN, num_layers, dropout = dropout)
17 | 
18 | 
19 | def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
20 |     """
21 |     :class:`LSTM`
22 |     """
23 |     inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size)
24 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
25 | 
26 | def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
27 |     """
28 |     :class:`GRU`
29 |     """
30 |     inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size)
31 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
32 | 
33 | def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
34 |     """
35 |     :class:`ReLU`
36 |     """
37 |     inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size)
38 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
39 | 
40 | def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
41 |     """
42 |     :class:`Tanh`
43 |     """
44 |     inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size)
45 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
46 |         
47 | def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
48 |     """
49 |     :class:`mLSTM`
50 |     """
51 |     inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size)
52 |     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/__init__.py:
--------------------------------------------------------------------------------
 1 | # May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
 2 | import torch
 3 | import warnings
 4 | 
 5 | if torch.distributed.is_available():
 6 |     from . import parallel
 7 | 
 8 | from . import amp
 9 | from . import fp16_utils
10 | 
11 | # For optimizers and normalization there is no Python fallback.
12 | # Absence of cuda backend is a hard error.
13 | # I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
14 | # to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
15 | # so they expect those backends to be available, but for some reason they actually aren't
16 | # available (for example because they built improperly in a way that isn't revealed until
17 | # load time) the error message is timely and visible.
18 | from . import optimizers
19 | from . import normalization
20 | from . import pyprof
21 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/README.md:
--------------------------------------------------------------------------------
 1 | # amp: Automatic Mixed Precision
 2 | 
 3 | ## Annotating User Functions
 4 | 
 5 | Nearly all PyTorch user code needs nothing more than the two steps
 6 | above to use amp. After all, custom layers are built out of simpler
 7 | PyTorch components, and amp already can see those.
 8 | 
 9 | However, any custom C++ or CUDA code is outside of amp's (default)
10 | view of things. For example, suppose I implemented a new recurrent
11 | cell called a "forgetful recurrent unit" that calls directly into a
12 | CUDA backend:
13 | 
14 | ```python
15 | from backend import FRUBackend
16 | 
17 | def fru(input, hidden, weight, bias):
18 |     # call to CUDA code
19 |     FRUBackend(input, hidden, weight, bias)
20 | ```
21 | 
22 | In this case, it is possible to get a runtime type mismatch. For
23 | example, you might have `input` in fp16, and `weight` in fp32, and amp
24 | doesn't have the visibility to insert an appropriate cast.
25 | 
26 | amp exposes two ways to handle "invisible" backend code: function
27 | annotations and explicit registration.
28 | 
29 | #### Function annotation
30 | 
31 | The first way to handle backend code is a set of function annotations:
32 | 
33 | - `@amp.half_function`
34 | - `@amp.float_function`
35 | - `@amp.promote_function`
36 | 
37 | These correspond to:
38 | 
39 | - Cast all arguments to fp16
40 | - Cast all argumnets fo fp32
41 | - If there are any type mismatches, cast everything to the widest type
42 | 
43 | In our example, we believe that the FRU unit is fp16-safe and will get
44 | performance gains from casting its arguments to fp16, so we write:
45 | 
46 | ```python
47 | @amp.half_function
48 | def fru(input, hidden, weight, bias):
49 |     #...
50 | ```
51 | 
52 | #### Explicit registration
53 | 
54 | The other way to handle backend code is with explicit function
55 | registration:
56 | 
57 | - `amp.register_half_function(module, function_name)`
58 | - `amp.register_float_function(module, function_name)`
59 | - `amp.register_promote_function(module, function_name)`
60 | 
61 | When using this API, `module` is the containing class or module for
62 | the function, and `function_name` is the _string_ name of the
63 | function. Note that the function must be registered before the call to
64 | `amp.initalize()`.
65 | 
66 | For our FRU unit, we can register the backend function directly:
67 | 
68 | ```python
69 | import backend
70 | 
71 | amp.register_half_function(backend, 'FRUBackend')
72 | ```
73 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/__init__.py:
--------------------------------------------------------------------------------
1 | from .amp import init, half_function, float_function, promote_function,\
2 |     register_half_function, register_float_function, register_promote_function
3 | from .handle import scale_loss, disable_casts
4 | from .frontend import initialize, state_dict, load_state_dict
5 | from ._amp_state import master_params, _amp_state
6 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/__version__.py:
--------------------------------------------------------------------------------
1 | VERSION = (0, 1, 0)
2 | __version__ = '.'.join(map(str, VERSION))
3 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/_amp_state.py:
--------------------------------------------------------------------------------
 1 | # This is a "header object" that allows different amp modules to communicate.
 2 | # I'm a C++ guy, not a python guy.  I decided this approach because it seemed most C++-like.
 3 | # But apparently it's ok:
 4 | # http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
 5 | import os
 6 | import torch
 7 | 
 8 | TORCH_MAJOR = int(torch.__version__.split('.')[0])
 9 | TORCH_MINOR = int(torch.__version__.split('.')[1])
10 | 
11 | 
12 | if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
13 |     from torch._six import container_abcs
14 | else:
15 |     import collections.abc as container_abcs
16 | 
17 | 
18 | class AmpState(object):
19 |     def __init__(self):
20 |         self.hard_override=False
21 |         self.allow_incoming_model_not_fp32 = False
22 |         self.verbosity=1
23 | 
24 | 
25 | # Attribute stash.  Could also just stash things as global module attributes.
26 | _amp_state = AmpState()
27 | 
28 | 
29 | def warn_or_err(msg):
30 |     if _amp_state.hard_override:
31 |         print("Warning:  " + msg)
32 |     else:
33 |         raise RuntimeError(msg)
34 |         # I'm not sure if allowing hard_override is a good idea.
35 |         # + "  If you're sure you know what you're doing, supply " +
36 |         #                    "hard_override=True to amp.initialize.")
37 | 
38 | 
39 | def maybe_print(msg, rank0=False):
40 |     distributed = torch.distributed.is_available() and \
41 |         torch.distributed.is_initialized() and \
42 |         torch.distributed.get_world_size() > 1
43 |     if _amp_state.verbosity > 0:
44 |         if rank0:
45 |             if distributed:
46 |                 if torch.distributed.get_rank() == 0:
47 |                     print(msg)
48 |             else:
49 |                 print(msg)
50 |         else:
51 |             print(msg)
52 | 
53 | 
54 | # def iter_params(param_groups):
55 | #     for group in param_groups:
56 | #         for p in group['params']:
57 | #             yield p
58 | 
59 | 
60 | def master_params(optimizer):
61 |     """
62 |     Generator expression that iterates over the params owned by ``optimizer``.
63 | 
64 |     Args:
65 |         optimizer: An optimizer previously returned from ``amp.initialize``.
66 |     """
67 |     for group in optimizer.param_groups:
68 |         for p in group['params']:
69 |             yield p
70 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/compat.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | # True for post-0.4, when Variables/Tensors merged.
 4 | def variable_is_tensor():
 5 |     v = torch.autograd.Variable()
 6 |     return isinstance(v, torch.Tensor)
 7 | 
 8 | def tensor_is_variable():
 9 |     x = torch.Tensor()
10 |     return type(x) == torch.autograd.Variable
11 | 
12 | # False for post-0.4
13 | def tensor_is_float_tensor():
14 |     x = torch.Tensor()
15 |     return type(x) == torch.FloatTensor
16 | 
17 | # Akin to `torch.is_tensor`, but returns True for Variable
18 | # objects in pre-0.4.
19 | def is_tensor_like(x):
20 |     return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable)
21 | 
22 | # Wraps `torch.is_floating_point` if present, otherwise checks
23 | # the suffix of `x.type()`.
24 | def is_floating_point(x):
25 |     if hasattr(torch, 'is_floating_point'):
26 |         return torch.is_floating_point(x)
27 |     try:
28 |         torch_type = x.type()
29 |         return torch_type.endswith('FloatTensor') or \
30 |             torch_type.endswith('HalfTensor') or \
31 |             torch_type.endswith('DoubleTensor')
32 |     except AttributeError:
33 |         return False
34 | 
35 | def scalar_python_val(x):
36 |     if hasattr(x, 'item'):
37 |         return x.item()
38 |     else:
39 |         if isinstance(x, torch.autograd.Variable):
40 |             return x.data[0]
41 |         else:
42 |             return x[0]
43 | 
44 | # Accounts for the possibility that some ops may be removed from a namespace.
45 | def filter_attrs(module, attrs):
46 |     return list(attrname for attrname in attrs if hasattr(module, attrname))
47 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/lists/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/apex/amp/lists/__init__.py


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/lists/functional_overrides.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # TODO: think about the following two. They do weird things.
 3 | # - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
 4 | # - torch.nn.utils.weight_norm
 5 | 
 6 | # Notes:
 7 | # F.instance_norm uses batch_norm internally. Which correctly handles
 8 | #   fp16 in/out with fp32 weights. So we shouldn't do anything for
 9 | #   either of these.
10 | # F.normalize calls `input.norm()` internally, so it's redundant, but
11 | #   kept here in case impl. changes.
12 | # F.cosine_similarity is same: calls `x.norm()` internally.
13 | 
14 | import torch.nn.functional
15 | 
16 | MODULE = torch.nn.functional
17 | 
18 | FP16_FUNCS = [
19 |     'conv1d',
20 |     'conv2d',
21 |     'conv3d',
22 |     'conv_transpose1d',
23 |     'conv_transpose2d',
24 |     'conv_transpose3d',
25 |     'conv_tbc', # Undocumented / maybe new?
26 |     'linear',
27 | ]
28 | 
29 | FP32_FUNCS = [
30 | 
31 |     # Interpolation/Upsampling TODO:  Remove for 1.2
32 |     'interpolate',
33 |     'grid_sample',
34 | 
35 |     # Pointwise
36 |     'softplus',
37 |     'softmin',
38 |     'log_softmax',
39 |     'softmax',
40 |     'gelu',
41 |     
42 |     # Normalization
43 |     'layer_norm',
44 |     'group_norm',
45 |     'local_response_norm',
46 |     'normalize',
47 |     'cosine_similarity',
48 | 
49 |     # Loss functions
50 |     # TODO: which of these can be fp16?
51 |     'poisson_nll_loss',
52 |     'cosine_embedding_loss',
53 |     'cross_entropy',
54 |     'hinge_embedding_loss',
55 |     'kl_div',
56 |     'l1_loss',
57 |     'mse_loss',
58 |     'margin_ranking_loss',
59 |     'multilabel_margin_loss',
60 |     'multilabel_soft_margin_loss',
61 |     'multi_margin_loss',
62 |     'nll_loss',
63 |     'binary_cross_entropy_with_logits',
64 |     'smooth_l1_loss',
65 |     'soft_margin_loss',
66 |     'triplet_margin_loss',
67 |     'ctc_loss'
68 | ]
69 | 
70 | BANNED_FUNCS = [
71 |     ('binary_cross_entropy',
72 |      ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
73 |       "It requires that the output of the previous function be already a FloatTensor. \n\n"
74 |       "Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
75 |       "    torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
76 |       "that is compatible with amp.\nAnother option is to add\n"
77 |       "    amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
78 |       "If you _really_ know what you are doing, you can disable this warning by passing "
79 |       "allow_banned=True to `amp.init()`."))
80 | ]
81 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/lists/tensor_overrides.py:
--------------------------------------------------------------------------------
 1 | from .. import compat
 2 | from . import torch_overrides
 3 | 
 4 | import importlib
 5 | 
 6 | import torch
 7 | 
 8 | # if compat.variable_is_tensor() and not compat.tensor_is_variable():
 9 | MODULE = torch.Tensor
10 | # else:
11 | #     MODULE = torch.autograd.Variable
12 | 
13 | 
14 | FP16_FUNCS = compat.filter_attrs(MODULE, [
15 |     '__matmul__',
16 | ])
17 | 
18 | FP32_FUNCS = compat.filter_attrs(MODULE, [
19 |     '__ipow__',
20 |     '__pow__',
21 |     '__rpow__',
22 | 
23 |     # Cast to fp32 before transfer to CPU
24 |     'cpu',
25 | ])
26 | 
27 | CASTS = compat.filter_attrs(MODULE, [
28 |     '__add__',
29 |     '__div__',
30 |     '__eq__',
31 |     '__ge__',
32 |     '__gt__',
33 |     '__iadd__',
34 |     '__idiv__',
35 |     '__imul__',
36 |     '__isub__',
37 |     '__itruediv__',
38 |     '__le__',
39 |     '__lt__',
40 |     '__mul__',
41 |     '__ne__',
42 |     '__radd__',
43 |     '__rdiv__',
44 |     '__rmul__',
45 |     '__rsub__',
46 |     '__rtruediv__',
47 |     '__sub__',
48 |     '__truediv__',
49 | ])
50 | 
51 | # None of these, but here to make code cleaner.
52 | SEQUENCE_CASTS = []
53 | 
54 | # We need to grab all the methods from torch_overrides and add them to
55 | # the Tensor lists as well, as almost all methods are duplicated
56 | # between `torch` and `torch.Tensor` (and check with `hasattr`,
57 | # because a few random ones aren't defined on Tensor)
58 | _self_mod = importlib.import_module(__name__)
59 | for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
60 |     lst = getattr(_self_mod, attrname)
61 |     for fn in getattr(torch_overrides, attrname):
62 |         if hasattr(MODULE, fn):
63 |             lst.append(fn)
64 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/lists/torch_overrides.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | 
  3 | from .. import utils
  4 | 
  5 | MODULE = torch
  6 | 
  7 | FP16_FUNCS = [
  8 |     # Low level functions wrapped by torch.nn layers.
  9 |     # The wrapper layers contain the weights which are then passed in as a parameter
 10 |     # to these functions.
 11 |     'conv1d',
 12 |     'conv2d',
 13 |     'conv3d',
 14 |     'conv_transpose1d',
 15 |     'conv_transpose2d',
 16 |     'conv_transpose3d',
 17 |     'conv_tbc',
 18 |     'prelu',
 19 | 
 20 |     # BLAS
 21 |     'addmm',
 22 |     'addmv',
 23 |     'addr',
 24 |     'matmul',
 25 |     'mm',
 26 |     'mv',
 27 | ]
 28 | 
 29 | FP32_FUNCS = [
 30 |     # Pointwise
 31 |     'acos',
 32 |     'asin',
 33 |     'cosh',
 34 |     'erfinv',
 35 |     'exp',
 36 |     'expm1',
 37 |     'log',
 38 |     'log10',
 39 |     'log2',
 40 |     'reciprocal',
 41 |     'rsqrt',
 42 |     'sinh',
 43 |     'tan',
 44 | 
 45 |     # Other math
 46 |     'pow',
 47 | 
 48 |     # Reduction
 49 |     'cumprod',
 50 |     'cumsum',
 51 |     'dist',
 52 |     # 'mean',
 53 |     'norm',
 54 |     'prod',
 55 |     'std',
 56 |     'sum',
 57 |     'var',
 58 | 
 59 |     # Misc
 60 |     'renorm'
 61 | ]
 62 | 
 63 | version_strings = torch.__version__.split('.')
 64 | version_major = version_strings[0]
 65 | version_minor = version_strings[1]
 66 | version_num = float(version_major + "." + version_minor)
 67 | # Before torch 1.1, mean must be blacklisted.
 68 | if version_num < 1.1:
 69 |     FP32_FUNCS.append('mean')
 70 | 
 71 | # Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We
 72 | # check the CUDA version -- if at least 9.1, then put the bmm
 73 | # functions on the fp16 list. Otherwise, put them on the fp32 list.
 74 | _bmms = ['addbmm',
 75 |          'baddbmm',
 76 |          'bmm']
 77 | 
 78 | if utils.is_cuda_enabled():
 79 |   # workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802
 80 |   if utils.get_cuda_version() >= (9, 1, 0):
 81 |       FP16_FUNCS.extend(_bmms)
 82 |   else:
 83 |       FP32_FUNCS.extend(_bmms)
 84 | 
 85 | # Multi-tensor fns that may need type promotion
 86 | CASTS = [
 87 |     # Multi-tensor math
 88 |     'addcdiv',
 89 |     'addcmul',
 90 |     'atan2',
 91 |     'cross',
 92 |     'bilinear',
 93 |     'dot',
 94 | 
 95 |     # Element-wise _or_ tensor-wise math
 96 |     'add',
 97 |     'div',
 98 |     'mul',
 99 | 
100 |     # Comparison
101 |     'eq',
102 |     'equal',
103 |     'ge',
104 |     'gt',
105 |     'le',
106 |     'lt',
107 |     'ne'
108 | ]
109 | 
110 | # Functions that take sequence arguments. We need to inspect the whole
111 | # sequence and cast to the widest type.
112 | SEQUENCE_CASTS = [
113 |     'cat',
114 |     'stack'
115 | ]
116 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/amp/rnn_compat.py:
--------------------------------------------------------------------------------
 1 | from . import utils, wrap
 2 | 
 3 | import torch
 4 | _VF = torch._C._VariableFunctions
 5 | RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
 6 | 
 7 | def _gen_VF_wrapper(name):
 8 |     def wrapper(*args, **kwargs):
 9 |         return getattr(_VF, name)(*args, **kwargs)
10 |     return wrapper
11 | 
12 | # Some python magic to generate an object that has the rnn cell functions
13 | # defined on it, all of which call into corresponding _VF version.
14 | # Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF"
15 | # imported at module scope within torch.nn.modules.rnn).  This should
16 | # not affect third-party importers of _VF.py.
17 | class VariableFunctionsShim(object):
18 |     def __init__(self):
19 |         for name in RNN_NAMES:
20 |             for suffix in ['', '_cell']:
21 |                fn_name = name + suffix
22 |                setattr(self, fn_name, _gen_VF_wrapper(fn_name))
23 | 
24 | def has_old_rnns():
25 |     try:
26 |         torch.nn.backends.thnn.backend.LSTMCell
27 |         return True
28 |     except:
29 |         return False
30 | 
31 | def whitelist_rnn_cells(handle, verbose):
32 |     # Different module + function names in old/new RNN cases
33 |     if has_old_rnns():
34 |         fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
35 |         mod = torch.nn.backends.thnn.backend
36 |     else:
37 |         fn_names = [x + '_cell' for x in RNN_NAMES]
38 |         mod = torch.nn.modules.rnn._VF
39 |         assert isinstance(mod, VariableFunctionsShim)
40 | 
41 |     # Insert casts on cell functions
42 |     for fn in fn_names:
43 |         wrap.cached_cast(mod, fn, utils.maybe_half, handle,
44 |                          try_caching=True, verbose=verbose)
45 | 
46 |     if has_old_rnns():
47 |         # Special handling of `backward` for fused gru / lstm:
48 |         # The `backward` method calls Tensor.sum() (blacklist) internally,
49 |         # and then the resulting grad_input has the wrong type.
50 |         # TODO: where else is this a problem?
51 |         for rnn_type in ['GRUFused', 'LSTMFused']:
52 |             mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type)
53 |             wrap.disable_casts(mod, 'backward', handle)
54 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/apex/contrib/__init__.py


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/bottleneck/__init__.py:
--------------------------------------------------------------------------------
1 | from .bottleneck import Bottleneck
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 3 |  * 
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  * 
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #include "fmha.h"
29 | #include "fmha_fprop_kernel_1xN_reload_v.h"
30 | 
31 | using Kernel_traits = FMHA_kernel_traits< 384, 64, 16, 1, 4, 0x08u>;
32 | 
33 | extern "C" __global__ void fmha_fprop_fp16_384_64_sm80_train_kernel(Fused_multihead_attention_fprop_params params) {
34 |     fmha::device_1xN<Kernel_traits, true>(params);
35 | }
36 | 
37 | extern "C" __global__ void fmha_fprop_fp16_384_64_sm80_predict_kernel(Fused_multihead_attention_fprop_params params) {
38 |     fmha::device_1xN<Kernel_traits, false>(params);
39 | }
40 | 
41 | void run_fmha_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params &params, bool is_training, cudaStream_t stream) {
42 | 
43 |     auto kernel = is_training ? &fmha_fprop_fp16_384_64_sm80_train_kernel : &fmha_fprop_fp16_384_64_sm80_predict_kernel;
44 | 
45 |     constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
46 |     constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
47 |     constexpr int smem_size_o = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;
48 | 
49 |     constexpr int smem_size = smem_size_v + smem_size_o + smem_size_softmax;
50 | 
51 |     if( smem_size >= 48 * 1024 ) {
52 |         FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
53 |     }
54 | 
55 |     dim3 grid(params.h, params.b);
56 |     kernel<<<grid, Kernel_traits::THREADS, smem_size, stream>>>(params);
57 | }
58 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/groupbn/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #include <ATen/cuda/CUDAContext.h>
 2 | #ifndef CUDA_UTILS_H
 3 | #define CUDA_UTILS_H
 4 | 
 5 | namespace at {
 6 | namespace cuda {
 7 | 
 8 | namespace utils {
 9 | 
10 | static inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
11 |     return getDeviceProperties(device_id)->sharedMemPerMultiprocessor;
12 | }
13 | 
14 | 
15 | }
16 | }
17 | }
18 | 
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/layer_norm/ln_kernel_traits.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | constexpr uint32_t THREADS_PER_WARP = 32;
 4 | 
 5 | template <typename dtype, int COLS_, int WARPS_M_, int WARPS_N_,
 6 |           int BYTES_PER_LDG_ = 16>
 7 | struct Kernel_traits {
 8 |   enum { WARPS_M = WARPS_M_ };
 9 |   enum { WARPS_N = WARPS_N_ };
10 |   enum { COLS = COLS_ };
11 |   enum { BYTES_PER_LDG = BYTES_PER_LDG_ };
12 | 
13 |   using Vec = Vec<dtype, BYTES_PER_LDG>;
14 | 
15 |   using vec_t = typename Vec::vec_t;
16 |   using base_t = typename Vec::base_t;
17 |   using packed_t = typename Vec::packed_t;
18 |   using compute_t = typename Vec::compute_t;
19 |   using packed_compute_t = typename Vec::packed_compute_t;
20 | 
21 |   enum { THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP };
22 |   enum { THREADS_PER_CTA = WARPS_M * THREADS_PER_ROW };
23 |   enum { ROWS_PER_CTA = WARPS_M };
24 | 
25 |   enum { BYTES_PER_ROW = COLS * sizeof(base_t) };
26 |   enum { BYTES_PER_ROW_PER_CTA = THREADS_PER_ROW * BYTES_PER_LDG };
27 |   enum {SMEM_BYTES = ROWS_PER_CTA * COLS * sizeof(compute_t)};
28 | };
29 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/multihead_attn/philox.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | //Philox CUDA. 
 3 | 
 4 | class Philox {
 5 | public:
 6 |   __device__ inline Philox(unsigned long long seed,
 7 |                            unsigned long long subsequence,
 8 |                            unsigned long long offset) {
 9 |     key.x = (unsigned int)seed;
10 |     key.y = (unsigned int)(seed >> 32);
11 |     counter = make_uint4(0, 0, 0, 0);
12 |     counter.z = (unsigned int)(subsequence);
13 |     counter.w = (unsigned int)(subsequence >> 32);
14 |     STATE = 0;
15 |     incr_n(offset / 4);
16 |   }
17 |   __device__ inline uint4 operator()() {
18 |     if(STATE == 0) {
19 |       uint4 counter_ = counter;
20 |       uint2 key_ = key;
21 |       //7-round philox
22 |       for(int i = 0; i < 6; i++) {
23 |         counter_ = single_round(counter_, key_);
24 |         key_.x += (kPhilox10A); key_.y += (kPhilox10B);
25 |       }
26 |       output = single_round(counter_, key_);
27 |       incr();
28 |     }
29 |     //return a float4 directly
30 |     //unsigned long ret;
31 |     //switch(STATE) {
32 |     //  case 0: ret = output.x; break;
33 |     //  case 1: ret = output.y; break;
34 |     //  case 2: ret = output.z; break;
35 |     //  case 3: ret = output.w; break;
36 |     //}
37 |     //STATE = (STATE + 1) % 4;
38 |     return output;
39 |   }
40 | private:
41 |   uint4 counter;
42 |   uint4 output;
43 |   uint2 key;
44 |   unsigned int STATE;
45 |   __device__ inline void incr_n(unsigned long long n) {
46 |     unsigned int nlo = (unsigned int)(n);
47 |     unsigned int nhi = (unsigned int)(n >> 32);
48 |     counter.x += nlo;
49 |     if (counter.x < nlo)
50 |       nhi++;
51 |     counter.y += nhi;
52 |     if (nhi <= counter.y)
53 |       return;
54 |     if (++counter.z)
55 |       return;
56 |     ++counter.w;
57 |   }
58 |   __device__ inline void incr() {
59 |     if (++counter.x)
60 |       return;
61 |     if (++counter.y)
62 |       return;
63 |     if (++counter.z)
64 |       return;
65 |     ++counter.w;
66 |   }
67 |   __device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
68 |                                     unsigned int *result_high) {
69 |     *result_high = __umulhi(a, b);
70 |     return a*b;
71 |   }
72 |   __device__ inline uint4 single_round(uint4 ctr, uint2 key) {
73 |     unsigned int hi0;
74 |     unsigned int hi1;
75 |     unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
76 |     unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
77 |     uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
78 |     return ret;
79 |   }
80 |   static const unsigned long kPhilox10A = 0x9E3779B9;
81 |   static const unsigned long kPhilox10B = 0xBB67AE85;
82 |   static const unsigned long kPhiloxSA = 0xD2511F53;
83 |   static const unsigned long kPhiloxSB = 0xCD9E8D57;
84 | };
85 | // Inverse of 2^32.
86 | #define M_RAN_INVM32 2.3283064e-10f
87 | __device__  __inline__ float4 uniform4(uint4 x) {
88 |     return make_float4(x.x * M_RAN_INVM32, x.y * M_RAN_INVM32, x.z * M_RAN_INVM32,x.w * M_RAN_INVM32);
89 | 
90 | }
91 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_lamb_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   const float lr,
 8 |   const float beta1,
 9 |   const float beta2,
10 |   const float epsilon,
11 |   const int step,
12 |   const int bias_correction,
13 |   const float weight_decay,
14 |   const int grad_averaging,
15 |   const int mode,
16 |   const float global_grad_norm,
17 |   const float max_grad_norm);
18 | 
19 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20 |         m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer");
21 | }
22 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_fused_adam_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   at::Tensor per_tensor_beta1,
 8 |   at::Tensor per_tensor_beta2,
 9 |   at::Tensor per_tensor_bias_correction,
10 |   at::Tensor per_tensor_eps,
11 |   at::Tensor per_tensor_weight_decay,
12 |   float lr,
13 |   float grad_scale,
14 |   int step,
15 |   int mode);
16 | 
17 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
18 |   m.def("multi_tensor_fused_adam", &multi_tensor_fused_adam_cuda,
19 |         "Multi tensor Adam optimized CUDA implementation.");
20 | }
21 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | void multi_tensor_lamb_compute_update_term_cuda(
 4 |   int chunk_size,
 5 |   at::Tensor noop_flag,
 6 |   std::vector<std::vector<at::Tensor>> tensor_lists,
 7 |   at::Tensor per_tensor_beta1,
 8 |   at::Tensor per_tensor_beta2,
 9 |   at::Tensor per_tensor_beta3,
10 |   at::Tensor per_tensor_bias_correction,
11 |   at::Tensor step,
12 |   at::Tensor per_tensor_epsilon,
13 |   const int mode,
14 |   at::Tensor per_tensor_decay,
15 |   at::Tensor global_scale,
16 |   at::Tensor global_grad_norm,
17 |   const float max_grad_norm);
18 | 
19 | void multi_tensor_lamb_update_weights_cuda(
20 |   int chunk_size,
21 |   at::Tensor noop_flag,
22 |   std::vector<std::vector<at::Tensor>> tensor_lists,
23 |   at::Tensor per_tensor_param_norm,
24 |   at::Tensor per_tensor_update_norm,
25 |   at::Tensor update_norm_offset,
26 |   at::Tensor learning_rate,
27 |   at::Tensor per_tensor_decay,
28 |   at::Tensor global_grad_norm,
29 |   bool use_nvlamb);
30 | 
31 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
32 |   m.def("multi_tensor_lamb_compute_update_term", &multi_tensor_lamb_compute_update_term_cuda,
33 |         "Computes update term for LAMB optimizer");
34 |   m.def("multi_tensor_lamb_update_weights", &multi_tensor_lamb_update_weights_cuda,
35 |         "Applies update term for LAMB optimizer");
36 | }
37 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/transducer/transducer_joint.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | #include <ATen/Functions.h>
 3 | 
 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
 5 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 6 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
 7 | 
 8 | torch::Tensor transducer_joint_cuda_forward(
 9 |     torch::Tensor f,
10 |     torch::Tensor g,
11 |     torch::Tensor fLen,
12 |     torch::Tensor gLen,
13 |     torch::Tensor batchOffset,
14 |     int64_t packedBatch,
15 |     int opt,
16 |     bool packOutput,
17 |     int tileSize);
18 | 
19 | 
20 | std::vector<torch::Tensor> transducer_joint_cuda_backward(
21 |     torch::Tensor grad,
22 |     torch::Tensor fLen,
23 |     torch::Tensor gLen,
24 |     torch::Tensor batchOffset,
25 |     int maxFLen,
26 |     int maxGLen,
27 |     bool packOutput);
28 | 
29 | torch::Tensor transducer_joint_forward(
30 |     torch::Tensor f,
31 |     torch::Tensor g,
32 |     torch::Tensor fLen,
33 |     torch::Tensor gLen,
34 |     torch::Tensor batchOffset,
35 |     int64_t packedBatch,
36 |     int opt,
37 |     bool packOutput,
38 |     int tileSize) {
39 |     CHECK_INPUT(f);
40 |     CHECK_INPUT(g);
41 |     CHECK_INPUT(fLen);
42 |     CHECK_INPUT(gLen);
43 |     if (packOutput)
44 |         CHECK_INPUT(batchOffset);
45 |     return transducer_joint_cuda_forward(
46 |         f, 
47 |         g, 
48 |         fLen, 
49 |         gLen,
50 |         batchOffset,
51 |         packedBatch,
52 |         opt,
53 |         packOutput,
54 |         tileSize);
55 | }
56 | 
57 | std::vector<torch::Tensor> transducer_joint_backward(
58 |     torch::Tensor grad,
59 |     torch::Tensor fLen,
60 |     torch::Tensor gLen,
61 |     torch::Tensor batchOffset,
62 |     int maxFLen,
63 |     int maxGLen,
64 |     bool packOutput) {
65 |     CHECK_INPUT(grad);
66 |     CHECK_INPUT(fLen);
67 |     CHECK_INPUT(gLen);
68 |     if (packOutput)
69 |         CHECK_INPUT(batchOffset);
70 |     return transducer_joint_cuda_backward(
71 |         grad, 
72 |         fLen, 
73 |         gLen,
74 |         batchOffset,
75 |         maxFLen,
76 |         maxGLen,
77 |         packOutput);
78 | }
79 | 
80 | 
81 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
82 |   m.def("forward", &transducer_joint_forward, "transducer joint forward (CUDA)");
83 |   m.def("backward", &transducer_joint_backward, "transducer joint backward (CUDA)");
84 | }


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/transducer/transducer_loss.cpp:
--------------------------------------------------------------------------------
  1 | #include <torch/extension.h>
  2 | #include <vector>
  3 | 
  4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
  5 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
  6 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
  7 | 
  8 | std::vector<torch::Tensor> transducer_loss_cuda_forward(
  9 |     torch::Tensor x,
 10 |     torch::Tensor label,
 11 |     torch::Tensor audLen,
 12 |     torch::Tensor txtLen,
 13 |     torch::Tensor batchOffset,
 14 |     int maxFLen,
 15 |     int blankIdx,
 16 |     int opt,
 17 |     bool packedInput);
 18 | 
 19 | torch::Tensor transducer_loss_cuda_backward(
 20 |     torch::Tensor x,
 21 |     torch::Tensor lossGrad,
 22 |     torch::Tensor alpha,
 23 |     torch::Tensor beta,
 24 |     torch::Tensor audLen,
 25 |     torch::Tensor txtLen,
 26 |     torch::Tensor label,
 27 |     torch::Tensor batchOffset,
 28 |     int maxFLen,
 29 |     int blankIdx,
 30 |     int opt,
 31 |     bool fuseSoftmaxBackward,
 32 |     bool packedInput);
 33 | 
 34 | 
 35 | std::vector<torch::Tensor> transducer_loss_forward(
 36 |     torch::Tensor x,
 37 |     torch::Tensor label,
 38 |     torch::Tensor fLen,
 39 |     torch::Tensor yLen,
 40 |     torch::Tensor batchOffset,
 41 |     int maxFLen,
 42 |     int blankIdx,
 43 |     int opt,
 44 |     bool packedInput
 45 |     ) {
 46 | 
 47 |     CHECK_INPUT(x);
 48 |     CHECK_INPUT(label);
 49 |     CHECK_INPUT(fLen);
 50 |     CHECK_INPUT(yLen);
 51 |     if (packedInput)
 52 |         CHECK_INPUT(batchOffset);
 53 |     return transducer_loss_cuda_forward(
 54 |         x, 
 55 |         label, 
 56 |         fLen, 
 57 |         yLen, 
 58 |         batchOffset,
 59 |         maxFLen,
 60 |         blankIdx, 
 61 |         opt,
 62 |         packedInput);
 63 | }
 64 | 
 65 | torch::Tensor transducer_loss_backward(
 66 |     torch::Tensor x,
 67 |     torch::Tensor lossGrad,
 68 |     torch::Tensor alpha,
 69 |     torch::Tensor beta,
 70 |     torch::Tensor fLen,
 71 |     torch::Tensor yLen,
 72 |     torch::Tensor label,
 73 |     torch::Tensor batchOffset,
 74 |     int maxFLen,
 75 |     int blankIdx,
 76 |     int opt,
 77 |     bool fuseSoftmaxBackward,
 78 |     bool packedInput){
 79 | 
 80 |     CHECK_INPUT(x);
 81 |     CHECK_INPUT(label);
 82 |     CHECK_INPUT(lossGrad);
 83 |     CHECK_INPUT(alpha);
 84 |     CHECK_INPUT(beta);
 85 |     CHECK_INPUT(fLen);
 86 |     CHECK_INPUT(yLen);
 87 |     if (packedInput)
 88 |         CHECK_INPUT(batchOffset);
 89 | 
 90 |     return transducer_loss_cuda_backward(
 91 |         x,
 92 |         lossGrad,
 93 |         alpha,
 94 |         beta,
 95 |         fLen,
 96 |         yLen,
 97 |         label,
 98 |         batchOffset,
 99 |         maxFLen,
100 |         blankIdx,
101 |         opt,
102 |         fuseSoftmaxBackward,
103 |         packedInput);
104 | }
105 | 
106 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
107 |   m.def("forward", &transducer_loss_forward, "transducer loss forward (CUDA)");
108 |   m.def("backward", &transducer_loss_backward, "transducer loss backward (CUDA)");
109 | }
110 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/csrc/xentropy/interface.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | 
 3 | // CUDA forward declarations
 4 | 
 5 | std::vector<at::Tensor> softmax_xentropy_cuda(
 6 |     const at::Tensor &input,
 7 |     const at::Tensor &labels,
 8 |     const float smoothing,
 9 |     const bool half_to_float);
10 | 
11 | at::Tensor softmax_xentropy_backward_cuda(
12 |     const at::Tensor &grad_loss,
13 |     const at::Tensor &logits,
14 |     const at::Tensor &max_log_sum_exp,
15 |     const at::Tensor &labels,
16 |     const float smoothing);
17 | 
18 | // C++ interface
19 | 
20 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
21 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
22 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
23 | 
24 | std::vector<at::Tensor> softmax_xentropy_forward(
25 |     const at::Tensor &input,
26 |     const at::Tensor &labels,
27 |     const float smoothing,
28 |     const bool half_to_float) {
29 |     CHECK_CUDA(input);
30 |     CHECK_INPUT(labels);
31 | 
32 |     return softmax_xentropy_cuda(input, labels, smoothing, half_to_float);
33 | }
34 | 
35 | at::Tensor softmax_xentropy_backward(
36 |     const at::Tensor &grad_loss,
37 |     const at::Tensor &logits,
38 |     const at::Tensor &max_log_sum_exp,
39 |     const at::Tensor &labels,
40 |     const float smoothing)  {
41 |     CHECK_CUDA(grad_loss);
42 |     CHECK_CUDA(logits);
43 |     CHECK_INPUT(max_log_sum_exp);
44 |     CHECK_INPUT(labels);
45 | 
46 |     return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing);
47 | }
48 | 
49 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
50 |     m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)");
51 |     m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)");
52 | }
53 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/fmha/__init__.py:
--------------------------------------------------------------------------------
1 | from .fmha import FMHAFun
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/groupbn/__init__.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     import torch
 3 |     import bnp
 4 |     from .batch_norm import BatchNorm2d_NHWC
 5 |     del torch
 6 |     del bnp
 7 |     del batch_norm
 8 | except ImportError as err:
 9 |     print("apex was installed without --bnp flag, contrib.groupbn is not available")
10 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/layer_norm/__init__.py:
--------------------------------------------------------------------------------
1 | from .layer_norm import FastLayerNorm
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/layer_norm/layer_norm.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.nn import init
 3 | 
 4 | import fast_layer_norm
 5 | 
 6 | class FastLayerNormFN(torch.autograd.Function):
 7 |     @staticmethod
 8 |     def forward(ctx, x, gamma, beta, epsilon):
 9 |         x = x.contiguous()
10 |         gamma = gamma.contiguous()
11 |         beta = beta.contiguous()
12 |         hidden_size = gamma.numel()
13 |         xmat = x.view((-1, hidden_size))
14 |         ymat, mu, rsigma = fast_layer_norm.ln_fwd(xmat, gamma, beta, epsilon)
15 |         ctx.save_for_backward(x, gamma, mu, rsigma)
16 |         return ymat.view(x.shape)
17 |     
18 |     @staticmethod
19 |     def backward(ctx, dy):
20 |         #assert dy.is_contiguous()
21 |         dy = dy.contiguous() # this happens!
22 |         x, gamma, mu, rsigma = ctx.saved_tensors
23 | 
24 |         hidden_size = gamma.numel()
25 |         xmat = x.view((-1, hidden_size))
26 |         dymat = dy.view(xmat.shape)
27 |         dxmat, dgamma, dbeta = fast_layer_norm.ln_bwd(dymat, xmat, mu, rsigma, gamma)
28 |         dx = dxmat.view(x.shape)
29 |         return dx, dgamma, dbeta, None
30 | 
31 | class FastLayerNorm(torch.nn.Module):
32 |     def __init__(self, hidden_size, eps=1e-5):
33 |         super(FastLayerNorm, self).__init__()
34 |         self.epsilon = eps
35 |         self.weight = torch.nn.Parameter(torch.Tensor(hidden_size))
36 |         self.bias = torch.nn.Parameter(torch.Tensor(hidden_size))
37 |         self.reset_parameters()
38 | 
39 |     def reset_parameters(self):
40 |         init.ones_(self.weight)
41 |         init.zeros_(self.bias)
42 | 
43 |     def forward(self, x):
44 |         return FastLayerNormFN.apply(x, self.weight, self.bias, self.epsilon)
45 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/multihead_attn/MHA_bwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/apex/contrib/multihead_attn/MHA_bwd.png


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/multihead_attn/MHA_fwd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/apex/contrib/multihead_attn/MHA_fwd.png


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/multihead_attn/README.md:
--------------------------------------------------------------------------------
 1 | # Fast Multihead Attention 
 2 | 
 3 | This implementation has two main features :
 4 | * A C++ implementation to avoid the CPU overheads of Pytorch found with smaller batch sizes.
 5 | * The removal of all copies and transposes found in standard implementations of Multihead Attention.
 6 | 
 7 | |                                            | Python Version | C++ Version |
 8 | | :----------------------------------------- | :------------: | :---------: |
 9 | | Layer Norm and Residual Add Variant        | X              | X           |
10 | | Includes Linear Biases                     | X              |             |
11 | | Reduces CPU Overheads                      |                | X           |
12 | | Fuses masking with Softmax                 |                | X           |
13 | | Removes Transposes and Copies              | X              | X           |
14 | | Includes Self and Encoder/Decoder Variants | X              | X           |
15 | 
16 | ## How to Instantiate
17 | 
18 | `SelfMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)`
19 | `EncdecMultiheadAttn(` _hidden dim_, _heads_, _dropout=prob_, _bias=bool_, _include_norm_add=bool_, _impl='fast'_ `)`
20 | 
21 |  `impl` has two options:
22 |  * `fast` uses C++ Version
23 |  * `default` uses Python Version
24 | 
25 | ## Instructions to build on Linux
26 | 
27 | ```
28 | $ git clone https://github.com/NVIDIA/apex
29 | $ cd apex
30 | $ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" ./
31 | ```
32 | ## Try Performance Tests Yourself!
33 | Perf test script is found here!
34 | ```
35 | cd contrib/examples/multihead_attn
36 | ```
37 | #### Fast Multihead Attention
38 | ```
39 | python perf_test_multihead_attn.py --ref
40 | ```
41 | #### Fast Multihead Attention with C++ Implementation
42 | ```
43 | python perf_test_multihead_attn.py
44 | ```
45 | #### Compare with `torch.nn.MultiheadAttn`
46 | ```
47 | python perf_test_multihead_attn.py --native
48 | ```
49 | #### Test your own range!
50 | ```
51 | python perf_test_multihead_attn.py --seq-length 64 --num-seqs-start 10 --num-seqs-stop 120 --num-seqs-inc 5
52 | ```
53 | 
54 | ## Performance Comparisons
55 | 
56 | * Performance was measured with 64 token sequence lengths on an NVIDIA TitanV card.
57 | * Time is measured across multiple layers to simulate an in model scenario.
58 | 
59 | ![Multihead Attention Forward](MHA_fwd.png)
60 | ![Multihead Attention Backward](MHA_bwd.png)
61 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/multihead_attn/__init__.py:
--------------------------------------------------------------------------------
1 | from .self_multihead_attn import SelfMultiheadAttn
2 | from .encdec_multihead_attn import EncdecMultiheadAttn
3 | from .mask_softmax_dropout_func import fast_mask_softmax_dropout_func
4 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .fp16_optimizer import FP16_Optimizer
2 | from .fused_adam import FusedAdam
3 | from .fused_lamb import FusedLAMB
4 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/sparsity/__init__.py:
--------------------------------------------------------------------------------
1 | from .sparse_masklib import create_mask
2 | from .asp import ASP
3 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/test/multihead_attn/test_mha_fused_softmax.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import unittest
 3 | import torch.nn.functional as F
 4 | from apex.contrib.multihead_attn import fast_mask_softmax_dropout_func
 5 | 
 6 | class FusedSoftmaxTest(unittest.TestCase):
 7 |     def setUp(self, seed=1234):
 8 |         torch.manual_seed(seed)
 9 |         torch.cuda.manual_seed_all(seed)
10 | 
11 |         self.seq_length   = 80
12 |         self.sequences    = 10
13 |         self.hidden_dim   = 1024
14 |         self.heads        = 16
15 |         self.dropout_prob = 0.0
16 | 
17 |         self.mask = (torch.randn(self.sequences,self.seq_length)>0).cuda()
18 |         self.mask = self.mask.half()*-10000
19 |         self.ref_inputs = torch.randn(self.heads * self.sequences, self.seq_length, self.seq_length, 
20 |                                       dtype=torch.float16, device=torch.device("cuda")).requires_grad_(True)
21 |         
22 |         self.tst_inputs = self.ref_inputs.clone().detach().requires_grad_(True)
23 | 
24 |     def test_fused_softmax(self) :
25 |         grads = torch.randn_like(self.tst_inputs)
26 |         y_ref = self.ref_inputs.view(self.sequences, self.heads, self.seq_length, self.seq_length)
27 |         y_ref = y_ref + self.mask.unsqueeze(1).unsqueeze(2)
28 |         y_ref = y_ref.view(self.sequences*self.heads, self.seq_length, self.seq_length) 
29 |         y_ref = F.softmax(y_ref, dim=-1)
30 |         y_ref = torch._fused_dropout(y_ref, 1.0)    
31 |    
32 |         y_tst = fast_mask_softmax_dropout_func(True, self.heads, self.tst_inputs, self.mask, True, 0.0)        
33 |         y_ref[0].backward(grads)
34 |         y_tst.backward(grads)
35 | 
36 |         self.assertTrue(torch.allclose(self.ref_inputs,  self.tst_inputs,  atol=1e-5, rtol=1e-5))
37 |         self.assertTrue(torch.allclose(y_ref[0], y_tst, atol=1e-3, rtol=1e-3))
38 |         self.assertTrue(torch.allclose(self.ref_inputs.grad, self.tst_inputs.grad, atol=1e-3, rtol=1e-3))
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     unittest.main()
43 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/transducer/__init__.py:
--------------------------------------------------------------------------------
1 | from .transducer import TransducerJoint
2 | from .transducer import TransducerLoss


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/xentropy/__init__.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     import torch
 3 |     import xentropy_cuda
 4 |     from .softmax_xentropy import SoftmaxCrossEntropyLoss
 5 |     del torch
 6 |     del xentropy_cuda
 7 |     del softmax_xentropy
 8 | except ImportError as err:
 9 |     print("apex was installed without --xentropy flag, contrib.xentropy is not available")
10 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/contrib/xentropy/softmax_xentropy.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import xentropy_cuda
 3 | 
 4 | class SoftmaxCrossEntropyLoss(torch.autograd.Function):
 5 |     @staticmethod
 6 |     def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False):
 7 |         losses, max_log_sum_exp = xentropy_cuda.forward(
 8 |             logits, labels, smoothing, half_to_float)
 9 |         losses.masked_fill_(labels==padding_idx, 0)
10 | 
11 |         ctx.save_for_backward(logits, max_log_sum_exp, labels,
12 |             torch.FloatTensor([smoothing]),
13 |             torch.LongTensor([padding_idx]))
14 | 
15 |         return losses
16 | 
17 |     @staticmethod
18 |     def backward(ctx, grad_loss):
19 |         logits, max_log_sum_exp, labels, smoothing, padding_idx = ctx.saved_tensors
20 | 
21 |         if not grad_loss.is_contiguous():
22 |             grad_loss = grad_loss.contiguous()
23 |         grad_loss.masked_fill_(labels==padding_idx.item(), 0)
24 |         grad_logits = xentropy_cuda.backward(
25 |             grad_loss.contiguous(), logits, max_log_sum_exp,
26 |             labels, smoothing.item())
27 | 
28 |         return grad_logits, None, None, None, None
29 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/fp16_utils/README.md:
--------------------------------------------------------------------------------
 1 | fp16_optimizer.py contains `FP16_Optimizer`, a Python class designed to wrap an existing Pytorch optimizer and automatically enable master parameters and loss scaling in a manner transparent to the user.  To use `FP16_Optimizer`, only two lines of one's Python model need to change.
 2 | 
 3 | #### [FP16_Optimizer API documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
 4 | 
 5 | #### [Simple examples with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple)
 6 | 
 7 | #### [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
 8 | 
 9 | #### [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model)
10 | 
11 | 
12 | fp16_util.py contains a number of utilities to manually manage master parameters and loss scaling, if the user chooses.  
13 | 
14 | #### [Manual management documentation](https://nvidia.github.io/apex/fp16_utils.html#manual-master-parameter-management)
15 | 
16 | The [Imagenet with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) and [word_language_model with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/word_language_model) directories also contain `main.py` files that demonstrate manual management of master parameters and static loss scaling.  These examples illustrate what sort of operations `FP16_Optimizer` is performing automatically.
17 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/fp16_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from .fp16util import (
 2 |     BN_convert_float,
 3 |     network_to_half,
 4 |     prep_param_lists,
 5 |     model_grads_to_master_grads,
 6 |     master_params_to_model_params,
 7 |     tofp16,
 8 |     to_python_float,
 9 |     clip_grad_norm,
10 |     convert_module,
11 |     convert_network,
12 |     FP16Model,
13 | )
14 | 
15 | from .fp16_optimizer import FP16_Optimizer
16 | from .loss_scaler import LossScaler, DynamicLossScaler
17 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/mlp/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlp import *
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/mlp/mlp.py:
--------------------------------------------------------------------------------
 1 | from copy import copy
 2 | import math
 3 | import torch
 4 | from torch import nn
 5 | import mlp_cuda
 6 | from .. import amp
 7 | 
 8 | class MlpFunction(torch.autograd.Function):
 9 |     @staticmethod
10 |     def forward(ctx, bias, activation, *args):
11 |         output = mlp_cuda.forward(bias, activation, args)
12 |         ctx.save_for_backward(*args)
13 |         ctx.outputs = output
14 |         ctx.bias = bias
15 |         ctx.activation = activation
16 |         return output[0]
17 | 
18 |     @staticmethod
19 |     def backward(ctx, grad_o):
20 |         grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
21 |         del ctx.outputs
22 |         return (None, None, *grads)
23 | 
24 | mlp_function = amp.half_function(MlpFunction.apply)
25 | 
26 | class MLP(torch.nn.Module):
27 |     """Launch MLP in C++
28 | 
29 |     Args:
30 |         mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024
31 |         bias (bool): Default True:
32 |         relu (bool): Default True
33 |     """
34 |     def __init__(self, mlp_sizes, bias=True, activation='relu'):
35 |         super(MLP, self).__init__()
36 |         self.num_layers = len(mlp_sizes) - 1
37 |         self.mlp_sizes = copy(mlp_sizes)
38 |         self.bias = 1 if bias else 0
39 | 
40 |         if activation is 'none':
41 |             self.activation = 0
42 |         elif activation is 'relu':
43 |             self.activation = 1
44 |         elif activation is 'sigmoid':
45 |             self.activation = 2
46 |         else:
47 |             raise TypeError("activation must be relu or none.")
48 | 
49 |         self.weights = []
50 |         self.biases = []
51 |         for i in range(self.num_layers):
52 |             w = torch.nn.Parameter(torch.empty(mlp_sizes[i+1], mlp_sizes[i]))
53 |             self.weights.append(w)
54 |             name = 'weight_{}'.format(i)
55 |             setattr(self, name, w)
56 |             if self.bias:
57 |                 b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
58 |                 self.biases.append(b)
59 |                 name = 'bias_{}'.format(i)
60 |                 setattr(self, name, b)
61 | 
62 |         self.reset_parameters()
63 | 
64 |     def reset_parameters(self):
65 |         for weight in self.weights:
66 |             dimsum = weight.size(0) + weight.size(1)
67 |             std = math.sqrt(2. / float(dimsum))
68 |             nn.init.normal_(weight, 0., std)
69 |         if self.bias:
70 |             for bias in self.biases:
71 |                 std = math.sqrt(1. / float(bias.size(0)))
72 |                 nn.init.normal_(bias, 0., std)
73 | 
74 |     def forward(self, input):
75 |         return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)
76 | 
77 |     def extra_repr(self):
78 |         s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
79 |         return s
80 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/multi_tensor_apply/__init__.py:
--------------------------------------------------------------------------------
1 | from .multi_tensor_apply import MultiTensorApply
2 | 
3 | multi_tensor_applier = MultiTensorApply(2048*32)
4 | 
5 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/multi_tensor_apply/multi_tensor_apply.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | class MultiTensorApply(object):
 4 |     available = False
 5 |     warned = False
 6 | 
 7 |     def __init__(self, chunk_size):
 8 |         try:
 9 |             import amp_C
10 |             MultiTensorApply.available = True
11 |             self.chunk_size = chunk_size
12 |         except ImportError as err:
13 |             MultiTensorApply.available = False
14 |             MultiTensorApply.import_err = err
15 | 
16 |     def check_avail(self):
17 |         if MultiTensorApply.available == False:
18 |             raise RuntimeError(
19 |                 "Attempted to call MultiTensorApply method, but MultiTensorApply "
20 |                 "is not available, possibly because Apex was installed without "
21 |                 "--cpp_ext --cuda_ext.  Original import error message:",
22 |                 MultiTensorApply.import_err)
23 | 
24 |     def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
25 |         self.check_avail()
26 | 
27 |         return op(self.chunk_size,
28 |                   noop_flag_buffer,
29 |                   tensor_lists,
30 |                   *args)
31 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/normalization/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_layer_norm import FusedLayerNorm
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused_sgd import FusedSGD
2 | from .fused_adam import FusedAdam
3 | from .fused_novograd import FusedNovoGrad
4 | from .fused_lamb import FusedLAMB
5 | from .fused_adagrad import FusedAdagrad


--------------------------------------------------------------------------------
/ghost/apex/apex/parallel/README.md:
--------------------------------------------------------------------------------
 1 | ## Distributed Data Parallel
 2 | 
 3 | distributed.py contains the source code for `apex.parallel.DistributedDataParallel`, a module wrapper that enables multi-process multi-GPU data parallel training optimized for NVIDIA's NCCL communication library.
 4 | 
 5 | `apex.parallel.DistributedDataParallel` achieves high performance by overlapping communication with
 6 | computation in the backward pass and bucketing smaller transfers to reduce the total number of
 7 | transfers required.
 8 | 
 9 | multiproc.py contains the source code for `apex.parallel.multiproc`, a launch utility that places one process on each of the node's available GPUs.
10 | 
11 | #### [API Documentation](https://nvidia.github.io/apex/parallel.html)
12 | 
13 | #### [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed)
14 | 
15 | #### [Imagenet example with Mixed Precision](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
16 | 
17 | #### [Simple example with FP16_Optimizer](https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple/distributed_apex)
18 | 
19 | ### Synchronized Batch Normalization
20 | 
21 | `apex.parallel.SyncBatchNorm` has similar APIs as with `torch.nn.BatchNorm*N*d`.
22 | It reduces stats on the first (channel) dimension of the Tensor and accepts
23 | arbitrary spatial dimensions.
24 | 
25 | #### Installation
26 | 
27 | Apex provides two sync BN implementation:
28 | 
29 | 1. There is the Python-only implementation, which is the default implementation
30 | when install with `python setup.py install`.
31 | It uses PyTorch primitive operations and distributed communication package from
32 | `torch.distributed`.
33 | 
34 |    - _Python-only implementation requires input tensor to be of same data type as
35 | layer_
36 | 
37 | 2. We also provide implementation with kernels through CUDA/C++ extension with
38 | improved performance. We are experimenting with Welford and Kahan for reduction
39 | hoping to get better accuracy.
40 |    To use the kernel implementation, user need to install Apex with CUDA extension
41 | enabled `python setup.py install --cuda_ext`.
42 | 
43 |    - _Custom kernel implementation supports fp16 input with fp32 layer as cudnn.
44 | This is required to run imagenet example in fp16._
45 | 
46 |    - _Currently kernel implementation only supports GPU._
47 | 
48 | #### HowTo
49 | 
50 | 1. User could use `apex.parallel.SyncBatchNorm` by building their module with
51 | the layer explicitly.
52 | 
53 | ```
54 | import apex
55 | input_t = torch.randn(3, 5, 20).cuda()
56 | sbn = apex.parallel.SyncBatchNorm(5).cuda()
57 | output_t = sbn(input)
58 | ```
59 | 
60 | 2. User could also take a constructed `torch.nn.Model` and replace all its `torch.nn.BatchNorm*N*d` modules with `apex.parallel.SyncBatchNorm` through utility function `apex.parallel.convert_syncbn_model`.
61 | 
62 | ```
63 | # model is an instance of torch.nn.Module
64 | import apex
65 | sync_bn_model = apex.parallel.convert_syncbn_model(model)
66 | ```
67 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/parallel/multiproc.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import sys
 3 | import subprocess
 4 | 
 5 | def docstring_hack():
 6 |     """
 7 |     Multiproc file which will launch a set of processes locally for multi-gpu
 8 |     usage: python -m apex.parallel.multiproc main.py ...
 9 |     """
10 |     pass
11 | 
12 | argslist = list(sys.argv)[1:]
13 | world_size = torch.cuda.device_count()
14 | 
15 | if '--world-size' in argslist:
16 |     world_size = int(argslist[argslist.index('--world-size')+1])
17 | else:
18 |     argslist.append('--world-size')
19 |     argslist.append(str(world_size))
20 | 
21 | workers = []
22 | 
23 | for i in range(world_size):
24 |     if '--rank' in argslist:
25 |         argslist[argslist.index('--rank')+1] = str(i)
26 |     else:
27 |         argslist.append('--rank')
28 |         argslist.append(str(i))
29 |     stdout = None if i == 0 else open("GPU_"+str(i)+".log", "w")
30 |     print(argslist)
31 |     p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
32 |     workers.append(p)
33 | 
34 | for p in workers:
35 |     p.wait()
36 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/FAQs.md:
--------------------------------------------------------------------------------
 1 | 1. How do I intercept the Adam optimizer in APEX ?
 2 | 
 3 | 	```python
 4 | 	from apex import pyprof
 5 | 	import fused_adam_cuda
 6 | 	pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
 7 | 	```
 8 | 
 9 | 2. If you are using JIT and/or AMP, the correct initialization sequence is
10 | 	1. Let any JIT to finish.
11 | 	2. Initlialize pyprof `pyprof.nvtx.init()`.
12 | 	3. Initialize AMP.
13 | 
14 | 3. How do I profile with `torch.distributed.launch` ?
15 | 
16 | 	```python
17 | 	nvprof -f -o net%p.sql \
18 | 		--profile-from-start off \
19 | 		--profile-child-processes \
20 | 		python -m torch.distributed.launch net.py
21 | 	```
22 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/__init__.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | 
3 | from . import nvtx, prof
4 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.sql
3 | *.dict
4 | *.csv
5 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/apex/README.md:
--------------------------------------------------------------------------------
1 | This directory has examples of how to use `pyprof` with APEX extensions e.g. `fused_adam_cuda` and `fused_layer_norm_cuda`.
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/apex/fused_adam.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import fused_adam_cuda
 3 | from apex.optimizers import FusedAdam, FP16_Optimizer
 4 | from apex import pyprof
 5 | 
 6 | pyprof.nvtx.init()
 7 | pyprof.nvtx.wrap(fused_adam_cuda, 'adam')
 8 | 
 9 | model = torch.nn.Linear(10, 20).cuda().half()
10 | criterion = torch.nn.CrossEntropyLoss().cuda()
11 | optimizer = FusedAdam(model.parameters())
12 | optimizer = FP16_Optimizer(optimizer)
13 | 
14 | x = torch.ones(32, 10).cuda().half()
15 | target = torch.empty(32, dtype=torch.long).random_(20).cuda()
16 | y = model(x)
17 | loss = criterion(y, target)
18 | optimizer.zero_grad()
19 | loss.backward()
20 | optimizer.step()
21 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/apex/fused_layer_norm.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import fused_layer_norm_cuda
 3 | from apex.normalization import FusedLayerNorm
 4 | from apex import pyprof
 5 | 
 6 | pyprof.nvtx.init()
 7 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward')
 8 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward')
 9 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'forward_affine')
10 | pyprof.nvtx.wrap(fused_layer_norm_cuda, 'backward_affine')
11 | 
12 | input = torch.randn(20, 5, 10, 10).cuda()
13 | 
14 | # With Learnable Parameters
15 | m = FusedLayerNorm(input.size()[1:]).cuda()
16 | output = m(input)
17 | 
18 | # Without Learnable Parameters
19 | m = FusedLayerNorm(input.size()[1:], elementwise_affine=False).cuda()
20 | output = m(input)
21 | 
22 | # Normalize over last two dimensions
23 | m = FusedLayerNorm([10, 10]).cuda()
24 | output = m(input)
25 | 
26 | # Normalize over last dimension of size 10
27 | m = FusedLayerNorm(10).cuda()
28 | output = m(input)
29 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/apex/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | SCRIPT=`realpath $0`
 6 | SCRIPTPATH=`dirname $SCRIPT`
 7 | PYPROF="$SCRIPTPATH/../.."
 8 | 
 9 | parse="python $PYPROF/parse/parse.py"
10 | prof="python $PYPROF/prof/prof.py"
11 | 
12 | for f in *.py
13 | do
14 | 	base=`basename $f .py`
15 | 	sql=$base.sql
16 | 	dict=$base.dict
17 | 
18 | 	#NVprof
19 | 	echo "nvprof -fo $sql python $f"
20 | 	nvprof -fo $sql python $f
21 | 
22 | 	#Parse
23 | 	echo $parse $sql
24 | 	$parse $sql > $dict
25 | 
26 | 	#Prof
27 | 	echo $prof $dict
28 | 	$prof -w 130 $dict
29 | 	\rm $sql $dict
30 | done
31 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/custom_func_module/README.md:
--------------------------------------------------------------------------------
1 | This directory has examples which show how to intercept (monkey patch) custom functions and modules with `pyprof`. No changes are required in `pyprof/parse`, however, users can add support for bytes and flops calculation for custom functions and modules in `pyprof/prof` by extending the `OperatorLayerBase` class.
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/custom_func_module/custom_function.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import torch
 4 | import torch.cuda.profiler as profiler
 5 | from apex import pyprof
 6 | #Initialize pyprof
 7 | pyprof.nvtx.init()
 8 | 
 9 | class Foo(torch.autograd.Function):
10 | 	@staticmethod
11 | 	def forward(ctx, in1, in2):
12 | 		out = in1 + in2		#This could be a custom C/C++ function.
13 | 		return out
14 | 
15 | 	@staticmethod
16 | 	def backward(ctx, grad):
17 | 		in1_grad = grad		#This could be a custom C/C++ function.
18 | 		in2_grad = grad		#This could be a custom C/C++ function.
19 | 		return in1_grad, in2_grad
20 | 
21 | #Hook the forward and backward functions to pyprof
22 | pyprof.nvtx.wrap(Foo, 'forward')
23 | pyprof.nvtx.wrap(Foo, 'backward')
24 | 
25 | foo = Foo.apply
26 | 
27 | x = torch.ones(4,4).cuda()
28 | y = torch.ones(4,4).cuda()
29 | 
30 | with torch.autograd.profiler.emit_nvtx():
31 | 	profiler.start()
32 | 	z = foo(x,y)
33 | 	profiler.stop()
34 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/custom_func_module/custom_module.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import torch
 4 | import torch.cuda.profiler as profiler
 5 | from apex import pyprof
 6 | pyprof.nvtx.init()
 7 | 
 8 | class Foo(torch.nn.Module):
 9 |     def __init__(self, size):
10 |         super(Foo, self).__init__()
11 |         self.n = torch.nn.Parameter(torch.ones(size))
12 |         self.m = torch.nn.Parameter(torch.ones(size))
13 | 
14 |     def forward(self, input):
15 |         return self.n*input + self.m
16 | 
17 | #Hook the forward function to pyprof
18 | pyprof.nvtx.wrap(Foo, 'forward')
19 | 
20 | foo = Foo(4)
21 | foo.cuda()
22 | x = torch.ones(4).cuda()
23 | 
24 | with torch.autograd.profiler.emit_nvtx():
25 | 	profiler.start()
26 | 	z = foo(x)
27 | 	profiler.stop()
28 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/custom_func_module/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | SCRIPT=`realpath $0`
 6 | SCRIPTPATH=`dirname $SCRIPT`
 7 | PYPROF="$SCRIPTPATH/../.."
 8 | 
 9 | parse="python $PYPROF/parse/parse.py"
10 | prof="python $PYPROF/prof/prof.py"
11 | 
12 | for f in *.py
13 | do
14 | 	base=`basename $f .py`
15 | 	sql=$base.sql
16 | 	dict=$base.dict
17 | 
18 | 	#NVprof
19 | 	echo "nvprof -fo $sql python $f"
20 | 	nvprof -fo $sql python $f
21 | 
22 | 	#Parse
23 | 	echo $parse $sql
24 | 	$parse $sql > $dict
25 | 
26 | 	#Prof
27 | 	echo $prof $dict
28 | 	$prof -w 130 $dict
29 | 	\rm $sql $dict
30 | done
31 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/imagenet/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | SCRIPT=`realpath $0`
 6 | SCRIPTPATH=`dirname $SCRIPT`
 7 | PYPROF="$SCRIPTPATH/../.."
 8 | 
 9 | parse="python -m apex.pyprof.parse"
10 | prof="python -m apex.pyprof.prof"
11 | 
12 | for net in "resnet50"
13 | do
14 | 	for optim in adam sgd
15 | 	do
16 | 		for batch in 32 64
17 | 		do
18 | 			base="torchvision".$net.$optim.$batch
19 | 			sql=$base.sql
20 | 			dict=$base.dict
21 | 
22 | 			#NVprof
23 | 			echo "nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch"
24 | 			nvprof -fo $sql --profile-from-start off python imagenet.py -m ${net} -o $optim -b $batch
25 | 
26 | 			#Parse
27 | 			echo $parse $sql
28 | 			$parse $sql > $dict
29 | 
30 | 			#Prof
31 | 			echo $prof $dict
32 | 			$prof -w 130 $dict
33 | #			\rm $sql $dict
34 | 		done
35 | 	done
36 | done
37 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/jit/README.md:
--------------------------------------------------------------------------------
 1 | *As of this writing, these examples do not work
 2 | because of changes being proposed in PyTorch.*
 3 | 
 4 | There are two ways to use PyTorch JIT
 5 |  - Scripting
 6 |  - Tracing
 7 | 
 8 | In addition, we can JIT a
 9 |  - Stand alone function
10 |  - Class / class method
11 | 
12 | This directory has an example for each of the 4 cases.
13 | Intercepting (monkey patching) JITted code has a few extra steps,
14 | which are explained through comments.
15 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/jit/jit_script_function.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import torch
 4 | import torch.cuda.profiler as profiler
 5 | from apex import pyprof
 6 | 
 7 | #The following creates an object "foo" of type ScriptModule
 8 | #The new object has a function called "forward"
 9 | 
10 | @torch.jit.script
11 | def foo(x, y):
12 | 	return torch.sigmoid(x) + y
13 | 
14 | #Initialize pyprof after the JIT step
15 | pyprof.nvtx.init()
16 | 
17 | #Assign a name to the object "foo"
18 | foo.__name__ = "foo"
19 | 
20 | #Hook up the forward function to pyprof
21 | pyprof.nvtx.wrap(foo, 'forward')
22 | 
23 | x = torch.zeros(4,4).cuda()
24 | y = torch.ones(4,4).cuda()
25 | 
26 | with torch.autograd.profiler.emit_nvtx():
27 | 	profiler.start()
28 | 	z = foo(x, y)
29 | 	profiler.stop()
30 | 	print(z)
31 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/jit/jit_script_method.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import torch
 4 | import torch.cuda.profiler as profiler
 5 | from apex import pyprof
 6 | 
 7 | class Foo(torch.jit.ScriptModule):
 8 |     def __init__(self, size):
 9 |         super(Foo, self).__init__()
10 |         self.n = torch.nn.Parameter(torch.ones(size))
11 |         self.m = torch.nn.Parameter(torch.ones(size))
12 | 
13 |     @torch.jit.script_method
14 |     def forward(self, input):
15 |         return self.n*input + self.m
16 | 
17 | #Initialize pyprof after the JIT step
18 | pyprof.nvtx.init()
19 | 
20 | #Hook up the forward function to pyprof
21 | pyprof.nvtx.wrap(Foo, 'forward')
22 | 
23 | foo = Foo(4)
24 | foo.cuda()
25 | x = torch.ones(4).cuda()
26 | 
27 | with torch.autograd.profiler.emit_nvtx():
28 | 	profiler.start()
29 | 	z = foo(x)
30 | 	profiler.stop()
31 | 	print(z)
32 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/jit/jit_trace_function.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import torch
 4 | import torch.cuda.profiler as profiler
 5 | from apex import pyprof
 6 | 
 7 | def foo(x, y):
 8 | 	return torch.sigmoid(x) + y
 9 | 
10 | x = torch.zeros(4,4).cuda()
11 | y = torch.ones(4,4).cuda()
12 | 
13 | #JIT the function using tracing
14 | #This returns an object of type ScriptModule with a forward method.
15 | traced_foo = torch.jit.trace(foo, (x,y))
16 | 
17 | #Initialize pyprof after the JIT step
18 | pyprof.nvtx.init()
19 | 
20 | #Assign a name to the object "traced_foo"
21 | traced_foo.__dict__['__name__'] = "foo"
22 | 
23 | #Hook up the forward function to pyprof
24 | pyprof.nvtx.wrap(traced_foo, 'forward')
25 | 
26 | with torch.autograd.profiler.emit_nvtx():
27 | 	profiler.start()
28 | 	z = traced_foo(x, y)
29 | 	profiler.stop()
30 | 	print(z)
31 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/jit/jit_trace_method.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import torch
 4 | import torch.cuda.profiler as profiler
 5 | from apex import pyprof
 6 | 
 7 | class Foo(torch.nn.Module):
 8 |     def __init__(self, size):
 9 |         super(Foo, self).__init__()
10 |         self.n = torch.nn.Parameter(torch.ones(size))
11 |         self.m = torch.nn.Parameter(torch.ones(size))
12 | 
13 |     def forward(self, input):
14 |         return self.n*input + self.m
15 | 
16 | foo = Foo(4)
17 | foo.cuda()
18 | x = torch.ones(4).cuda()
19 | 
20 | #JIT the class using tracing
21 | traced_foo = torch.jit.trace(foo, x)
22 | 
23 | #Initialize pyprof after the JIT step
24 | pyprof.nvtx.init()
25 | 
26 | #Assign a name to the object "traced_foo"
27 | traced_foo.__dict__['__name__'] = "foo"
28 | 
29 | #Hook up the forward function to pyprof
30 | pyprof.nvtx.wrap(traced_foo, 'forward')
31 | 
32 | with torch.autograd.profiler.emit_nvtx():
33 | 	profiler.start()
34 | 	z = traced_foo(x)
35 | 	profiler.stop()
36 | 	print(z)
37 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/jit/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | SCRIPT=`realpath $0`
 6 | SCRIPTPATH=`dirname $SCRIPT`
 7 | PYPROF="$SCRIPTPATH/../.."
 8 | 
 9 | parse="python $PYPROF/parse/parse.py"
10 | prof="python $PYPROF/prof/prof.py"
11 | 
12 | for f in *.py
13 | do
14 | 	base=`basename $f .py`
15 | 	sql=$base.sql
16 | 	dict=$base.dict
17 | 
18 | 	#NVprof
19 | 	echo "nvprof -fo $sql python $f"
20 | 	nvprof -fo $sql python $f
21 | 
22 | 	#Parse
23 | 	echo $parse $sql
24 | 	$parse $sql > $dict
25 | 
26 | 	#Prof
27 | 	echo $prof $dict
28 | 	$prof -w 130 $dict
29 | 	\rm $sql $dict
30 | done
31 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/lenet.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | import torch.cuda.profiler as profiler
 7 | import torch.optim as optim
 8 | 
 9 | from apex import pyprof
10 | pyprof.nvtx.init()
11 | 
12 | class LeNet5(nn.Module):
13 | 	def __init__(self):
14 | 		super(LeNet5, self).__init__()
15 | 		# 1 input image channel, 6 output channels, 5x5 square convolution
16 | 		# kernel
17 | 		self.conv1 = nn.Conv2d(1, 6, 5)
18 | 		self.conv2 = nn.Conv2d(6, 16, 5)
19 | 		# an affine operation: y = Wx + b
20 | 		self.fc1 = nn.Linear(16 * 5 * 5, 120)
21 | 		self.fc2 = nn.Linear(120, 84)
22 | 		self.fc3 = nn.Linear(84, 10)
23 | 
24 | 	def forward(self, x):
25 | 		# Max pooling over a (2, 2) window
26 | 		x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
27 | 		# If the size is a square you can only specify a single number
28 | 		x = F.max_pool2d(F.relu(self.conv2(x)), 2)
29 | 		x = x.view(-1, self.num_flat_features(x))
30 | 		x = F.relu(self.fc1(x))
31 | 		x = F.relu(self.fc2(x))
32 | 		x = self.fc3(x)
33 | 		return x
34 | 
35 | 	def num_flat_features(self, x):
36 | 		size = x.size()[1:]  # all dimensions except the batch dimension
37 | 		num_features = 1
38 | 		for s in size:
39 | 			num_features *= s
40 | 		return num_features
41 | 
42 | with torch.autograd.profiler.emit_nvtx():
43 | 
44 | 	net = LeNet5().cuda()
45 | 
46 | 	input = torch.randn(1, 1, 32, 32).cuda()
47 | 	out = net(input)
48 | 
49 | 	target = torch.randn(10)			# a dummy target, for example
50 | 	target = target.view(1, -1).cuda()	# make it the same shape as output
51 | 	criterion = nn.MSELoss()
52 | 
53 | 	# create your optimizer
54 | 	optimizer = optim.SGD(net.parameters(), lr=0.01)
55 | 
56 | 	# in your training loop:
57 | 	optimizer.zero_grad()	# zero the gradient buffers
58 | 
59 | 	profiler.start()
60 | 	output = net(input)
61 | 	loss = criterion(output, target)
62 | 	loss.backward()
63 | 	optimizer.step()	# Does the update
64 | 	profiler.stop()
65 | 
66 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/simple.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """
 4 | This simple file provides an example of how to
 5 |  - import the pyprof library and initialize it
 6 |  - use the emit_nvtx context manager
 7 |  - start and stop the profiler
 8 | 
 9 | Only kernels within profiler.start and profiler.stop calls are profiled.
10 | To profile
11 | $ nvprof -f -o simple.sql --profile-from-start off ./simple.py
12 | """
13 | 
14 | import sys
15 | import torch
16 | import torch.cuda.profiler as profiler
17 | 
18 | #Import and initialize pyprof
19 | from apex import pyprof
20 | pyprof.nvtx.init()
21 | 
22 | a = torch.randn(5, 5).cuda()
23 | b = torch.randn(5, 5).cuda()
24 | 
25 | #Context manager
26 | with torch.autograd.profiler.emit_nvtx():
27 | 
28 | 	#Start profiler
29 | 	profiler.start()
30 | 
31 | 	c = a + b
32 | 	c = torch.mul(a,b)
33 | 	c = torch.matmul(a,b)
34 | 	c = torch.argmax(a, dim=1)
35 | 	c = torch.nn.functional.pad(a, (1,1))
36 | 
37 | 	#Stop profiler
38 | 	profiler.stop()
39 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/user_annotation/README.md:
--------------------------------------------------------------------------------
 1 | Nvidia NVTX range markers (https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm) 
 2 | are a useful tool to capture and observe events and code ranges etc. 
 3 | Using PyTorch APIs e.g, `torch.cuda.nvtx.range_push("xxx")` and `torch.cuda.nvtx.range_pop()` users can easily add their own NVTX range markers. These markers can then be observed in the Nvidia Visual Profiler (NVVP).
 4 | 
 5 | While inserting NVTX markers (strings), if the users follow a specific string pattern `"layer:your_string_here"` e.g. `"layer:conv1"` or `"layer:encoder_layer_3_self_attention`, then `pyprof` will display the strings `conv1` and `encoder_layer_3_self_attention` next to the associated kernels in the output of `prof.py` when used with the `-c layer` option.
 6 | 
 7 | NVTX range markers can be nested and if users follow the above string pattern, the output of `prof.py` will show all the markers associated with a kernel.
 8 | 
 9 | The file `resnet.py` (a simplified version of the torchvision model) shows an example of how users can add (nested) NVTX markers with information which can greatly aid in understanding and analysis of networks.
10 | 
11 | Note that the pattern `"layer:your_string_here"` was chosen to aid information extraction by `pyprof`. The tool will work seamlessly even if there are other markers or no markers at all.
12 | 
13 | ### To run
14 | 
15 | ```sh
16 | nvprof -fo resnet.sql --profile-from-start off python resnet.py
17 | parse.py resnet.sql > resnet.dict
18 | prof.py --csv -c idx,layer,dir,mod,op,kernel,params,sil resnet.dict
19 | ```
20 | 
21 | The file `resnet.sql` can also be opened with NVVP as usual.
22 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/examples/user_annotation/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | SCRIPT=`realpath $0`
 6 | SCRIPTPATH=`dirname $SCRIPT`
 7 | PYPROF="$SCRIPTPATH/../.."
 8 | 
 9 | parse="python $PYPROF/parse/parse.py"
10 | prof="python $PYPROF/prof/prof.py"
11 | 
12 | for f in *.py
13 | do
14 | 	base=`basename $f .py`
15 | 	sql=$base.sql
16 | 	dict=$base.dict
17 | 
18 | 	#NVprof
19 | 	echo "nvprof -fo --profile-from-start off $sql python $f"
20 | 	nvprof -fo $sql --profile-from-start off python $f
21 | 
22 | 	#Parse
23 | 	echo $parse $sql
24 | 	$parse $sql > $dict
25 | 
26 | 	#Prof
27 | 	echo $prof $dict
28 | 	#$prof -w 130 $dict
29 | 	$prof --csv -c idx,layer,dir,mod,op,kernel,params,sil $dict
30 | 	\rm $sql $dict
31 | done
32 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/nvtx/__init__.py:
--------------------------------------------------------------------------------
1 | from .nvmarker import init
2 | from .nvmarker import add_wrapper as wrap
3 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/parse/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/apex/pyprof/parse/__init__.py


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/parse/__main__.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | 
 3 | try:
 4 |     from .parse import main
 5 | except ImportError as e:
 6 |     warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?)")
 7 |     raise e
 8 | 
 9 | if __name__ == '__main__':
10 |     main()
11 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/parse/db.py:
--------------------------------------------------------------------------------
 1 | import sys, sqlite3
 2 | 
 3 | class DB(object):
 4 | 	"""
 5 | 	This class provides functions for DB operations
 6 | 	with exception handling.
 7 | 	"""
 8 | 
 9 | 	def __init__(self, dbFile):
10 | 		try:
11 | 			conn = sqlite3.connect(dbFile)
12 | 			conn.row_factory = sqlite3.Row
13 | 			c = conn.cursor()
14 | 		except:
15 | 			print("Error opening {}".format(dbFile))
16 | 			sys.exit(1)
17 | 
18 | 		self.conn = conn
19 | 		self.c = c
20 | 
21 | 	def select(self, cmd):
22 | 		try:
23 | 			self.c.execute(cmd)
24 | 			#rows = self.c.fetchall()
25 | 			rows = [dict(row) for row in self.c.fetchall()]
26 | 		except sqlite3.Error as e:
27 | 			print(e)
28 | 			sys.exit(1)
29 | 		except:
30 | 			print("Uncaught error in SQLite access while executing {}".format(cmd))
31 | 			sys.exit(1)
32 | 
33 | 		#print(rows)
34 | 		return rows
35 | 
36 | 	def insert(self, cmd, data):
37 | 		try:
38 | 			self.c.execute(cmd, data)
39 | 		except sqlite3.Error as e:
40 | 			print(e)
41 | 			sys.exit(1)
42 | 		except:
43 | 			print("Uncaught error in SQLite access while executing {}".format(cmd))
44 | 			sys.exit(1)
45 | 
46 | 	def execute(self, cmd):
47 | 		try:
48 | 			self.c.execute(cmd)
49 | 		except sqlite3.Error as e:
50 | 			print(e)
51 | 			sys.exit(1)
52 | 		except:
53 | 			print("Uncaught error in SQLite access while executing {}".format(cmd))
54 | 			sys.exit(1)
55 | 
56 | 	def commit(self):
57 | 		self.conn.commit()
58 | 
59 | 	def close(self):
60 | 		self.c.close()
61 | 		self.conn.close()
62 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/__init__.py:
--------------------------------------------------------------------------------
1 | from . import data, prof
2 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/__main__.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | 
 3 | try:
 4 |     from .prof import main
 5 | except ImportError as e:
 6 |     warnings.warn("Did you make sure to install PyProf dependencies by using the --pyprof flag during Apex installation?")
 7 |     raise e
 8 | 
 9 | if __name__ == '__main__':
10 |     main()
11 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/activation.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | from .utility import Utility
 3 | from .base import OperatorLayerBase
 4 | 
 5 | class Activation(OperatorLayerBase):
 6 | 	"""
 7 | 	This class handles the various activation functions.
 8 | 	"""
 9 | 
10 | 	ops = ["celu", "elu", "elu_", "hardshrink", "hardtanh", "hardtanh_", "leaky_relu", "leaky_relu_", "logsigmoid", "prelu", "relu", "relu_", "relu6", "rrelu", "rrelu_", "selu", "sigmoid", "softplus", "softshrink", "softsign", "tanh", "tanhshrink", "threshold", "threshold_"]
11 | 
12 | 	def __init__(self, d):
13 | 		marker = eval(d.argMarker[0])
14 | 		mod = marker['mod']
15 | 		op = marker['op']
16 | 		args = marker['args']
17 | 
18 | 		self.marker = marker
19 | 		self.mod_ = mod
20 | 		self.op_ = op
21 | 		self.args = args
22 | 
23 | 		assert (mod in ["torch.nn.functional", "torch", "Tensor"])
24 | 
25 | 		#Filter out named parameters
26 | 		args = list(filter(lambda x : x['name'] == '', args))
27 | 
28 | 		assert (len(args) >= 1)
29 | 		arg = args[0]
30 | 		assert (arg['type'] == "tensor")
31 | 
32 | 		self.i = arg
33 | 		self.dir = d.dir
34 | 
35 | 	def params(self):
36 | 		p = OrderedDict([('T', self.i['shape']),('type', self.i['dtype'])])
37 | 		return p
38 | 
39 | 	def flops(self):
40 | 		direction = self.dir
41 | 		tensor = self.i['shape']
42 | 		t = self.i['dtype']
43 | 
44 | 		# TODO: revise
45 | 		elems = Utility.numElems(tensor)
46 | 		return elems
47 | 
48 | 	def bytes(self):
49 | 		direction = self.dir
50 | 		tensor = self.i['shape']
51 | 		t = self.i['dtype']
52 | 
53 | 		elems = Utility.numElems(tensor)
54 | 		elems = elems * (2 if direction == "fprop" else 3)
55 | 
56 | 		return elems * Utility.typeToBytes(t)
57 | 
58 | 	def tc(self):
59 | 		return "-"
60 | 
61 | 	def op(self):
62 | 		return self.op_
63 | 
64 | 	def mod(self):
65 | 		return self.mod_
66 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/base.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | 
 3 | class OperatorLayerBase(ABC):
 4 | 	"""
 5 | 	Base class for all layers and operators.
 6 | 	Every derived class should have the following functions.
 7 | 	"""
 8 | 
 9 | 	@abstractmethod
10 | 	def tc(self):
11 | 		"""
12 | 		Tensor core usage by the kernel.
13 | 		Return "1" (yes), "0" (no, but possible), "-" (not applicable)
14 | 		"""
15 | 		pass
16 | 
17 | 	@abstractmethod
18 | 	def params(self):
19 | 		"""
20 | 		Kernel parameters to be printed.
21 | 		"""
22 | 		pass
23 | 
24 | 	@abstractmethod
25 | 	def flops(self):
26 | 		"""
27 | 		Note that 1 FMA = 2 flops.
28 | 		"""
29 | 		pass
30 | 
31 | 	@abstractmethod
32 | 	def bytes(self):
33 | 		pass
34 | 
35 | 	@abstractmethod
36 | 	def mod(self):
37 | 		"""
38 | 		Name of the module/class e.g. torch.nn.functional.
39 | 		"""
40 | 		pass
41 | 
42 | 	@abstractmethod
43 | 	def op(self):
44 | 		"""
45 | 		Name of the operator e.g. sigmoid.
46 | 		"""
47 | 		pass
48 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/convert.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | from .utility import Utility
 3 | from .base import OperatorLayerBase
 4 | 
 5 | class Convert(OperatorLayerBase):
 6 | 	"""
 7 | 	Class to handle convert operations.
 8 | 	"""
 9 | 	ops = ["byte", "char", "double", "float", "half", "int", "long", "short", "to"]
10 | 
11 | 	def __init__(self, d):
12 | 		marker = eval(d.argMarker[0])
13 | 		mod = marker['mod']
14 | 		op = marker['op']
15 | 		args = marker['args']
16 | 
17 | 		self.marker = marker
18 | 		self.mod_ = mod
19 | 		self.op_ = op
20 | 		self.args = args
21 | 
22 | 		assert (mod == "Tensor")
23 | 		assert (op in Convert.ops)
24 | 		assert (len(args) == 1)
25 | 
26 | 		#The argument could be a tensor or scalar
27 | 		t = args[0]
28 | 		if t['type'] == "tensor":
29 | 			shape = t['shape']
30 | 			stype = t['dtype']
31 | 		else:
32 | 			shape = (1,)
33 | 			stype = t['type']
34 | 		if self.op_ == "to":
35 | 			op = stype
36 | 
37 | 		self.shape = shape
38 | 		self.stype = stype
39 | 		self.dtype = op
40 | 
41 | 	def params(self):
42 | 		p = OrderedDict([('T', self.shape), ('stype', self.stype), ('dtype', self.dtype)])
43 | 		return p
44 | 
45 | 	def op(self):
46 | 		return self.op_
47 | 
48 | 	def mod(self):
49 | 		return self.mod_
50 | 
51 | 	def tc(self):
52 | 		return "-"
53 | 
54 | 	def elems(self):
55 | 		return Utility.numElems(self.shape)
56 | 
57 | 	def flops(self):
58 | 		return 0
59 | 
60 | 	def bytes(self):
61 | 		b = self.elems() * (Utility.typeToBytes(self.stype) + Utility.typeToBytes(self.dtype))
62 | 		return b
63 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/data.py:
--------------------------------------------------------------------------------
 1 | from .utility import Utility
 2 | 
 3 | class Data(object):
 4 | 	"""
 5 | 	Class to store all the data for every kernel e.g. name, bytes, flops, device, stream etc.
 6 | 	"""
 7 | 	def __init__(self, kernel):
 8 | 		#Available from NVprof
 9 | 		self.tid = kernel['tid']
10 | 		self.device = kernel['device']
11 | 		self.stream = kernel['stream']
12 | 		self.grid = str(kernel['grid']).replace(" ","").replace("(","").replace(")","")
13 | 		self.block = str(kernel['block']).replace(" ","").replace("(","").replace(")","")
14 | 		self.name = kernel['kShortName'].replace(" ","_")
15 | 		self.lName = kernel['kLongName']
16 | 		self.sil = kernel['kDuration']	#units ns
17 | 
18 | 		self.index = None
19 | 
20 | 		#Markers
21 | 		self.argMarker = kernel['marker']
22 | 		self.modMarker = kernel['reprMarkers']
23 | 		self.seqMarker = kernel['seqMarker']
24 | 
25 | 		self.layer = kernel['layer']
26 | 		self.trace = kernel['trace']
27 | 
28 | 		self.seqId = kernel['seqId']
29 | 		self.altSeqId = kernel['altSeqId']
30 | 
31 | 		self.dir = kernel['dir']
32 | 		self.sub = kernel['subSeqId']
33 | 
34 | 		self.mod = "na"
35 | 		self.op = "na"
36 | 		self.params = {"na":"na"}
37 | 		self.tc = "na"
38 | 		self.flops = 0
39 | 		self.bytes = 0
40 | 
41 | 	def setParams(self, params):
42 | 		#Remove space from params
43 | 		qaz = ""
44 | 		for key,value in params.items():
45 | 			if "type" not in key:
46 | 				qaz += "{}={},".format(key,value)
47 | 			else:
48 | 				if type(value) is str:
49 | 					qaz += "{},".format(Utility.typeToString(value))
50 | 				else:
51 | 					qaz += "{}".format(value)
52 | 
53 | 		self.params = qaz.replace(" ", "")
54 | 
55 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/dropout.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | from .utility import Utility
 3 | from .base import OperatorLayerBase
 4 | 
 5 | class Dropout(OperatorLayerBase):
 6 | 
 7 | 	def __init__(self, d):
 8 | 		marker = eval(d.argMarker[0])
 9 | 		mod = marker['mod']
10 | 		op = marker['op']
11 | 		args = marker['args']
12 | 
13 | 		self.marker = marker
14 | 		self.mod_ = mod
15 | 		self.op_ = op
16 | 		self.args = args
17 | 
18 | 		assert (mod == "torch.nn.functional")
19 | 		assert (op == "dropout")
20 | 		#assert (len(args) == 1)
21 | 
22 | 		self.shape = args[0]['shape']
23 | 		self.type  = args[0]['dtype']
24 | 		self.dir = d.dir
25 | 
26 | 		return
27 | 
28 | 	def params(self):
29 | 		p = OrderedDict([('T', self.shape), ('type', self.type)])
30 | 		return p
31 | 
32 | 	def op(self):
33 | 		return self.op_
34 | 
35 | 	def mod(self):
36 | 		return self.mod_
37 | 
38 | 	def tc(self):
39 | 		return "-"
40 | 
41 | 	def elems(self):
42 | 		return Utility.numElems(self.shape)
43 | 
44 | 	def bytes(self):
45 | 		#Ignoring the cost of writing and reading the mask
46 | 		return Utility.typeToBytes(self.type) * self.elems() * 2
47 | 
48 | 	def flops(self):
49 | 		# Note: This is approximate and depends on the RNG
50 | 		return 5*self.elems()
51 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/embedding.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | from .utility import Utility
 3 | from .base import OperatorLayerBase
 4 | 
 5 | class Embedding(OperatorLayerBase):
 6 | 
 7 | 	def __init__(self, d):
 8 | 		marker = eval(d.argMarker[0])
 9 | 		mod = marker['mod']
10 | 		op = marker['op']
11 | 		args = marker['args']
12 | 
13 | 		self.marker = marker
14 | 		self.mod_ = mod
15 | 		self.op_ = op
16 | 		self.args = args
17 | 
18 | 		assert (mod == "torch.nn.functional")
19 | 		assert (op == "embedding")
20 | 
21 | 		self.ishape = args[0]['shape']
22 | 		self.itype = args[0]['dtype']
23 | 
24 | 		self.eshape = args[1]['shape']
25 | 		self.etype = args[1]['dtype']
26 | 
27 | 		assert (len(self.eshape) == 2)
28 | 
29 | 		self.dir = d.dir
30 | 		self.sub = d.sub
31 | 		return
32 | 
33 | 	def params(self):
34 | 		p = OrderedDict([('I', self.ishape), ('itype', self.itype), ('E', self.eshape), ('etype', self.etype)])
35 | 		return p
36 | 
37 | 	def op(self):
38 | 		return self.op_
39 | 
40 | 	def mod(self):
41 | 		return self.mod_
42 | 
43 | 	def tc(self):
44 | 		return "-"
45 | 
46 | 	def bytes(self):
47 | 		ishape = self.ishape
48 | 		itype = self.itype
49 | 		eshape = self.eshape
50 | 		etype = self.etype
51 | 
52 | 		ielems = Utility.numElems(ishape)
53 | 
54 | 		b = 0
55 | 		if self.dir == "fprop":
56 | 			#indices
57 | 			b += ielems * Utility.typeToBytes(itype)
58 | 			#read and write the embedding matrix
59 | 			b += ielems * eshape[1] * 2 * Utility.typeToBytes(etype)
60 | 		else:
61 | 			#3 times the size of the incoming gradient
62 | 			b = ielems * eshape[1] * 3 * Utility.typeToBytes(etype)
63 | 
64 | 			if self.sub > 0:
65 | 				b = 0
66 | 
67 | 		return b
68 | 
69 | 	def flops(self):
70 | 		# Note: not implemented yet
71 | 		return 0
72 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/loss.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | from .utility import Utility
 3 | from .base import OperatorLayerBase
 4 | 
 5 | #TODO: Add support for additional loss functions.
 6 | 
 7 | class MSELoss(OperatorLayerBase):
 8 | 
 9 | 	def __init__(self, d):
10 | 		marker = eval(d.argMarker[0])
11 | 		mod = marker['mod']
12 | 		op = marker['op']
13 | 		args = marker['args']
14 | 
15 | 		self.marker = marker
16 | 		self.mod_ = mod
17 | 		self.op_ = op
18 | 		self.args = args
19 | 
20 | 		assert (mod == "torch.nn.functional")
21 | 		assert (op == "mse_loss")
22 | 		assert (len(args) == 3)
23 | 
24 | 		#Get input, target and reduction
25 | 		if (args[0]['name'] == ""):
26 | 			x = args[0]
27 | 		else:
28 | 			x = list(filter(lambda x : x['name'] == "input", args))[0]
29 | 
30 | 		if (args[1]['name'] == ""):
31 | 			y = args[1]
32 | 		else:
33 | 			y = list(filter(lambda x : x['name'] == "target", args))[0]
34 | 
35 | 		if (args[2]['name'] == ""):
36 | 			r = args[2]
37 | 		else:
38 | 			r = list(filter(lambda x : x['name'] == "reduction", args))[0]
39 | 
40 | 		assert (x['type'] == y['type'] == "tensor")
41 | 		assert (x['shape'] == y['shape'])
42 | 		assert (x['dtype'] == y['dtype'])
43 | 		assert (r['type'] == "str")
44 | 		assert (r['value'] in ["none", "mean", "sum"])
45 | 
46 | 		self.shape = x['shape']
47 | 		self.type = x['dtype']
48 | 		self.red = r['value']
49 | 		self.dir = d.dir
50 | 
51 | 	def params(self):
52 | 		p = OrderedDict([('T', self.shape), ('type', self.type), ('red', self.red)])
53 | 		return p
54 | 
55 | 	def elems(self):
56 | 		red = self.red
57 | 		e = Utility.numElems(self.shape)
58 | 
59 | 		if self.dir == "fprop":
60 | 			if red == "none":
61 | 				e *= 3
62 | 			else:
63 | 				e *= 2
64 | 		else:
65 | 			if red == "none":
66 | 				e *= 4
67 | 			else:
68 | 				e *= 3
69 | 		return e
70 | 
71 | 	def bytes(self):
72 | 		return self.elems() * Utility.typeToBytes(self.type)
73 | 
74 | 	def flops(self):
75 | 		return self.elems() * 2 + 1
76 | 
77 | 	def tc(self):
78 | 		return "-"
79 | 
80 | 	def op(self):
81 | 		return self.op_
82 | 
83 | 	def mod(self):
84 | 		return self.mod_
85 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/normalization.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | from .utility import Utility
 3 | from .base import OperatorLayerBase
 4 | 
 5 | class BatchNorm(OperatorLayerBase):
 6 | 
 7 | 	def __init__(self, d):
 8 | 		marker = eval(d.argMarker[0])
 9 | 		mod = marker['mod']
10 | 		op = marker['op']
11 | 		args = marker['args']
12 | 
13 | 		self.marker = marker
14 | 		self.mod_ = mod
15 | 		self.op_ = op
16 | 		self.args = args
17 | 
18 | 		assert (op == "batch_norm")
19 | 		assert (len(args) == 8)
20 | 		i = args[0]
21 | 		assert (i['type'] == "tensor")
22 | 
23 | 		self.shape = i['shape']
24 | 		self.type = i['dtype']
25 | 		self.dir = d.dir
26 | 
27 | 	def params(self):
28 | 		p = OrderedDict([('T', self.shape), ('type', self.type)])
29 | 		return p
30 | 
31 | 	def tc(self):
32 | 		return "-"
33 | 
34 | 	def op(self):
35 | 		return self.op_
36 | 
37 | 	def mod(self):
38 | 		return self.mod_
39 | 
40 | 	def elems(self):
41 | 		return Utility.numElems(self.shape)
42 | 
43 | 	def flops(self):
44 | 		# Variance algo-dependent, but this is a reasonable value.
45 | 		return self.elems() * 8
46 | 
47 | 	def bytes(self):
48 | 		e = self.elems()
49 | 		if self.dir == "fprop":
50 | 			e *= 4
51 | 		else:
52 | 			e *= 5
53 | 
54 | 		return e * Utility.typeToBytes(self.type)
55 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/optim.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | from .utility import Utility
 3 | from .base import OperatorLayerBase
 4 | 
 5 | #TODO: Add support for other optimizers.
 6 | 
 7 | class Adam(OperatorLayerBase):
 8 | 
 9 | 	def __init__(self, d):
10 | 		marker = eval(d.argMarker[0])
11 | 		mod = marker['mod']
12 | 		op = marker['op']
13 | 		args = marker['args']
14 | 
15 | 		self.marker = marker
16 | 		self.mod_ = mod
17 | 		self.op_ = op
18 | 		self.args = args
19 | 
20 | 		assert(op == "adam")
21 | 		assert (len(args) == 12) or (len(args) == 14)
22 | 		w, hw, m, v, g = args[0:5]
23 | 		assert (w['shape'] == m['shape'] == v['shape'] == g['shape'])
24 | 		assert (hw['shape'] == w['shape']) or (hw['shape'] == (0,))		#hw could be null
25 | 		assert (w['type'] == m['type'] == v['type'] == g['type'] == hw['type'] == "tensor")
26 | 		assert (w['dtype'] == m['dtype'] == v['dtype'] == "float32")
27 | 
28 | 		self.w = w
29 | 		self.g = g
30 | 
31 | 	def params(self):
32 | 		p = OrderedDict([('T',self.w['shape']), ('wtype',self.w['dtype']), ('gtype',self.g['dtype'])])
33 | 		return p
34 | 
35 | 	def flops(self):
36 | 		return 0
37 | 
38 | 	def bytes(self):
39 | 		wshape = self.w['shape']
40 | 		wtype = self.w['dtype']
41 | 		gtype = self.g['dtype']
42 | 		b = 0
43 | 
44 | 		elems = Utility.numElems(wshape)
45 | 
46 | 		#Get time to stream read/write w, m, v
47 | 		b += 6 * elems *  Utility.typeToBytes(wtype)
48 | 
49 | 		#Get time to read "g"
50 | 		b += elems * Utility.typeToBytes(gtype)
51 | 
52 | 		if wtype != gtype: #mixed precision
53 | 			#Get time to write "hw
54 | 			b += elems * Utility.typeToBytes(gtype)
55 | 
56 | 		return b
57 | 
58 | 	def tc(self):
59 | 		return "-"
60 | 
61 | 	def op(self):
62 | 		return self.op_
63 | 
64 | 	def mod(self):
65 | 		return self.mod_
66 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/pooling.py:
--------------------------------------------------------------------------------
 1 | from .collections import OrderedDict
 2 | from .utility import Utility
 3 | 
 4 | # Work in progress.
 5 | 
 6 | #poolFuncs = ["max_pool2d_with_indices_forward", "max_pool2d_with_indices"]
 7 | class MaxPool2d(object):
 8 | 
 9 | 	def parse(marker):
10 | 
11 | 		def convert2Tuple(arg):
12 | 			assert (arg['type'] in ["int", "tuple"])
13 | 			if arg['type'] == "int":
14 | 				return (arg['value'], arg['value'])
15 | 			else:
16 | 				return arg['value']
17 | 
18 | 		mod = marker['mod']
19 | 		op = marker['op']
20 | 		args = marker['args']
21 | 		assert (mod == "torch.nn.functional")
22 | 		assert (op == "max_pool2d")
23 | 		assert (len(args) >= 2)
24 | 
25 | 		#input
26 | 		assert (args[0]['name'] == "")
27 | 		inp = args[0]
28 | 		assert (inp['type'] == "tensor")
29 | 		i = inp['shape']
30 | 		t = inp['dtype']
31 | 		assert (len(i) == 4) #nchw tensor
32 | 
33 | 		#kernel
34 | 		if (args[1]['name'] == ""):
35 | 			k = args[1]
36 | 		else:
37 | 			k = list(filter(lambda x : x['name'] == "kernel_size", args))[0]
38 | 		k = convert2Tuple(k)
39 | 
40 | 		#stride
41 | 		s = k #default value
42 | 		if ((len(args) >= 3) and args[2] == ""):
43 | 			s = args[2]
44 | 			s = convert2Tuple(s)
45 | 		elif any(x['name'] == "stride" for x in args):
46 | 			s = list(filter(lambda x : x['name'] == "stride", args))[0]
47 | 			s = convert2Tuple(s)
48 | 
49 | 		#padding
50 | 		p = (0,0)
51 | 		if ((len(args) >= 4) and args[3] == ""):
52 | 			p = args[3]
53 | 			p = convert2Tuple(p)
54 | 		elif any(x['name'] == "padding" for x in args):
55 | 			p = list(filter(lambda x : x['name'] == "padding", args))[0]
56 | 			p = convert2Tuple(p)
57 | 		
58 | 		params = OrderedDict([('T', i), ('K', k), ('s',s), ('p',p), ('type', t)])
59 | 		return params
60 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/randomSample.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | from .utility import Utility
 3 | from .base import OperatorLayerBase
 4 | 
 5 | class RandPerm(OperatorLayerBase):
 6 | 
 7 | 	def __init__(self, d):
 8 | 		marker = eval(d.argMarker[0])
 9 | 		mod = marker['mod']
10 | 		op = marker['op']
11 | 		args = marker['args']
12 | 
13 | 		self.marker = marker
14 | 		self.mod_ = mod
15 | 		self.op_ = op
16 | 		self.args = args
17 | 
18 | 		assert (mod == "torch")
19 | 		assert (op == "randperm")
20 | 		assert (len(args) == 1)
21 | 		n = args[0]
22 | 		assert n['type'] == "int"
23 | 		self.n = n['value']
24 | 
25 | 	def params(self):
26 | 		p = OrderedDict([('N', self.n)])
27 | 		return p
28 | 
29 | 	def tc(self):
30 | 		return "-"
31 | 
32 | 	def op(self):
33 | 		return self.op_
34 | 
35 | 	def mod(self):
36 | 		return self.mod_
37 | 
38 | 	def bytes(self):
39 | 		return self.n * Utility.typeToBytes("int64")
40 | 
41 | 	def flops(self):
42 | 		# Depends on RNG but this is probably a reasonable assumption.
43 | 		return self.n * 3
44 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/softmax.py:
--------------------------------------------------------------------------------
  1 | from collections import OrderedDict
  2 | from .utility import Utility
  3 | from .base import OperatorLayerBase
  4 | 
  5 | class Softmax(OperatorLayerBase):
  6 | 
  7 | 	def __init__(self, d):
  8 | 		marker = eval(d.argMarker[0])
  9 | 		mod = marker['mod']
 10 | 		op = marker['op']
 11 | 		args = marker['args']
 12 | 
 13 | 		self.marker = marker
 14 | 		self.mod_ = mod
 15 | 		self.op_ = op
 16 | 		self.args = args
 17 | 
 18 | 		assert (mod == "torch.nn.functional")
 19 | 		assert (op == "softmax")
 20 | 
 21 | 		#Filter out named parameters
 22 | 		args = list(filter(lambda x : x['name'] == '', args))
 23 | 
 24 | 		assert (len(args) <= 2)
 25 | 		self.shape = args[0]['shape']
 26 | 		self.type = args[0]['dtype']
 27 | 		self.dir = d.dir
 28 | 
 29 | 		return
 30 | 
 31 | 	def op(self):
 32 | 		return self.op_
 33 | 
 34 | 	def mod(self):
 35 | 		return self.mod_
 36 | 
 37 | 	def tc(self):
 38 | 		return "-"
 39 | 
 40 | 	def params(self):
 41 | 		p = OrderedDict([('T', self.shape), ('type', self.type)])
 42 | 		return p
 43 | 
 44 | 	def elems(self):
 45 | 		return Utility.numElems(self.shape)
 46 | 
 47 | 	def flops(self):
 48 | 		# Note: exp, sum-reduce, divide
 49 | 		#flops = elems * 3
 50 | 		return 0
 51 | 
 52 | 	def bytes(self):
 53 | 		b = self.elems() * Utility.typeToBytes(self.type)
 54 | 		b *= 3 if self.dir == "fprop" else 5 #verify
 55 | 		return b
 56 | 
 57 | class LogSoftmax(OperatorLayerBase):
 58 | 
 59 | 	def __init__(self, d):
 60 | 		marker = eval(d.argMarker[0])
 61 | 		mod = marker['mod']
 62 | 		op = marker['op']
 63 | 		args = marker['args']
 64 | 
 65 | 		self.marker = marker
 66 | 		self.mod_ = mod
 67 | 		self.op_ = op
 68 | 		self.args = args
 69 | 
 70 | 		assert (mod == "torch.nn.functional")
 71 | 		assert (op == "log_softmax")
 72 | 
 73 | 		#Filter out named parameters
 74 | 		args = list(filter(lambda x : x['name'] == '', args))
 75 | 
 76 | 		assert (len(args) <= 2)
 77 | 
 78 | 		#Get input
 79 | 		if (args[0]['name'] == ""):
 80 | 			i = args[0]
 81 | 		else:
 82 | 			i = list(filter(lambda x : x['name'] == "input", args))[0]
 83 | 
 84 | 		t = i['dtype']
 85 | 
 86 | 		self.shape = i['shape']
 87 | 		self.type = i['dtype']
 88 | 		self.dir = d.dir
 89 | 		return
 90 | 
 91 | 	def op(self):
 92 | 		return self.op_
 93 | 
 94 | 	def mod(self):
 95 | 		return self.mod_
 96 | 
 97 | 	def tc(self):
 98 | 		return "-"
 99 | 
100 | 	def params(self):
101 | 		p = OrderedDict([('T', self.shape), ('type', self.type)])
102 | 		return p
103 | 
104 | 	def elems(self):
105 | 		return Utility.numElems(self.shape)
106 | 
107 | 	def flops(self):
108 | 		# Note: exp, sum-reduce, divide, log
109 | 		#flops = elems * 4
110 | 		return 0
111 | 
112 | 	def bytes(self):
113 | 		b = self.elems() * Utility.typeToBytes(self.type)
114 | 		b *= 3 if self.dir == "fprop" else 5 #verify
115 | 		return b
116 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/usage.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import argparse
 3 | 
 4 | def parseArgs():
 5 | 	"""
 6 | 	Print usage and parse arguments.
 7 | 	"""
 8 | 
 9 | 	def check_cols(value):
10 | 		valid = ["idx", "seq", "altseq", "tid", "layer", "trace", "dir", "sub", "mod", "op", "kernel", "params", "sil", "tc", "device", "stream", "grid", "block", "flops", "bytes"]
11 | 		cols = value.split(",")
12 | 		for col in cols:
13 | 			if col not in valid:
14 | 				raise argparse.ArgumentTypeError("{} is not a valid column name. Valid column names are {}.".format(col, ",".join(valid)))
15 | 		return cols
16 | 
17 | 	def openFile(f):
18 | 		try:
19 | 			d = open(f, "r")
20 | 			return d
21 | 		except IOError:
22 | 			print("Error opening file {}. Exiting.".format(f), file=sys.stderr)
23 | 			sys.exit(1)
24 | 
25 | 	parser = argparse.ArgumentParser(prog=sys.argv[0], description="PyTorch Profiler", formatter_class=argparse.RawTextHelpFormatter)
26 | 	parser.add_argument("file",
27 | 		nargs='?',
28 | 		type=str,
29 | 		default=None,
30 | 		help="Output of parse.py (Python dictionary).")
31 | 
32 | 	parser.add_argument("-c",
33 | 		type=check_cols,
34 | 		default="idx,dir,sub,mod,op,kernel,params,sil",
35 | 		help='''Comma seperated names of columns to print.
36 | idx:      Index
37 | seq:      PyTorch Sequence Id
38 | altseq:   PyTorch Alternate Sequence Id
39 | tid:      Thread Id
40 | layer:    User annotated NVTX string (can be nested)
41 | trace:    Function Call Trace
42 | dir:      Direction
43 | sub:      Sub Sequence Id
44 | mod:      Module
45 | op:       Operattion
46 | kernel:   Kernel Name
47 | params:   Parameters
48 | sil:      Silicon Time (in ns)
49 | tc:       Tensor Core Usage
50 | device:   GPU Device Id
51 | stream:   Stream Id
52 | grid:     Grid Dimensions
53 | block:    Block Dimensions
54 | flops:    Floating point ops (FMA = 2 FLOPs)
55 | bytes:    Number of bytes in and out of DRAM
56 | e.g. -c idx,kernel,sil''')
57 | 
58 | 	group = parser.add_mutually_exclusive_group()
59 | 	group.add_argument("--csv",
60 | 		action="store_true",
61 | 		default=False,
62 | 		help="Print a CSV output.")
63 | 	group.add_argument("-w",
64 | 		type=int,
65 | 		default=0,
66 | 		help="Width of columnated output.")
67 | 
68 | 	args = parser.parse_args()
69 | 	if args.file is None:
70 | 		args.file = sys.stdin
71 | 	else:
72 | 		args.file = openFile(args.file)
73 | 	return args
74 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/pyprof/prof/utility.py:
--------------------------------------------------------------------------------
 1 | from functools import reduce
 2 | 
 3 | class Utility(object):
 4 | 
 5 | 	@staticmethod
 6 | 	def numElems(shape):
 7 | 		assert (type(shape) == tuple)
 8 | 		return reduce(lambda x,y: x*y, shape, 1)
 9 | 
10 | 	@staticmethod
11 | 	def typeToBytes(t):
12 | 		if (t in ["uint8", "int8", "byte", "char", "bool"]):
13 | 			return 1
14 | 		elif (t in ["float16", "half", "int16", "short"]):
15 | 			return 2
16 | 		elif (t in ["float32", "float", "int32", "int"]):
17 | 			return 4
18 | 		elif (t in ["int64", "long", "float64", "double"]):
19 | 			return 8
20 | 		assert False
21 | 
22 | 	@staticmethod
23 | 	def typeToString(t):
24 | 		if (t in ["uint8", "byte", "char",]):
25 | 			return "uint8"
26 | 		elif (t in ["int8",]):
27 | 			return "int8"
28 | 		elif (t in ["int16", "short",]):
29 | 			return "int16"
30 | 		elif (t in ["float16", "half"]):
31 | 			return "fp16"
32 | 		elif (t in ["float32", "float"]):
33 | 			return "fp32"
34 | 		elif (t in ["int32", "int",]):
35 | 			return "int32"
36 | 		elif (t in ["int64", "long"]):
37 | 			return "int64"
38 | 		elif (t in ["float64", "double",]):
39 | 			return "fp64"
40 | 		elif (t in ["bool",]):
41 | 			return "bool"
42 | 		assert False
43 | 
44 | 	@staticmethod
45 | 	def hasNVTX(marker):
46 | 		if type(marker) is str:
47 | 			try:
48 | 				marker = eval(marker)
49 | 			except:
50 | 				return False
51 | 
52 | 		if type(marker) is dict:
53 | 			keys  = marker.keys()
54 | 			return ("mod" in keys) and ("op" in keys) and ("args" in keys)
55 | 		else:
56 | 			return False
57 | 
58 | 	@staticmethod
59 | 	def isscalar(t):
60 | 		return (t in ["float", "int"])
61 | 


--------------------------------------------------------------------------------
/ghost/apex/apex/reparameterization/README.md:
--------------------------------------------------------------------------------
1 | Under construction...
2 | 


--------------------------------------------------------------------------------
/ghost/apex/csrc/compat.h:
--------------------------------------------------------------------------------
 1 | #ifndef TORCH_CHECK
 2 | #define TORCH_CHECK AT_CHECK
 3 | #endif
 4 | 
 5 | #ifdef VERSION_GE_1_3
 6 | #define DATA_PTR data_ptr
 7 | #else
 8 | #define DATA_PTR data
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/ghost/apex/csrc/flatten_unflatten.cpp:
--------------------------------------------------------------------------------
 1 | #include <torch/extension.h>
 2 | #include <torch/csrc/utils/tensor_flatten.h>
 3 | // https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_flatten.h
 4 | 
 5 | at::Tensor flatten(std::vector<at::Tensor> tensors)
 6 | {
 7 |   return torch::utils::flatten_dense_tensors(tensors);
 8 | }
 9 | 
10 | std::vector<at::Tensor> unflatten(at::Tensor flat, std::vector<at::Tensor> tensors)
11 | {
12 |   return torch::utils::unflatten_dense_tensors(flat, tensors);
13 | }
14 | 
15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
16 |   m.def("flatten", &flatten, "Flatten dense tensors");
17 |   m.def("unflatten", &unflatten, "Unflatten dense tensors");
18 | }
19 | 


--------------------------------------------------------------------------------
/ghost/apex/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = NVIDIAAPEX
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | gh-pages:
16 | 	git checkout gh-pages
17 | 	rm -rf build
18 | 	rm -rf source
19 | 	git checkout master -- .
20 | 	make html
21 | 	rm -rf ../_modules ../_sources ../_static
22 | 	mv -fv build/html/* ../
23 | 	rm -rf build
24 | 	git add -A
25 | 	git commit -m "Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit`" && git push origin gh-pages ; git checkout master
26 | 
27 | .PHONY: help Makefile
28 | 
29 | # Catch-all target: route all unknown targets to Sphinx using the new
30 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
31 | %: Makefile
32 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
33 | 


--------------------------------------------------------------------------------
/ghost/apex/docs/source/_static/css/pytorch_theme.css:
--------------------------------------------------------------------------------
  1 | body {
  2 |     font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
  3 | }
  4 | 
  5 | /* Default header fonts are ugly */
  6 | h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend, p.caption {
  7 |     font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
  8 | }
  9 | 
 10 | /* Use white for docs background */
 11 | .wy-side-nav-search {
 12 |     background-color: #fff;
 13 | }
 14 | 
 15 | .wy-nav-content-wrap, .wy-menu li.current > a  {
 16 |     background-color: #fff;
 17 | }
 18 | 
 19 | @media screen and (min-width: 1400px) {
 20 |     .wy-nav-content-wrap {
 21 |         background-color: rgba(0, 0, 0, 0.0470588);
 22 |     }
 23 | 
 24 |     .wy-nav-content {
 25 |         background-color: #fff;
 26 |     }
 27 | }
 28 | 
 29 | /* Fixes for mobile */
 30 | .wy-nav-top {
 31 |     background-color: #fff;
 32 |     background-image: url('../img/apex.jpg');
 33 |     background-repeat: no-repeat;
 34 |     background-position: center;
 35 |     padding: 0;
 36 |     margin: 0.4045em 0.809em;
 37 |     color: #333;
 38 | }
 39 | 
 40 | .wy-nav-top > a {
 41 |     display: none;
 42 | }
 43 | 
 44 | @media screen and (max-width: 768px) {
 45 |     .wy-side-nav-search>a img.logo {
 46 |         height: 60px;
 47 |     }
 48 | }
 49 | 
 50 | /* This is needed to ensure that logo above search scales properly */
 51 | .wy-side-nav-search a {
 52 |     display: block;
 53 | }
 54 | 
 55 | /* This ensures that multiple constructors will remain in separate lines. */
 56 | .rst-content dl:not(.docutils) dt {
 57 |     display: table;
 58 | }
 59 | 
 60 | /* Use our red for literals (it's very similar to the original color) */
 61 | .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
 62 |     color: #F05732;
 63 | }
 64 | 
 65 | .rst-content tt.xref, a .rst-content tt, .rst-content tt.xref,
 66 | .rst-content code.xref, a .rst-content tt, a .rst-content code {
 67 |     color: #404040;
 68 | }
 69 | 
 70 | /* Change link colors (except for the menu) */
 71 | 
 72 | a {
 73 |     color: #F05732;
 74 | }
 75 | 
 76 | a:hover {
 77 |     color: #F05732;
 78 | }
 79 | 
 80 | 
 81 | a:visited {
 82 |     color: #D44D2C;
 83 | }
 84 | 
 85 | .wy-menu a {
 86 |     color: #b3b3b3;
 87 | }
 88 | 
 89 | .wy-menu a:hover {
 90 |     color: #b3b3b3;
 91 | }
 92 | 
 93 | /* Default footer text is quite big */
 94 | footer {
 95 |     font-size: 80%;
 96 | }
 97 | 
 98 | footer .rst-footer-buttons {
 99 |     font-size: 125%; /* revert footer settings - 1/80% = 125% */
100 | }
101 | 
102 | footer p {
103 |     font-size: 100%;
104 | }
105 | 
106 | /* For hidden headers that appear in TOC tree */
107 | /* see http://stackoverflow.com/a/32363545/3343043 */
108 | .rst-content .hidden-section {
109 |     display: none;
110 | }
111 | 
112 | nav .hidden-section {
113 |     display: inherit;
114 | }
115 | 
116 | .wy-side-nav-search>div.version {
117 |     color: #000;
118 | }
119 | 


--------------------------------------------------------------------------------
/ghost/apex/docs/source/_static/img/nv-pytorch2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/docs/source/_static/img/nv-pytorch2.png


--------------------------------------------------------------------------------
/ghost/apex/docs/source/_templates/layout.html:
--------------------------------------------------------------------------------
 1 | {% extends "!layout.html" %}
 2 |   {% block sidebartitle %} {{ super() }}
 3 | 
 4 |   <style>
 5 |     /* Sidebar header (and topbar for mobile) */
 6 |     .wy-side-nav-search, .wy-nav-top {
 7 |       background: #76b900;
 8 |     }
 9 | 
10 |     .wy-side-nav-search a:link, .wy-nav-top a:link {
11 |       color: #fff;
12 |     }
13 |     .wy-side-nav-search a:visited, .wy-nav-top a:visited {
14 |       color: #fff;
15 |     }
16 |     .wy-side-nav-search a:hover, .wy-nav-top a:hover {
17 |       color: #fff;
18 |     }
19 | 
20 |     .wy-menu-vertical a:link, .wy-menu-vertical a:visited {
21 |       color: #d9d9d9
22 |     }
23 | 
24 |     .wy-menu-vertical a:active {
25 |       background-color: #76b900
26 |     }
27 | 
28 |     .wy-side-nav-search>div.version {
29 |       color: rgba(0, 0, 0, 0.3)
30 |     }
31 |   </style>
32 |   {% endblock %}
33 | 
34 |   {% block footer %} {{ super() }}
35 | 
36 |   <style>
37 |   a:link, a:visited {
38 |     color: #76b900;
39 |   }
40 | 
41 |   a:hover {
42 |     color: #8c0;
43 |   }
44 | 
45 |   .rst-content dl:not(.docutils) dt {
46 |     background: rgba(118, 185, 0, 0.1);
47 |     color: rgba(59,93,0,1);
48 |     border-top: solid 3px rgba(59,93,0,1);
49 |   }
50 |   </style>
51 |   {% endblock %}
52 | 


--------------------------------------------------------------------------------
/ghost/apex/docs/source/fp16_utils.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.fp16_utils
 5 | ===================================
 6 | 
 7 | This submodule contains utilities designed to streamline the mixed precision training recipe 
 8 | presented by NVIDIA `on Parallel Forall`_ and in GTC 2018 Sessions 
 9 | `Training Neural Networks with Mixed Precision: Theory and Practice`_ and 
10 | `Training Neural Networks with Mixed Precision: Real Examples`_.
11 | For Pytorch users, Real Examples in particular is recommended.
12 | 
13 | Full runnable Python scripts demonstrating ``apex.fp16_utils`` 
14 | can be found on the Github page:
15 | 
16 | | `Simple FP16_Optimizer demos`_
17 | |
18 | | `Distributed Mixed Precision Training with imagenet`_
19 | |
20 | | `Mixed Precision Training with word_language_model`_
21 | |
22 | |
23 | 
24 | .. _`on Parallel Forall`:
25 |     https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
26 | .. _`Training Neural Networks with Mixed Precision: Theory and Practice`:
27 |     http://on-demand.gputechconf.com/gtc/2018/video/S8923/
28 | .. _`Training Neural Networks with Mixed Precision: Real Examples`:
29 |     http://on-demand.gputechconf.com/gtc/2018/video/S81012/
30 | .. _`Simple FP16_Optimizer demos`:
31 |     https://github.com/NVIDIA/apex/tree/master/examples/FP16_Optimizer_simple
32 | .. _`Distributed Mixed Precision Training with imagenet`:
33 |     https://github.com/NVIDIA/apex/tree/master/examples/imagenet
34 | .. _`Mixed Precision Training with word_language_model`:
35 |     https://github.com/NVIDIA/apex/tree/master/examples/word_language_model
36 | 
37 | .. automodule:: apex.fp16_utils
38 | .. currentmodule:: apex.fp16_utils
39 | 
40 | Automatic management of master params + loss scaling
41 | ----------------------------------------------------
42 | 
43 | .. autoclass:: FP16_Optimizer
44 |     :members:
45 | 
46 | .. autoclass:: LossScaler
47 |     :members:
48 | 
49 | .. autoclass:: DynamicLossScaler
50 |     :members:
51 | 
52 | Manual master parameter management
53 | ----------------------------------
54 | 
55 | .. autofunction:: prep_param_lists
56 | 
57 | .. autofunction:: master_params_to_model_params
58 | 
59 | .. autofunction:: model_grads_to_master_grads
60 | 


--------------------------------------------------------------------------------
/ghost/apex/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. PyTorch documentation master file, created by
 2 |    sphinx-quickstart on Fri Dec 23 13:31:47 2016.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | :github_url: https://github.com/nvidia/apex
 7 | 
 8 | Apex (A PyTorch Extension)
 9 | ===================================
10 | 
11 | This site contains the API documentation for Apex (https://github.com/nvidia/apex),
12 | a Pytorch extension with NVIDIA-maintained utilities to streamline mixed precision and distributed training.  Some of the code here will be included in upstream Pytorch eventually. The intention of Apex is to make up-to-date utilities available to users as quickly as possible.
13 | 
14 | Installation instructions can be found here:  https://github.com/NVIDIA/apex#quick-start.
15 | 
16 | Some other useful material, including GTC 2019 and Pytorch DevCon 2019 Slides, can be found here:  https://github.com/mcarilli/mixed_precision_references.
17 | 
18 | .. toctree::
19 |    :maxdepth: 1
20 |    :caption: AMP:  Automatic Mixed Precision
21 | 
22 |    amp
23 | 
24 | .. toctree::
25 |    :maxdepth: 1
26 |    :caption: Distributed Training
27 | 
28 |    parallel
29 | 
30 | .. toctree::
31 |    :maxdepth: 1
32 |    :caption: Fused Optimizers
33 | 
34 |    optimizers
35 | 
36 | .. toctree::
37 |    :maxdepth: 1
38 |    :caption: Fused Layer Norm
39 | 
40 |    layernorm
41 | 
42 | ..   .. toctree::
43 |      :maxdepth: 1
44 |      :caption: Deprecated mixed precision API
45 |      fp16_util
46 | 
47 | ..   reparameterization
48 | ..   RNN
49 |    
50 | Indices and tables
51 | ==================
52 | 
53 | * :ref:`genindex`
54 | * :ref:`modindex`
55 | 


--------------------------------------------------------------------------------
/ghost/apex/docs/source/layernorm.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.normalization.fused_layer_norm
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.normalization
 8 | .. currentmodule:: apex.normalization
 9 | 
10 | .. FusedAdam
11 |    ----------
12 | 
13 | .. autoclass:: FusedLayerNorm
14 |     :members:
15 | 


--------------------------------------------------------------------------------
/ghost/apex/docs/source/optimizers.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.optimizers
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.optimizers
 8 | .. currentmodule:: apex.optimizers
 9 | 
10 | .. FusedAdam
11 |    ----------
12 | 
13 | .. autoclass:: FusedAdam
14 |     :members:
15 | 
16 | .. autoclass:: FusedLAMB
17 |     :members:
18 | 
19 | .. autoclass:: FusedNovoGrad
20 |     :members:
21 | 
22 | .. autoclass:: FusedSGD
23 |     :members:
24 | 


--------------------------------------------------------------------------------
/ghost/apex/docs/source/parallel.rst:
--------------------------------------------------------------------------------
 1 | .. role:: hidden
 2 |     :class: hidden-section
 3 | 
 4 | apex.parallel
 5 | ===================================
 6 | 
 7 | .. automodule:: apex.parallel
 8 | .. currentmodule:: apex.parallel
 9 | 
10 | .. DistributedDataParallel
11 |    ----------
12 | 
13 | .. autoclass:: DistributedDataParallel
14 |     :members:
15 | 
16 | .. autoclass:: Reducer
17 |     :members:
18 | 
19 | .. autoclass:: SyncBatchNorm
20 |     :members:
21 | 
22 | Utility functions
23 | ----------------------------------
24 | 
25 | .. autofunction:: convert_syncbn_model
26 | 


--------------------------------------------------------------------------------
/ghost/apex/examples/README.md:
--------------------------------------------------------------------------------
1 | This directory contains examples illustrating Apex mixed precision and distributed tools.
2 | 
3 | **Note for users of the pre-unification API**:
4 | `deprecated_api` contains examples illustrating the old (pre-unified) APIs.  These APIs will be removed soon, and users are strongly encouraged to switch.  The separate mixed precision tools called `Amp` and `FP16_Optimizer` in the old API are exposed via different flags/optimization levels in the new API.
5 | 


--------------------------------------------------------------------------------
/ghost/apex/examples/dcgan/README.md:
--------------------------------------------------------------------------------
 1 | # Mixed Precision DCGAN Training in PyTorch
 2 | 
 3 | `main_amp.py` is based on [https://github.com/pytorch/examples/tree/master/dcgan](https://github.com/pytorch/examples/tree/master/dcgan).
 4 | It implements Automatic Mixed Precision (Amp) training of the DCGAN example for different datasets. Command-line flags forwarded to `amp.initialize` are used to easily manipulate and switch between various pure and mixed precision "optimization levels" or `opt_level`s.  For a detailed explanation of `opt_level`s, see the [updated API guide](https://nvidia.github.io/apex/amp.html).
 5 | 
 6 | We introduce these changes to the PyTorch DCGAN example as described in the [Multiple models/optimizers/losses](https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses) section of the documentation::
 7 | ```
 8 | # Added after models and optimizers construction
 9 | [netD, netG], [optimizerD, optimizerG] = amp.initialize(
10 |     [netD, netG], [optimizerD, optimizerG], opt_level=opt.opt_level, num_losses=3)
11 | ...
12 | # loss.backward() changed to:
13 | with amp.scale_loss(errD_real, optimizerD, loss_id=0) as errD_real_scaled:
14 |     errD_real_scaled.backward()
15 | ...
16 | with amp.scale_loss(errD_fake, optimizerD, loss_id=1) as errD_fake_scaled:
17 |     errD_fake_scaled.backward()
18 | ...
19 | with amp.scale_loss(errG, optimizerG, loss_id=2) as errG_scaled:
20 |     errG_scaled.backward()
21 | ```
22 | 
23 | Note that we use different `loss_scalers` for each computed loss.
24 | Using a separate loss scaler per loss is [optional, not required](https://nvidia.github.io/apex/advanced.html#optionally-have-amp-use-a-different-loss-scaler-per-loss).
25 | 
26 | To improve the numerical stability, we swapped `nn.Sigmoid() + nn.BCELoss()` to `nn.BCEWithLogitsLoss()`.
27 | 
28 | With the new Amp API **you never need to explicitly convert your model, or the input data, to half().**
29 | 
30 | "Pure FP32" training:
31 | ```
32 | $ python main_amp.py --opt_level O0
33 | ```
34 | Recommended mixed precision training:
35 | ```
36 | $ python main_amp.py --opt_level O1
37 | ```
38 | 
39 | Have a look at the original [DCGAN example](https://github.com/pytorch/examples/tree/master/dcgan) for more information about the used arguments.
40 | 
41 | To enable mixed precision training, we introduce the `--opt_level` argument.
42 | 


--------------------------------------------------------------------------------
/ghost/apex/examples/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Base image must at least have pytorch and CUDA installed.
 2 | ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:19.07-py3
 3 | FROM $BASE_IMAGE
 4 | ARG BASE_IMAGE
 5 | RUN echo "Installing Apex on top of ${BASE_IMAGE}"
 6 | # make sure we don't overwrite some existing directory called "apex"
 7 | WORKDIR /tmp/unique_for_apex
 8 | # uninstall Apex if present, twice to make absolutely sure :)
 9 | RUN pip uninstall -y apex || :
10 | RUN pip uninstall -y apex || :
11 | # SHA is something the user can touch to force recreation of this Docker layer,
12 | # and therefore force cloning of the latest version of Apex
13 | RUN SHA=ToUcHMe git clone https://github.com/NVIDIA/apex.git
14 | WORKDIR /tmp/unique_for_apex/apex
15 | RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
16 | WORKDIR /workspace
17 | 


--------------------------------------------------------------------------------
/ghost/apex/examples/docker/README.md:
--------------------------------------------------------------------------------
 1 | ## Option 1:  Create a new container with Apex
 2 | 
 3 | **Dockerfile** installs the latest Apex on top of an existing image.  Run
 4 | ```
 5 | docker build -t new_image_with_apex .
 6 | ```
 7 | By default, **Dockerfile** uses NVIDIA's Pytorch container as the base image,
 8 | which requires an NVIDIA GPU Cloud (NGC) account.  If you don't have an NGC account, you can sign up for free by following the instructions [here](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html#generating-api-key).
 9 | 
10 | Alternatively, you can supply your own base image via the `BASE_IMAGE` build-arg.
11 | `BASE_IMAGE` must have Pytorch and Cuda installed.  For example, any
12 | `-devel` image for Pytorch 1.0 and later from the
13 | [official Pytorch Dockerhub](https://hub.docker.com/r/pytorch/pytorch) may be used:
14 | ```
15 | docker build --build-arg BASE_IMAGE=1.3-cuda10.1-cudnn7-devel -t new_image_with_apex .
16 | ```
17 | 
18 | If you want to rebuild your image, and force the latest Apex to be cloned and installed, make any small change to the `SHA` variable in **Dockerfile**.
19 | 
20 | **Warning:**
21 | Currently, the non-`-devel` images on Pytorch Dockerhub do not contain the Cuda compiler `nvcc`.  Therefore,
22 | images whose name does not contain `-devel` are not eligible candidates for `BASE_IMAGE`.
23 | 
24 | ### Running your Apex container
25 | 
26 | Like any Cuda-enabled Pytorch container, a container with Apex should be run via [nvidia-docker](https://github.com/NVIDIA/nvidia-docker), for example:
27 | ```
28 | docker run --runtime=nvidia -it --rm --ipc=host new_image_with_apex
29 | ```
30 | 
31 | ## Option 2:  Install Apex in a running container
32 | 
33 | Instead of building a new container, it is also a viable option to `git clone https://github.com/NVIDIA/apex.git` on bare metal, mount the Apex repo into your container at launch by running, for example,
34 | ```
35 | docker run --runtime=nvidia -it --rm --ipc=host -v /bare/metal/apex:/apex/in/container <base image>
36 | ```
37 | then go to /apex/in/container within the running container and
38 | ```
39 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
40 | ```
41 | 


--------------------------------------------------------------------------------
/ghost/apex/examples/simple/distributed/README.md:
--------------------------------------------------------------------------------
 1 | **distributed_data_parallel.py** and **run.sh** show an example using Amp with
 2 | [apex.parallel.DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) or
 3 | [torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#distributeddataparallel)
 4 | and the Pytorch multiprocess launcher script,
 5 | [torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
 6 | The use of `Amp` with DistributedDataParallel does not need to change from ordinary 
 7 | single-process use.  The only gotcha is that wrapping your model with `DistributedDataParallel` must
 8 | come after the call to `amp.initialize`.  Test via
 9 | ```bash
10 | bash run.sh
11 | ```
12 | 
13 | **This is intended purely as an instructional example, not a performance showcase.**
14 | 


--------------------------------------------------------------------------------
/ghost/apex/examples/simple/distributed/distributed_data_parallel.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import os
 4 | from apex import amp
 5 | # FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
 6 | from apex.parallel import DistributedDataParallel
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | # FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
10 | # automatically by torch.distributed.launch.
11 | parser.add_argument("--local_rank", default=0, type=int)
12 | args = parser.parse_args()
13 | 
14 | # FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
15 | # the 'WORLD_SIZE' environment variable will also be set automatically.
16 | args.distributed = False
17 | if 'WORLD_SIZE' in os.environ:
18 |     args.distributed = int(os.environ['WORLD_SIZE']) > 1
19 | 
20 | if args.distributed:
21 |     # FOR DISTRIBUTED:  Set the device according to local_rank.
22 |     torch.cuda.set_device(args.local_rank)
23 | 
24 |     # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
25 |     # environment variables, and requires that you use init_method=`env://`.
26 |     torch.distributed.init_process_group(backend='nccl',
27 |                                          init_method='env://')
28 | 
29 | torch.backends.cudnn.benchmark = True
30 | 
31 | N, D_in, D_out = 64, 1024, 16
32 | 
33 | # Each process receives its own batch of "fake input data" and "fake target data."
34 | # The "training loop" in each process just uses this fake batch over and over.
35 | # https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
36 | # example of distributed data sampling for both training and validation.
37 | x = torch.randn(N, D_in, device='cuda')
38 | y = torch.randn(N, D_out, device='cuda')
39 | 
40 | model = torch.nn.Linear(D_in, D_out).cuda()
41 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
42 | 
43 | model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
44 | 
45 | if args.distributed:
46 |     # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
47 |     # apex.parallel.DistributedDataParallel.
48 |     model = DistributedDataParallel(model)
49 |     # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
50 |     # model = torch.nn.parallel.DistributedDataParallel(model,
51 |     #                                                   device_ids=[args.local_rank],
52 |     #                                                   output_device=args.local_rank)
53 | 
54 | loss_fn = torch.nn.MSELoss()
55 | 
56 | for t in range(500):
57 |     optimizer.zero_grad()
58 |     y_pred = model(x)
59 |     loss = loss_fn(y_pred, y)
60 |     with amp.scale_loss(loss, optimizer) as scaled_loss:
61 |         scaled_loss.backward()
62 |     optimizer.step()
63 | 
64 | if args.local_rank == 0:
65 |     print("final loss = ", loss)
66 | 


--------------------------------------------------------------------------------
/ghost/apex/examples/simple/distributed/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
3 | 


--------------------------------------------------------------------------------
/ghost/apex/requirements.txt:
--------------------------------------------------------------------------------
1 | cxxfilt>=0.2.0
2 | tqdm>=4.28.1
3 | numpy>=1.15.3
4 | PyYAML>=5.1
5 | pytest>=3.5.1
6 | 


--------------------------------------------------------------------------------
/ghost/apex/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | flake8>=3.7.9
3 | Sphinx>=3.0.3


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_amp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/tests/L0/run_amp/__init__.py


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_amp/test_larc.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | from torch.nn import Parameter
 6 | 
 7 | from apex import amp
 8 | from apex.parallel.LARC import LARC
 9 | from utils import common_init
10 | 
11 | 
12 | class MyModel(torch.nn.Module):
13 |     def __init__(self, unique):
14 |         super(MyModel, self).__init__()
15 |         self.weight0 = Parameter(
16 |             unique + torch.arange(2, device="cuda", dtype=torch.float32)
17 |         )
18 | 
19 |     def forward(self, input):
20 |         return (input * self.weight0).sum()
21 | 
22 | 
23 | class TestLARC(unittest.TestCase):
24 |     def setUp(self):
25 |         self.x = torch.ones((2), device="cuda", dtype=torch.float32)
26 |         common_init(self)
27 | 
28 |     def tearDown(self):
29 |         pass
30 | 
31 |     def test_larc_mixed_precision(self):
32 |         for opt_level in ["O0", "O1", "O2", "O3"]:
33 |             model = MyModel(1)
34 | 
35 |             optimizer = LARC(
36 |                 torch.optim.SGD(
37 |                     [{"params": model.parameters(), "lr": 0.25}], momentum=0.125
38 |                 )
39 |             )
40 | 
41 |             model, optimizer = amp.initialize(
42 |                 model, optimizer, opt_level=opt_level, verbosity=0
43 |             )
44 | 
45 |             optimizer.zero_grad()
46 |             loss = model(self.x)
47 |             with amp.scale_loss(loss, optimizer) as scaled_loss:
48 |                 scaled_loss.backward()
49 |             optimizer.step()
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     unittest.main()
54 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_amp/test_multi_tensor_l2norm.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import functools as ft
 4 | import itertools as it
 5 | 
 6 | from apex import amp
 7 | import torch
 8 | from torch import nn
 9 | import torch.nn.functional as F
10 | 
11 | from utils import common_init, HALF, FLOAT,\
12 |     ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
13 | 
14 | try:
15 |   import amp_C
16 |   from amp_C import multi_tensor_l2norm
17 |   from apex.multi_tensor_apply import MultiTensorApply
18 |   disabled = False
19 | except ImportError as err:
20 |   print("amp_C fused kernels unavailable, disabling TestMultiTensorApply.  ImportError was ", err)
21 |   disabled = True
22 | 
23 | 
24 | class TestMultiTensorL2Norm(unittest.TestCase):
25 | 
26 |     def setUp(self):
27 |         common_init(self)
28 |         self.val = 4.0
29 |         self.overflow_buf = torch.cuda.IntTensor(1).zero_()
30 | 
31 |     def tearDown(self):
32 |         pass
33 | 
34 |     # The tensor creation here is written for convenience, not speed.
35 |     def l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type, per_tensor):
36 |         self.overflow_buf.zero_()
37 |         a = torch.cuda.FloatTensor(sizea).fill_(self.val)
38 |         b = torch.cuda.FloatTensor(sizeb).fill_(self.val)
39 | 
40 |         in_list = []
41 |         for i in range(repeat_tensors):
42 |             in_list += [a.clone().to(in_type), b.clone().to(in_type)]
43 | 
44 |         if per_tensor:
45 |             norm, norm_per_tensor = applier(multi_tensor_l2norm, self.overflow_buf, [in_list], True)
46 |             normab = torch.cat((a.norm().view(1), b.norm().view(1)))
47 |             norm_per_tensor = norm_per_tensor.view(-1, 2)
48 |         else:
49 |             norm, _ = applier(multi_tensor_l2norm, self.overflow_buf, [in_list], True)
50 | 
51 |         reference = torch.cuda.FloatTensor((sizea + sizeb)*repeat_tensors).fill_(self.val).norm()
52 | 
53 |         self.assertTrue(torch.allclose(norm, reference))
54 |         if per_tensor:
55 |           self.assertTrue(torch.allclose(norm_per_tensor, normab))
56 |         self.assertTrue(self.overflow_buf.item() == 0)
57 | 
58 |     @unittest.skipIf(disabled, "amp_C is unavailable")
59 |     def test_fuzz(self):
60 |         input_size_pairs = (
61 |             (7777*77, 555*555),
62 |             (777, 555),
63 |             (555, 2048*32+1),
64 |             (2048*32+1, 555),
65 |             (555, 2048*32),
66 |             (2048*32, 555),
67 |             (33333, 555),
68 |             (555, 33333))
69 |         appliers = (
70 |             MultiTensorApply(2048*32), 
71 |             MultiTensorApply(333),
72 |             MultiTensorApply(33333))
73 |         repeat_tensors = (
74 |             1,
75 |             55)
76 | 
77 |         for sizea, sizeb in input_size_pairs:
78 |           for applier in appliers:
79 |             for repeat in repeat_tensors:
80 |               for in_type in (torch.float32, torch.float16):
81 |                 for per_tensor in (False, True):
82 |                   self.l2norm(sizea, sizeb, applier, repeat, in_type, per_tensor)
83 | 
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     unittest.main()
88 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_amp/test_promotion.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import itertools as it
 4 | 
 5 | from apex import amp
 6 | import torch
 7 | from torch import nn
 8 | import torch.nn.functional as F
 9 | 
10 | from utils import common_init, HALF, FLOAT, DTYPES
11 | 
12 | class TestPromotion(unittest.TestCase):
13 |     def setUp(self):
14 |         self.handle = amp.init(enabled=True)
15 |         common_init(self)
16 | 
17 |     def tearDown(self):
18 |         self.handle._deactivate()
19 | 
20 |     def run_binary_promote_test(self, fns, input_shape, x_inplace=False):
21 |         type_pairs = it.product(DTYPES, DTYPES)
22 |         for fn, (xtype, ytype) in it.product(fns, type_pairs):
23 |             x = torch.randn(input_shape, dtype=xtype).requires_grad_()
24 |             x_leaf = x
25 |             if x_inplace:
26 |                 # We need a non-leaf to call in place on
27 |                 x = x.clone()
28 |             y = torch.randn(input_shape, dtype=ytype)
29 |             out = fn(x, y)
30 |             if x_inplace:
31 |                 # In place: always match xtype
32 |                 self.assertEqual(out.type(), x.type())
33 |             else:
34 |                 # Out of place: match widest type
35 |                 if xtype == torch.float or ytype == torch.float:
36 |                     self.assertEqual(out.type(), FLOAT)
37 |                 else:
38 |                     self.assertEqual(out.type(), HALF)
39 |             out.float().sum().backward()
40 |             self.assertEqual(x_leaf.grad.dtype, xtype)
41 | 
42 |     def test_atan2_matches_widest(self):
43 |         fns = [lambda x, y : torch.atan2(x, y),
44 |                lambda x, y : x.atan2(y)]
45 |         self.run_binary_promote_test(fns, (self.b,))
46 | 
47 |     def test_mul_matches_widest(self):
48 |         fns = [lambda x, y : torch.mul(x, y),
49 |                lambda x, y: x.mul(y)]
50 |         self.run_binary_promote_test(fns, (self.b,))
51 | 
52 |     def test_cat_matches_widest(self):
53 |         shape = self.b
54 |         ys = [torch.randn(shape, dtype=torch.half) for _ in range(5)]
55 |         x_float = torch.randn(shape)
56 |         out = torch.cat(ys + [x_float])
57 |         self.assertEqual(out.type(), FLOAT)
58 |         x_half = torch.randn(shape, dtype=torch.half)
59 |         out = torch.cat(ys + [x_half])
60 |         self.assertEqual(out.type(), HALF)
61 | 
62 |     def test_inplace_exp_is_error_for_half(self):
63 |         xs = torch.randn(self.b)
64 |         xs.exp_()
65 |         self.assertEqual(xs.type(), FLOAT)
66 |         xs = torch.randn(self.b, dtype=torch.half)
67 |         with self.assertRaises(NotImplementedError):
68 |             xs.exp_()
69 | 
70 |     def test_inplace_add_matches_self(self):
71 |         fn = lambda x, y: x.add_(y)
72 |         self.run_binary_promote_test([fn], (self.b,), x_inplace=True)
73 | 
74 | if __name__ == '__main__':
75 |     unittest.main()
76 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_amp/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | HALF = 'torch.cuda.HalfTensor'
 4 | FLOAT = 'torch.cuda.FloatTensor'
 5 | 
 6 | DTYPES = [torch.half, torch.float]
 7 | 
 8 | ALWAYS_HALF = {torch.float: HALF,
 9 |                torch.half: HALF}
10 | ALWAYS_FLOAT = {torch.float: FLOAT,
11 |                 torch.half: FLOAT}
12 | MATCH_INPUT = {torch.float: FLOAT,
13 |                torch.half: HALF}
14 | 
15 | def common_init(test_case):
16 |     test_case.h = 64
17 |     test_case.b = 16
18 |     test_case.c = 16
19 |     test_case.k = 3
20 |     test_case.t = 10
21 |     torch.set_default_tensor_type(torch.cuda.FloatTensor)
22 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_fp16util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/tests/L0/run_fp16util/__init__.py


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_fp16util/test_fp16util.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | from apex.fp16_utils import FP16Model
 7 | 
 8 | 
 9 | class DummyBlock(nn.Module):
10 |     def __init__(self):
11 |         super(DummyBlock, self).__init__()
12 | 
13 |         self.conv = nn.Conv2d(10, 10, 2)
14 |         self.bn = nn.BatchNorm2d(10, affine=True)
15 | 
16 |     def forward(self, x):
17 |         return self.conv(self.bn(x))
18 | 
19 | 
20 | class DummyNet(nn.Module):
21 |     def __init__(self):
22 |         super(DummyNet, self).__init__()
23 | 
24 |         self.conv1 = nn.Conv2d(3, 10, 2)
25 |         self.bn1 = nn.BatchNorm2d(10, affine=False)
26 |         self.db1 = DummyBlock()
27 |         self.db2 = DummyBlock()
28 | 
29 |     def forward(self, x):
30 |         out = x
31 |         out = self.conv1(out)
32 |         out = self.bn1(out)
33 |         out = self.db1(out)
34 |         out = self.db2(out)
35 |         return out
36 | 
37 | 
38 | class DummyNetWrapper(nn.Module):
39 |     def __init__(self):
40 |         super(DummyNetWrapper, self).__init__()
41 | 
42 |         self.bn = nn.BatchNorm2d(3, affine=True)
43 |         self.dn = DummyNet()
44 | 
45 |     def forward(self, x):
46 |         return self.dn(self.bn(x))
47 | 
48 | 
49 | class TestFP16Model(unittest.TestCase):
50 |     def setUp(self):
51 |         self.N = 64
52 |         self.C_in = 3
53 |         self.H_in = 16
54 |         self.W_in = 32
55 |         self.in_tensor = torch.randn((self.N, self.C_in, self.H_in, self.W_in)).cuda()
56 |         self.orig_model = DummyNetWrapper().cuda()
57 |         self.fp16_model = FP16Model(self.orig_model)
58 | 
59 |     def test_params_and_buffers(self):
60 |         exempted_modules = [
61 |             self.fp16_model.network.bn,
62 |             self.fp16_model.network.dn.db1.bn,
63 |             self.fp16_model.network.dn.db2.bn,
64 |         ]
65 |         for m in self.fp16_model.modules():
66 |             expected_dtype = torch.float if (m in exempted_modules) else torch.half
67 |             for p in m.parameters(recurse=False):
68 |                 assert p.dtype == expected_dtype
69 |             for b in m.buffers(recurse=False):
70 |                 assert b.dtype in (expected_dtype, torch.int64)
71 | 
72 |     def test_output_is_half(self):
73 |         out_tensor = self.fp16_model(self.in_tensor)
74 |         assert out_tensor.dtype == torch.half
75 | 
76 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import os
 3 | import random
 4 | 
 5 | import torch
 6 | import apex
 7 | from torch.autograd import Variable
 8 | 
 9 |         
10 | class TestFusedLayerNorm(unittest.TestCase):
11 |     def setUp(self):
12 |         # bias and weight are set to 0 and 1 respectively, so no need to copy parameters from cpu module to the gpu one
13 |         self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cpu()
14 |         self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=False).cuda()
15 | 
16 |     def _test_same_output(self, batch_size):
17 |         torch.cuda.manual_seed(42)
18 |         self.input_ = torch.randn((batch_size, *self.module_cpu_.normalized_shape), device="cpu").requires_grad_(True)
19 |         self.input_cuda_ = self.input_.cuda().detach().requires_grad_(True)
20 |         out_cpu_ = self.module_cpu_(self.input_)
21 |         gO = torch.rand_like(out_cpu_)
22 |         out_cpu_.backward(gO)
23 |         out_cuda_ = self.module_cuda_(self.input_cuda_)
24 |         gO = gO.cuda()
25 |         out_cuda_.backward(gO)
26 |         assert out_cpu_.is_cuda == False
27 |         assert out_cuda_.is_cuda == True
28 |         torch.testing.assert_allclose(out_cpu_, out_cuda_.cpu())
29 |         torch.testing.assert_allclose(self.input_.grad, self.input_cuda_.grad.cpu())
30 | 
31 |     def test_layer_norm(self):
32 |         self._test_same_output(16)
33 | 
34 |     def test_large_batch(self):
35 |         self._test_same_output(65536)
36 |         
37 |         
38 | class TestFusedLayerNormElemWise(TestFusedLayerNorm):
39 |     def setUp(self):
40 |         self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cpu()
41 |         self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cuda()
42 | 
43 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_optimizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/tests/L0/run_optimizers/__init__.py


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_pyprof_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/apex/tests/L0/run_pyprof_data/__init__.py


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_pyprof_data/test_pyprof_data.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | import unittest
 3 | 
 4 | from apex.pyprof.prof.data import Data
 5 | from apex.pyprof.prof.prof import foo
 6 | 
 7 | 
 8 | class TestPyProfData(unittest.TestCase):
 9 | 
10 | 	def __init__(self, testName):
11 | 		super().__init__(testName)
12 | 
13 | 	def setUp(self):
14 | 		pass
15 | 
16 | 	def tearDown(self):
17 | 		pass
18 | 
19 | 	def test_data(self):
20 | 		kernels = [
21 | 			{'kShortName': 'elementwise_kernel', 'kDuration': 2848, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'float', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 104, 160), 'dtype': 'bool'}]}"], 'seqMarker': ['to, seq = 60471'], 'seqId': [60471], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['float'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (585, 1, 1), 'block': (512, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<512, 1, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, bool>(at::TensorIterator&)::{lambda(bool)#1} const&)::{lambda(int)#1})'},
22 | 			{'kShortName': 'elementwise_kernel', 'kDuration': 201182, 'layer': [], 'trace': [], 'reprMarkers': [], 'marker': ["{'mod': 'Tensor', 'op': 'clone', 'args': [{'name': '', 'type': 'tensor', 'shape': (18, 4, 416, 640), 'dtype': 'float32'}]}"], 'seqMarker': ['clone, seq = 60161'], 'seqId': [60161], 'subSeqId': 0, 'altSeqId': [], 'dir': 'fprop', 'mod': ['Tensor'], 'op': ['clone'], 'tid': 1431533376, 'device': 0, 'stream': 7, 'grid': (37440, 1, 1), 'block': (128, 1, 1), 'kLongName': 'void at::native::elementwise_kernel<128, 4, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2}>(int, void at::native::gpu_kernel_impl<void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1}>(at::TensorIterator&, void at::native::copy_kernel_impl<float, float>(at::TensorIterator&)::{lambda(float)#1} const&)::{lambda(int)#2})'},
23 | 		]
24 | 
25 | 		for k in kernels:
26 | 			d = Data(k)
27 | 			mod = k['mod']
28 | 			op = k['op']
29 | 			xx = foo(mod, op, d)
30 | 			d.setParams(xx.params())
31 | 
32 | 
33 | def run_tests(test_name):
34 | 	dummy = TestPyProfData(test_name)
35 | 	test_cases = list(filter(lambda x: 'test_' in x, map(lambda x: x[0], inspect.getmembers(dummy, predicate=inspect.ismethod))))
36 | 	print(f'Running tests for {test_name}')
37 | 	suite = unittest.TestSuite()
38 | 	for test_case in test_cases:
39 | 		suite.addTest(TestPyProfData(test_case))
40 | 	unittest.TextTestRunner().run(suite)
41 | 
42 | if __name__ == '__main__':
43 | 	run_tests('test_data')
44 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_pyprof_nvtx/__init__.py:
--------------------------------------------------------------------------------
1 | import test_pyprof_nvtx.TestPyProfNvtx as TestPyProfNvtx
2 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L0/run_test.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import sys
 3 | 
 4 | test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"]
 5 | 
 6 | runner = unittest.TextTestRunner(verbosity=2)
 7 | 
 8 | errcode = 0
 9 | 
10 | for test_dir in test_dirs:
11 |     suite = unittest.TestLoader().discover(test_dir)
12 | 
13 |     print("\nExecuting tests from " + test_dir)
14 | 
15 |     result = runner.run(suite)
16 | 
17 |     if not result.wasSuccessful():
18 |         errcode = 1
19 | 
20 | sys.exit(errcode)
21 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L1/common/compare.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import torch
 3 | 
 4 | parser = argparse.ArgumentParser(description='Compare')
 5 | parser.add_argument('--opt-level', type=str)
 6 | parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
 7 | parser.add_argument('--loss-scale', type=str, default=None)
 8 | parser.add_argument('--fused-adam', action='store_true')
 9 | parser.add_argument('--use_baseline', action='store_true')
10 | args = parser.parse_args()
11 | 
12 | base_file = str(args.opt_level) + "_" +\
13 |             str(args.loss_scale) + "_" +\
14 |             str(args.keep_batchnorm_fp32) + "_" +\
15 |             str(args.fused_adam)
16 | 
17 | file_e = "True_" + base_file
18 | file_p = "False_" + base_file
19 | if args.use_baseline:
20 |     file_b = "baselines/True_" + base_file
21 | 
22 | dict_e = torch.load(file_e)
23 | dict_p = torch.load(file_p)
24 | if args.use_baseline:
25 |     dict_b = torch.load(file_b)
26 | 
27 | torch.set_printoptions(precision=10)
28 | 
29 | print(file_e)
30 | print(file_p)
31 | if args.use_baseline:
32 |     print(file_b)
33 | 
34 | # ugly duplication here...
35 | if not args.use_baseline:
36 |     for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
37 |         assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
38 | 
39 |         loss_e = dict_e["Loss"][n]
40 |         loss_p = dict_p["Loss"][n]
41 |         assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
42 |         print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
43 |               i_e,
44 |               loss_e,
45 |               loss_p,
46 |               dict_e["Speed"][n],
47 |               dict_p["Speed"][n]))
48 | else:
49 |     for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
50 |         assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
51 | 
52 |         loss_e = dict_e["Loss"][n]
53 |         loss_p = dict_p["Loss"][n]
54 |         loss_b = dict_b["Loss"][n]
55 |         assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
56 |         assert loss_e == loss_b, "Iteration {}, loss_e = {}, loss_b = {}".format(i_e, loss_e, loss_b)
57 |         print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
58 |               i_e,
59 |               loss_b,
60 |               loss_e,
61 |               loss_p,
62 |               dict_b["Speed"][n],
63 |               dict_e["Speed"][n],
64 |               dict_p["Speed"][n]))
65 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L1/cross_product/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
4 | # DATADIR="/opt/home/apex/examples/imagenet/"
5 | cp ../common/* .
6 | bash run_test.sh single_gpu $1
7 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/L1/cross_product_distributed/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cp ../common/* .
4 | bash run_test.sh distributed $1
5 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/distributed/DDP/ddp_race_condition_test.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.distributed as dist
 3 | from torch.nn import Parameter
 4 | from torch.nn import Module
 5 | from apex.parallel import DistributedDataParallel as DDP
 6 | import argparse
 7 | import os
 8 | 
 9 | 
10 | parser = argparse.ArgumentParser(description='allreduce hook example')
11 | parser.add_argument("--local_rank", default=0, type=int)
12 | args = parser.parse_args()
13 | 
14 | args.distributed = False
15 | if 'WORLD_SIZE' in os.environ:
16 |     args.distributed = int(os.environ['WORLD_SIZE']) > 1
17 | 
18 | if args.distributed:
19 |     args.gpu = args.local_rank % torch.cuda.device_count()
20 |     torch.cuda.set_device(args.gpu)
21 |     torch.distributed.init_process_group(backend='nccl',
22 |                                          init_method='env://')
23 |     args.world_size = torch.distributed.get_world_size()
24 | 
25 | torch.set_printoptions(precision=10)
26 | torch.manual_seed(args.local_rank)
27 | 
28 | class Model(Module):
29 |     def __init__(self):
30 |         super(Model, self).__init__()
31 |         self.a = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(1.0))
32 |         self.b = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(2.0))
33 |     def forward(self, input):
34 |         return (input*self.a)*self.b
35 | 
36 | model = Model()
37 | # model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
38 | # model = DDP(model, delay_allreduce=True)
39 | # model = DDP(model, message_size=1, allreduce_trigger_params=[model.b])
40 | model = DDP(model, message_size=1, allreduce_trigger_params=[model.b], num_allreduce_streams=3)
41 | 
42 | x = torch.cuda.FloatTensor(4096*4096)
43 | 
44 | passed = True
45 | torch.cuda.cudart().cudaProfilerStart()
46 | for i in range(10):
47 |     x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity
48 |     model.zero_grad()
49 |     out = model(x)
50 |     loss = out.sum()
51 |     # torch.cuda.nvtx.range_push("backward")
52 |     loss.backward()
53 |     # torch.cuda.nvtx.range_pop()
54 |     
55 |     # torch.cuda.nvtx.range_push("synchronize() + info")
56 |     # torch.cuda.synchronize()
57 |     print("i = {}".format(i))
58 |     def info(name, param, val):
59 |         expected = val*4096*4096*(2.*i+1)/2.
60 |         actual = param.grad.data.sum().item()
61 |         print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
62 |               param.grad.data_ptr(), expected, actual))
63 |         return (expected == actual)
64 |     if not info("model.a", model.module.a, 2.):  passed = False
65 |     if not info("model.b", model.module.b, 1.):  passed = False
66 |     # torch.cuda.nvtx.range_pop()
67 | torch.cuda.cudart().cudaProfilerStop()
68 | 
69 | print("passed = ", passed)
70 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/distributed/DDP/run_race_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 ddp_race_condition_test.py
4 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/distributed/amp_master_params/amp_master_params.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import os
 4 | from apex import amp
 5 | # FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
 6 | from apex.parallel import DistributedDataParallel
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | # FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
10 | # automatically by torch.distributed.launch.
11 | parser.add_argument("--local_rank", default=0, type=int)
12 | args = parser.parse_args()
13 | 
14 | # FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
15 | # the 'WORLD_SIZE' environment variable will also be set automatically.
16 | args.distributed = False
17 | if 'WORLD_SIZE' in os.environ:
18 |     args.distributed = int(os.environ['WORLD_SIZE']) > 1
19 | 
20 | if args.distributed:
21 |     # FOR DISTRIBUTED:  Set the device according to local_rank.
22 |     torch.cuda.set_device(args.local_rank)
23 | 
24 |     # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
25 |     # environment variables, and requires that you use init_method=`env://`.
26 |     torch.distributed.init_process_group(backend='nccl',
27 |                                          init_method='env://')
28 | 
29 |     torch.manual_seed(torch.distributed.get_rank())
30 | 
31 | torch.backends.cudnn.benchmark = True
32 | 
33 | N, D_in, D_out = 64, 1024, 16
34 | 
35 | # Each process receives its own batch of "fake input data" and "fake target data."
36 | # The "training loop" in each process just uses this fake batch over and over.
37 | # https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
38 | # example of distributed data sampling for both training and validation.
39 | x = torch.randn(N, D_in, device='cuda')
40 | y = torch.randn(N, D_out, device='cuda')
41 | 
42 | model = torch.nn.Linear(D_in, D_out).cuda()
43 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
44 | 
45 | model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
46 | 
47 | if args.distributed:
48 |     # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
49 |     # apex.parallel.DistributedDataParallel.
50 |     model = DistributedDataParallel(model)
51 |     # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
52 |     # model = torch.nn.parallel.DistributedDataParallel(model,
53 |     #                                                   device_ids=[args.local_rank],
54 |     #                                                   output_device=args.local_rank)
55 | 
56 | loss_fn = torch.nn.MSELoss()
57 | 
58 | for t in range(500):
59 |     optimizer.zero_grad()
60 |     y_pred = model(x)
61 |     loss = loss_fn(y_pred, y)
62 |     with amp.scale_loss(loss, optimizer) as scaled_loss:
63 |         scaled_loss.backward()
64 |     optimizer.step()
65 | 
66 | if args.local_rank == 0:
67 |     print("final loss = ", loss)
68 | 
69 | torch.save(list(model.parameters()), "rank{}model.pth".format(torch.distributed.get_rank()))
70 | torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(torch.distributed.get_rank()))
71 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/distributed/amp_master_params/compare.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | model_params_rank0 = torch.load("rank0model.pth",
 4 |                            map_location = lambda storage, loc: storage.cuda(0))
 5 | model_params_rank1 = torch.load("rank1model.pth",
 6 |                                  map_location = lambda storage, loc: storage.cuda(0))
 7 | master_params_rank0 = torch.load("rank0master.pth",
 8 |                                  map_location = lambda storage, loc: storage.cuda(0))
 9 | master_params_rank1 = torch.load("rank1master.pth",
10 |                                  map_location = lambda storage, loc: storage.cuda(0))
11 | 
12 | for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
13 |         model_params_rank0,
14 |         model_params_rank1,
15 |         master_params_rank0,
16 |         master_params_rank1):
17 |     assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
18 |     assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
19 |     # Some debugging/investigation assistance code:
20 |     # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0)
21 |     # offending_val_half = model_rank0.view(-1)[maxind.item()]
22 |     # offending_val_float = master_rank0.view(-1)[maxind.item()]
23 |     # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
24 |     #       offending_val_float.half().item())
25 |     # rtol needs to be > 2^-11 because of denormals...
26 |     assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch"
27 | 
28 | print("OK:  Model and master params match across ranks.")
29 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/distributed/amp_master_params/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py
3 | 
4 | python compare.py
5 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/distributed/synced_batchnorm/test_batchnorm1d.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import apex
 3 | 
 4 | model = apex.parallel.SyncBatchNorm(4).cuda()
 5 | model.weight.data.uniform_()
 6 | model.bias.data.uniform_()
 7 | data = torch.rand((8,4)).cuda()
 8 | 
 9 | model_ref = torch.nn.BatchNorm1d(4).cuda()
10 | model_ref.load_state_dict(model.state_dict())
11 | data_ref = data.clone()
12 | 
13 | output = model(data)
14 | output_ref = model_ref(data_ref)
15 | 
16 | assert(output.allclose(output_ref))
17 | assert(model.running_mean.allclose(model_ref.running_mean))
18 | assert(model.running_var.allclose(model_ref.running_var))
19 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/distributed/synced_batchnorm/unit_test.sh:
--------------------------------------------------------------------------------
1 | python python_single_gpu_unit_test.py
2 | python single_gpu_unit_test.py
3 | python test_batchnorm1d.py
4 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py
5 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp16
6 | python -m torch.distributed.launch --nproc_per_node=2 two_gpu_test_different_batch_size.py --apex
7 | #beware, you need a system with at least 4 gpus to test group_size<world_size
8 | #python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2
9 | 


--------------------------------------------------------------------------------
/ghost/apex/tests/docker_extension_builds/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | print_banner() {
 4 |   printf "\n\n\n\e[30m\e[42m$1\e[0m\n\n\n\n"
 5 | }
 6 | 
 7 | print_green() {
 8 |   printf "\e[30m\e[42m$1\e[0m\n"
 9 | }
10 | 
11 | print_red() {
12 |   printf "\e[30m\e[41m$1\e[0m\n"
13 | }
14 | 
15 | images=(
16 | "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.08-py3-devel"
17 | "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
18 | "pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
19 | "pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel"
20 | "pytorch/pytorch:1.0.1-cuda10.0-cudnn7-devel"
21 | "pytorch/pytorch:1.0-cuda10.0-cudnn7-devel"
22 | "pytorch/pytorch:nightly-devel-cuda9.2-cudnn7"
23 | )
24 | 
25 | branch="master"
26 | 
27 | # Associative array for exit codes
28 | declare -A exit_codes
29 | for image in images
30 | do
31 |   exit_codes[$image]="None"
32 | done
33 | 
34 | for image in "${images[@]}"
35 | do
36 |   print_banner "$image"
37 |   set -x
38 |   docker pull $image
39 |   # Trying python setup.py install instead of pip install to ensure direct access to error codes.
40 |   # Maybe pip install would be ok too but this works.
41 |   docker run --runtime=nvidia --rm $image /bin/bash -c "yes | pip uninstall apex; yes | pip uninstall apex; git clone https://github.com/NVIDIA/apex.git; cd apex; git checkout $branch; set -e;  python setup.py install --cuda_ext --cpp_ext"
42 |   exit_code=$?
43 |   set +x
44 |   if [ $exit_code != 0 ]
45 |   then
46 |     print_red "Exit code: $exit_code"
47 |   else
48 |     print_green "Exit code: $exit_code"
49 |   fi
50 |   exit_codes[$image]=$exit_code
51 | done
52 | 
53 | success=0
54 | for image in "${images[@]}"
55 | do
56 |   exit_code=${exit_codes[$image]}
57 |   if [ $exit_code != 0 ]
58 |   then
59 |     print_red "$image : $exit_code"
60 |     success=1
61 |   else
62 |     print_green "$image : $exit_code"
63 |   fi
64 | done
65 | 
66 | if [ $success != 0 ]
67 | then
68 |   print_red "Overall status:  failure"
69 | else
70 |   print_green "Overall status:  success"
71 | fi
72 | 
73 | exit $success
74 | 


--------------------------------------------------------------------------------
/ghost/coordinate_reg/model/2d106det-0000.params:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/coordinate_reg/model/2d106det-0000.params


--------------------------------------------------------------------------------
/ghost/coordinate_reg/model/2d106det.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/coordinate_reg/model/2d106det.zip


--------------------------------------------------------------------------------
/ghost/coordinate_reg/model/2d106det/2d106det-0000.params:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/coordinate_reg/model/2d106det/2d106det-0000.params


--------------------------------------------------------------------------------
/ghost/coordinate_reg/model/SCRFD/model_25GF.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/coordinate_reg/model/SCRFD/model_25GF.pth


--------------------------------------------------------------------------------
/ghost/download_models.sh:
--------------------------------------------------------------------------------
 1 | # cd
 2 | 
 3 | # load arcface
 4 | wget -P ./arcface_model https://github.com/sberbank-ai/sber-swap/releases/download/arcface/backbone.pth
 5 | wget -P ./arcface_model https://github.com/sberbank-ai/sber-swap/releases/download/arcface/iresnet.py
 6 | 
 7 | # load landmarks detector
 8 | wget -P ./insightface_func/models/antelope https://github.com/sberbank-ai/sber-swap/releases/download/antelope/glintr100.onnx
 9 | wget -P ./insightface_func/models/antelope https://github.com/sberbank-ai/sber-swap/releases/download/antelope/scrfd_10g_bnkps.onnx
10 | 
11 | # load G and D models with 1, 2, 3 blocks
12 | # model with 2 blocks is main
13 | wget -P ./weights https://github.com/sberbank-ai/sber-swap/releases/download/sber-swap-v2.0/G_unet_2blocks.pth
14 | wget -P ./weights https://github.com/sberbank-ai/sber-swap/releases/download/sber-swap-v2.0/D_unet_2blocks.pth
15 | 
16 | wget -P ./weights https://github.com/sberbank-ai/sber-swap/releases/download/sber-swap-v2.0/G_unet_1block.pth
17 | wget -P ./weights https://github.com/sberbank-ai/sber-swap/releases/download/sber-swap-v2.0/D_unet_1block.pth
18 | 
19 | wget -P ./weights https://github.com/sberbank-ai/sber-swap/releases/download/sber-swap-v2.0/G_unet_3blocks.pth
20 | wget -P ./weights https://github.com/sberbank-ai/sber-swap/releases/download/sber-swap-v2.0/D_unet_3blocks.pth
21 | 
22 | # load model for eyes loss
23 | wget -P ./AdaptiveWingLoss/AWL_detector https://github.com/sberbank-ai/sber-swap/releases/download/awl_detector/WFLW_4HG.pth
24 | 
25 | # load super res model
26 | wget -P ./weights https://github.com/sberbank-ai/sber-swap/releases/download/super-res/10_net_G.pth
27 | 


--------------------------------------------------------------------------------
/ghost/examples/images/beckham.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/beckham.jpg


--------------------------------------------------------------------------------
/ghost/examples/images/elon_musk.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/elon_musk.jpg


--------------------------------------------------------------------------------
/ghost/examples/images/example1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/example1.png


--------------------------------------------------------------------------------
/ghost/examples/images/example2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/example2.png


--------------------------------------------------------------------------------
/ghost/examples/images/mark.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/mark.jpg


--------------------------------------------------------------------------------
/ghost/examples/images/murakami.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/murakami.jpg


--------------------------------------------------------------------------------
/ghost/examples/images/p1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/p1.jpg


--------------------------------------------------------------------------------
/ghost/examples/images/p2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/p2.jpg


--------------------------------------------------------------------------------
/ghost/examples/images/tgt1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/tgt1.png


--------------------------------------------------------------------------------
/ghost/examples/images/tgt2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/tgt2.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/source1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/source1.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/source2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/source2.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/source3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/source3.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/source4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/source4.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/source5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/source5.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/source6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/source6.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/target1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/target1.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/target2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/target2.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/target3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/target3.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/target4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/target4.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/target5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/target5.png


--------------------------------------------------------------------------------
/ghost/examples/images/training/target6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/images/training/target6.png


--------------------------------------------------------------------------------
/ghost/examples/results/result.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/results/result.mp4


--------------------------------------------------------------------------------
/ghost/examples/results/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/results/result.png


--------------------------------------------------------------------------------
/ghost/examples/results/result_multi.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/results/result_multi.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/dance.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/dance.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/dirtydancing.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/dirtydancing.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/elon.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/elon.webp


--------------------------------------------------------------------------------
/ghost/examples/videos/inVideo1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/inVideo1.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/inVideo2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/inVideo2.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/inVideo3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/inVideo3.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/inVideo4.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/inVideo4.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/inVideo5.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/inVideo5.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/khabenskii.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/khabenskii.webp


--------------------------------------------------------------------------------
/ghost/examples/videos/mark.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/mark.webp


--------------------------------------------------------------------------------
/ghost/examples/videos/nggyup.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/nggyup.mp4


--------------------------------------------------------------------------------
/ghost/examples/videos/orig.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/orig.webp


--------------------------------------------------------------------------------
/ghost/examples/videos/random_gif.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/examples/videos/random_gif.gif


--------------------------------------------------------------------------------
/ghost/insightface_func/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/insightface_func/__init__.py


--------------------------------------------------------------------------------
/ghost/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import ArcMarginModel
2 | from .models import ResNet
3 | from .models import IRBlock
4 | from .models import SEBlock


--------------------------------------------------------------------------------
/ghost/models/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | 
 5 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # sets device for model and PyTorch tensors
 6 | 
 7 | # Model parameters
 8 | image_w = 112
 9 | image_h = 112
10 | channel = 3
11 | emb_size = 512
12 | 
13 | # Training parameters
14 | num_workers = 1  # for data-loading; right now, only 1 works with h5py
15 | grad_clip = 5.  # clip gradients at an absolute value of
16 | print_freq = 100  # print training/validation stats  every __ batches
17 | checkpoint = None  # path to checkpoint, None if none
18 | 
19 | # Data parameters
20 | num_classes = 93431
21 | num_samples = 5179510
22 | DATA_DIR = 'data'
23 | # faces_ms1m_folder = 'data/faces_ms1m_112x112'
24 | faces_ms1m_folder = 'data/ms1m-retinaface-t1'
25 | path_imgidx = os.path.join(faces_ms1m_folder, 'train.idx')
26 | path_imgrec = os.path.join(faces_ms1m_folder, 'train.rec')
27 | IMG_DIR = 'data/images'
28 | pickle_file = 'data/faces_ms1m_112x112.pickle'
29 | 


--------------------------------------------------------------------------------
/ghost/models/config_sr.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | class TestOptions(object):
 4 |     name = 'weights'
 5 |     results_dir = './results/'
 6 |     gpu_ids = [0]
 7 |     crop_size = 256
 8 |     dataset_mode = 'test'
 9 |     which_epoch = '10'
10 | 
11 |     aspect_ratio = 1.0
12 |     checkpoints_dir = ''
13 |     
14 |     init_type = 'xavier'
15 |     init_variance = 0.02
16 |     isTrain = False
17 |     is_test = True
18 |     semantic_nc = 3
19 |     
20 |     model = 'pix2pix'
21 |     netG = 'lipspade'
22 |     nef = 16
23 |     ngf = 48 
24 |     
25 |     norm_G = 'spectralspadesyncbatch3x3'
26 |     num_upsampling_layers = 'normal'
27 |     phase = 'test'
28 |     use_vae = False
29 |     z_dim = 256
30 | 


--------------------------------------------------------------------------------
/ghost/models/networks/Synchronized-BatchNorm-PyTorch/sync_batchnorm/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # File   : __init__.py
 3 | # Author : Jiayuan Mao
 4 | # Email  : maojiayuan@gmail.com
 5 | # Date   : 27/01/2018
 6 | #
 7 | # This file is part of Synchronized-BatchNorm-PyTorch.
 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
 9 | # Distributed under MIT License.
10 | 
11 | from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d
12 | from .batchnorm import patch_sync_batchnorm, convert_model
13 | from .replicate import DataParallelWithCallback, patch_replication_callback
14 | 


--------------------------------------------------------------------------------
/ghost/models/networks/Synchronized-BatchNorm-PyTorch/sync_batchnorm/batchnorm_reimpl.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | # File   : batchnorm_reimpl.py
 4 | # Author : acgtyrant
 5 | # Date   : 11/01/2018
 6 | #
 7 | # This file is part of Synchronized-BatchNorm-PyTorch.
 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
 9 | # Distributed under MIT License.
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.init as init
14 | 
15 | __all__ = ['BatchNorm2dReimpl']
16 | 
17 | 
18 | class BatchNorm2dReimpl(nn.Module):
19 |     """
20 |     A re-implementation of batch normalization, used for testing the numerical
21 |     stability.
22 | 
23 |     Author: acgtyrant
24 |     See also:
25 |     https://github.com/vacancy/Synchronized-BatchNorm-PyTorch/issues/14
26 |     """
27 |     def __init__(self, num_features, eps=1e-5, momentum=0.1):
28 |         super().__init__()
29 | 
30 |         self.num_features = num_features
31 |         self.eps = eps
32 |         self.momentum = momentum
33 |         self.weight = nn.Parameter(torch.empty(num_features))
34 |         self.bias = nn.Parameter(torch.empty(num_features))
35 |         self.register_buffer('running_mean', torch.zeros(num_features))
36 |         self.register_buffer('running_var', torch.ones(num_features))
37 |         self.reset_parameters()
38 | 
39 |     def reset_running_stats(self):
40 |         self.running_mean.zero_()
41 |         self.running_var.fill_(1)
42 | 
43 |     def reset_parameters(self):
44 |         self.reset_running_stats()
45 |         init.uniform_(self.weight)
46 |         init.zeros_(self.bias)
47 | 
48 |     def forward(self, input_):
49 |         batchsize, channels, height, width = input_.size()
50 |         numel = batchsize * height * width
51 |         input_ = input_.permute(1, 0, 2, 3).contiguous().view(channels, numel)
52 |         sum_ = input_.sum(1)
53 |         sum_of_square = input_.pow(2).sum(1)
54 |         mean = sum_ / numel
55 |         sumvar = sum_of_square - sum_ * mean
56 | 
57 |         self.running_mean = (
58 |                 (1 - self.momentum) * self.running_mean
59 |                 + self.momentum * mean.detach()
60 |         )
61 |         unbias_var = sumvar / (numel - 1)
62 |         self.running_var = (
63 |                 (1 - self.momentum) * self.running_var
64 |                 + self.momentum * unbias_var.detach()
65 |         )
66 | 
67 |         bias_var = sumvar / numel
68 |         inv_std = 1 / (bias_var + self.eps).pow(0.5)
69 |         output = (
70 |                 (input_ - mean.unsqueeze(1)) * inv_std.unsqueeze(1) *
71 |                 self.weight.unsqueeze(1) + self.bias.unsqueeze(1))
72 | 
73 |         return output.view(channels, batchsize, height, width).permute(1, 0, 2, 3).contiguous()
74 | 
75 | 


--------------------------------------------------------------------------------
/ghost/models/networks/Synchronized-BatchNorm-PyTorch/sync_batchnorm/unittest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # File   : unittest.py
 3 | # Author : Jiayuan Mao
 4 | # Email  : maojiayuan@gmail.com
 5 | # Date   : 27/01/2018
 6 | #
 7 | # This file is part of Synchronized-BatchNorm-PyTorch.
 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
 9 | # Distributed under MIT License.
10 | 
11 | import unittest
12 | import torch
13 | 
14 | 
15 | class TorchTestCase(unittest.TestCase):
16 |     def assertTensorClose(self, x, y):
17 |         adiff = float((x - y).abs().max())
18 |         if (y == 0).all():
19 |             rdiff = 'NaN'
20 |         else:
21 |             rdiff = float((adiff / y).abs().max())
22 | 
23 |         message = (
24 |             'Tensor close check failed\n'
25 |             'adiff={}\n'
26 |             'rdiff={}\n'
27 |         ).format(adiff, rdiff)
28 |         self.assertTrue(torch.allclose(x, y), message)
29 | 
30 | 


--------------------------------------------------------------------------------
/ghost/models/networks/Synchronized-BatchNorm-PyTorch/tests/test_numeric_batchnorm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # File   : test_numeric_batchnorm.py
 3 | # Author : Jiayuan Mao
 4 | # Email  : maojiayuan@gmail.com
 5 | # Date   : 27/01/2018
 6 | #
 7 | # This file is part of Synchronized-BatchNorm-PyTorch.
 8 | 
 9 | import unittest
10 | 
11 | import torch
12 | import torch.nn as nn
13 | from torch.autograd import Variable
14 | 
15 | from sync_batchnorm.unittest import TorchTestCase
16 | 
17 | 
18 | def handy_var(a, unbias=True):
19 |     n = a.size(0)
20 |     asum = a.sum(dim=0)
21 |     as_sum = (a ** 2).sum(dim=0)  # a square sum
22 |     sumvar = as_sum - asum * asum / n
23 |     if unbias:
24 |         return sumvar / (n - 1)
25 |     else:
26 |         return sumvar / n
27 | 
28 | 
29 | class NumericTestCase(TorchTestCase):
30 |     def testNumericBatchNorm(self):
31 |         a = torch.rand(16, 10)
32 |         bn = nn.BatchNorm1d(10, momentum=1, eps=1e-5, affine=False)
33 |         bn.train()
34 | 
35 |         a_var1 = Variable(a, requires_grad=True)
36 |         b_var1 = bn(a_var1)
37 |         loss1 = b_var1.sum()
38 |         loss1.backward()
39 | 
40 |         a_var2 = Variable(a, requires_grad=True)
41 |         a_mean2 = a_var2.mean(dim=0, keepdim=True)
42 |         a_std2 = torch.sqrt(handy_var(a_var2, unbias=False).clamp(min=1e-5))
43 |         # a_std2 = torch.sqrt(a_var2.var(dim=0, keepdim=True, unbiased=False) + 1e-5)
44 |         b_var2 = (a_var2 - a_mean2) / a_std2
45 |         loss2 = b_var2.sum()
46 |         loss2.backward()
47 | 
48 |         self.assertTensorClose(bn.running_mean, a.mean(dim=0))
49 |         self.assertTensorClose(bn.running_var, handy_var(a))
50 |         self.assertTensorClose(a_var1.data, a_var2.data)
51 |         self.assertTensorClose(b_var1.data, b_var2.data)
52 |         self.assertTensorClose(a_var1.grad, a_var2.grad)
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     unittest.main()
57 | 


--------------------------------------------------------------------------------
/ghost/models/networks/Synchronized-BatchNorm-PyTorch/tests/test_numeric_batchnorm_v2.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | # File   : test_numeric_batchnorm_v2.py
 4 | # Author : Jiayuan Mao
 5 | # Email  : maojiayuan@gmail.com
 6 | # Date   : 11/01/2018
 7 | #
 8 | # Distributed under terms of the MIT license.
 9 | 
10 | """
11 | Test the numerical implementation of batch normalization.
12 | 
13 | Author: acgtyrant.
14 | See also: https://github.com/vacancy/Synchronized-BatchNorm-PyTorch/issues/14
15 | """
16 | 
17 | import unittest
18 | 
19 | import torch
20 | import torch.nn as nn
21 | import torch.optim as optim
22 | 
23 | from sync_batchnorm.unittest import TorchTestCase
24 | from sync_batchnorm.batchnorm_reimpl import BatchNorm2dReimpl
25 | 
26 | 
27 | class NumericTestCasev2(TorchTestCase):
28 |     def testNumericBatchNorm(self):
29 |         CHANNELS = 16
30 |         batchnorm1 = nn.BatchNorm2d(CHANNELS, momentum=1)
31 |         optimizer1 = optim.SGD(batchnorm1.parameters(), lr=0.01)
32 | 
33 |         batchnorm2 = BatchNorm2dReimpl(CHANNELS, momentum=1)
34 |         batchnorm2.weight.data.copy_(batchnorm1.weight.data)
35 |         batchnorm2.bias.data.copy_(batchnorm1.bias.data)
36 |         optimizer2 = optim.SGD(batchnorm2.parameters(), lr=0.01)
37 | 
38 |         for _ in range(100):
39 |             input_ = torch.rand(16, CHANNELS, 16, 16)
40 | 
41 |             input1 = input_.clone().requires_grad_(True)
42 |             output1 = batchnorm1(input1)
43 |             output1.sum().backward()
44 |             optimizer1.step()
45 | 
46 |             input2 = input_.clone().requires_grad_(True)
47 |             output2 = batchnorm2(input2)
48 |             output2.sum().backward()
49 |             optimizer2.step()
50 | 
51 |         self.assertTensorClose(input1, input2)
52 |         self.assertTensorClose(output1, output2)
53 |         self.assertTensorClose(input1.grad, input2.grad)
54 |         self.assertTensorClose(batchnorm1.weight.grad, batchnorm2.weight.grad)
55 |         self.assertTensorClose(batchnorm1.bias.grad, batchnorm2.bias.grad)
56 |         self.assertTensorClose(batchnorm1.running_mean, batchnorm2.running_mean)
57 |         self.assertTensorClose(batchnorm2.running_mean, batchnorm2.running_mean)
58 | 
59 | 
60 | if __name__ == '__main__':
61 |     unittest.main()
62 | 
63 | 


--------------------------------------------------------------------------------
/ghost/models/networks/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (C) 2019 NVIDIA Corporation.  All rights reserved.
 3 | Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
 4 | """
 5 | 
 6 | import torch
 7 | from models.networks.base_network import BaseNetwork
 8 | from models.networks.loss import *
 9 | from models.networks.discriminator import *
10 | from models.networks.generator import *
11 | from models.networks.encoder import *
12 | import utils.inference.util as util
13 | 
14 | 
15 | def find_network_using_name(target_network_name, filename):
16 |     target_class_name = target_network_name + filename
17 |     module_name = 'models.networks.' + filename
18 |     network = util.find_class_in_module(target_class_name, module_name)
19 | 
20 |     assert issubclass(network, BaseNetwork), \
21 |         "Class %s should be a subclass of BaseNetwork" % network
22 | 
23 |     return network
24 | 
25 | 
26 | def modify_commandline_options(parser, is_train):
27 |     opt, _ = parser.parse_known_args()
28 | 
29 |     netG_cls = find_network_using_name(opt.netG, 'generator')
30 |     parser = netG_cls.modify_commandline_options(parser, is_train)
31 |     if is_train:
32 |         netD_cls = find_network_using_name(opt.netD, 'discriminator')
33 |         parser = netD_cls.modify_commandline_options(parser, is_train)
34 |     netE_cls = find_network_using_name('conv', 'encoder')
35 |     parser = netE_cls.modify_commandline_options(parser, is_train)
36 | 
37 |     return parser
38 | 
39 | 
40 | def create_network(cls, opt):
41 |     net = cls(opt)
42 |     net.print_network()
43 |     if len(opt.gpu_ids) > 0:
44 |         assert(torch.cuda.is_available())
45 |         net.cuda()
46 |     net.init_weights(opt.init_type, opt.init_variance)
47 |     return net
48 | 
49 | 
50 | def define_G(opt):
51 |     netG_cls = find_network_using_name(opt.netG, 'generator')
52 |     return create_network(netG_cls, opt)
53 | 
54 | 
55 | def define_D(opt):
56 |     netD_cls = find_network_using_name(opt.netD, 'discriminator')
57 |     return create_network(netD_cls, opt)
58 | 
59 | 
60 | def define_E(opt):
61 |     # there exists only one encoder type
62 |     netE_cls = find_network_using_name('conv', 'encoder')
63 |     return create_network(netE_cls, opt)
64 | 


--------------------------------------------------------------------------------
/ghost/models/networks/base_network.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (C) 2019 NVIDIA Corporation.  All rights reserved.
 3 | Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
 4 | """
 5 | 
 6 | import torch.nn as nn
 7 | from torch.nn import init
 8 | 
 9 | 
10 | class BaseNetwork(nn.Module):
11 |     def __init__(self):
12 |         super(BaseNetwork, self).__init__()
13 | 
14 |     @staticmethod
15 |     def modify_commandline_options(parser, is_train):
16 |         return parser
17 | 
18 |     def print_network(self):
19 |         if isinstance(self, list):
20 |             self = self[0]
21 |         num_params = 0
22 |         for param in self.parameters():
23 |             num_params += param.numel()
24 |         print('Network [%s] was created. Total number of parameters: %.1f million. '
25 |               'To see the architecture, do print(network).'
26 |               % (type(self).__name__, num_params / 1000000))
27 | 
28 |     def init_weights(self, init_type='normal', gain=0.02):
29 |         def init_func(m):
30 |             classname = m.__class__.__name__
31 |             if classname.find('BatchNorm2d') != -1:
32 |                 if hasattr(m, 'weight') and m.weight is not None:
33 |                     init.normal_(m.weight.data, 1.0, gain)
34 |                 if hasattr(m, 'bias') and m.bias is not None:
35 |                     init.constant_(m.bias.data, 0.0)
36 |             elif hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
37 |                 if init_type == 'normal':
38 |                     init.normal_(m.weight.data, 0.0, gain)
39 |                 elif init_type == 'xavier':
40 |                     init.xavier_normal_(m.weight.data, gain=gain)
41 |                 elif init_type == 'xavier_uniform':
42 |                     init.xavier_uniform_(m.weight.data, gain=1.0)
43 |                 elif init_type == 'kaiming':
44 |                     init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
45 |                 elif init_type == 'orthogonal':
46 |                     init.orthogonal_(m.weight.data, gain=gain)
47 |                 elif init_type == 'none':  # uses pytorch's default init method
48 |                     m.reset_parameters()
49 |                 else:
50 |                     raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
51 |                 if hasattr(m, 'bias') and m.bias is not None:
52 |                     init.constant_(m.bias.data, 0.0)
53 | 
54 |         self.apply(init_func)
55 | 
56 |         # propagate to children
57 |         for m in self.children():
58 |             if hasattr(m, 'init_weights'):
59 |                 m.init_weights(init_type, gain)
60 | 


--------------------------------------------------------------------------------
/ghost/models/networks/encoder.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (C) 2019 NVIDIA Corporation.  All rights reserved.
 3 | Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
 4 | """
 5 | 
 6 | import torch.nn as nn
 7 | import numpy as np
 8 | import torch.nn.functional as F
 9 | from models.networks.base_network import BaseNetwork
10 | from models.networks.normalization import get_nonspade_norm_layer
11 | 
12 | 
13 | class ConvEncoder(BaseNetwork):
14 |     """ Same architecture as the image discriminator """
15 | 
16 |     def __init__(self, opt):
17 |         super().__init__()
18 | 
19 |         kw = 3
20 |         pw = int(np.ceil((kw - 1.0) / 2))
21 |         ndf = opt.ngf
22 |         norm_layer = get_nonspade_norm_layer(opt, opt.norm_E)
23 |         self.layer1 = norm_layer(nn.Conv2d(3, ndf, kw, stride=2, padding=pw))
24 |         self.layer2 = norm_layer(nn.Conv2d(ndf * 1, ndf * 2, kw, stride=2, padding=pw))
25 |         self.layer3 = norm_layer(nn.Conv2d(ndf * 2, ndf * 4, kw, stride=2, padding=pw))
26 |         self.layer4 = norm_layer(nn.Conv2d(ndf * 4, ndf * 8, kw, stride=2, padding=pw))
27 |         self.layer5 = norm_layer(nn.Conv2d(ndf * 8, ndf * 8, kw, stride=2, padding=pw))
28 |         if opt.crop_size >= 256:
29 |             self.layer6 = norm_layer(nn.Conv2d(ndf * 8, ndf * 8, kw, stride=2, padding=pw))
30 | 
31 |         self.so = s0 = 4
32 |         self.fc_mu = nn.Linear(ndf * 8 * s0 * s0, 256)
33 |         self.fc_var = nn.Linear(ndf * 8 * s0 * s0, 256)
34 | 
35 |         self.actvn = nn.LeakyReLU(0.2, False)
36 |         self.opt = opt
37 | 
38 |     def forward(self, x):
39 |         if x.size(2) != 256 or x.size(3) != 256:
40 |             x = F.interpolate(x, size=(256, 256), mode='bilinear')
41 | 
42 |         x = self.layer1(x)
43 |         x = self.layer2(self.actvn(x))
44 |         x = self.layer3(self.actvn(x))
45 |         x = self.layer4(self.actvn(x))
46 |         x = self.layer5(self.actvn(x))
47 |         if self.opt.crop_size >= 256:
48 |             x = self.layer6(self.actvn(x))
49 |         x = self.actvn(x)
50 | 
51 |         x = x.view(x.size(0), -1)
52 |         mu = self.fc_mu(x)
53 |         logvar = self.fc_var(x)
54 | 
55 |         return mu, logvar
56 | 


--------------------------------------------------------------------------------
/ghost/models/networks/sync_batchnorm/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # File   : __init__.py
 3 | # Author : Jiayuan Mao
 4 | # Email  : maojiayuan@gmail.com
 5 | # Date   : 27/01/2018
 6 | #
 7 | # This file is part of Synchronized-BatchNorm-PyTorch.
 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
 9 | # Distributed under MIT License.
10 | 
11 | from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d
12 | from .batchnorm import patch_sync_batchnorm, convert_model
13 | from .replicate import DataParallelWithCallback, patch_replication_callback
14 | 


--------------------------------------------------------------------------------
/ghost/models/networks/sync_batchnorm/batchnorm_reimpl.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | # File   : batchnorm_reimpl.py
 4 | # Author : acgtyrant
 5 | # Date   : 11/01/2018
 6 | #
 7 | # This file is part of Synchronized-BatchNorm-PyTorch.
 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
 9 | # Distributed under MIT License.
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.init as init
14 | 
15 | __all__ = ['BatchNorm2dReimpl']
16 | 
17 | 
18 | class BatchNorm2dReimpl(nn.Module):
19 |     """
20 |     A re-implementation of batch normalization, used for testing the numerical
21 |     stability.
22 | 
23 |     Author: acgtyrant
24 |     See also:
25 |     https://github.com/vacancy/Synchronized-BatchNorm-PyTorch/issues/14
26 |     """
27 |     def __init__(self, num_features, eps=1e-5, momentum=0.1):
28 |         super().__init__()
29 | 
30 |         self.num_features = num_features
31 |         self.eps = eps
32 |         self.momentum = momentum
33 |         self.weight = nn.Parameter(torch.empty(num_features))
34 |         self.bias = nn.Parameter(torch.empty(num_features))
35 |         self.register_buffer('running_mean', torch.zeros(num_features))
36 |         self.register_buffer('running_var', torch.ones(num_features))
37 |         self.reset_parameters()
38 | 
39 |     def reset_running_stats(self):
40 |         self.running_mean.zero_()
41 |         self.running_var.fill_(1)
42 | 
43 |     def reset_parameters(self):
44 |         self.reset_running_stats()
45 |         init.uniform_(self.weight)
46 |         init.zeros_(self.bias)
47 | 
48 |     def forward(self, input_):
49 |         batchsize, channels, height, width = input_.size()
50 |         numel = batchsize * height * width
51 |         input_ = input_.permute(1, 0, 2, 3).contiguous().view(channels, numel)
52 |         sum_ = input_.sum(1)
53 |         sum_of_square = input_.pow(2).sum(1)
54 |         mean = sum_ / numel
55 |         sumvar = sum_of_square - sum_ * mean
56 | 
57 |         self.running_mean = (
58 |                 (1 - self.momentum) * self.running_mean
59 |                 + self.momentum * mean.detach()
60 |         )
61 |         unbias_var = sumvar / (numel - 1)
62 |         self.running_var = (
63 |                 (1 - self.momentum) * self.running_var
64 |                 + self.momentum * unbias_var.detach()
65 |         )
66 | 
67 |         bias_var = sumvar / numel
68 |         inv_std = 1 / (bias_var + self.eps).pow(0.5)
69 |         output = (
70 |                 (input_ - mean.unsqueeze(1)) * inv_std.unsqueeze(1) *
71 |                 self.weight.unsqueeze(1) + self.bias.unsqueeze(1))
72 | 
73 |         return output.view(channels, batchsize, height, width).permute(1, 0, 2, 3).contiguous()
74 | 
75 | 


--------------------------------------------------------------------------------
/ghost/models/networks/sync_batchnorm/unittest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # File   : unittest.py
 3 | # Author : Jiayuan Mao
 4 | # Email  : maojiayuan@gmail.com
 5 | # Date   : 27/01/2018
 6 | #
 7 | # This file is part of Synchronized-BatchNorm-PyTorch.
 8 | # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
 9 | # Distributed under MIT License.
10 | 
11 | import unittest
12 | import torch
13 | 
14 | 
15 | class TorchTestCase(unittest.TestCase):
16 |     def assertTensorClose(self, x, y):
17 |         adiff = float((x - y).abs().max())
18 |         if (y == 0).all():
19 |             rdiff = 'NaN'
20 |         else:
21 |             rdiff = float((adiff / y).abs().max())
22 | 
23 |         message = (
24 |             'Tensor close check failed\n'
25 |             'adiff={}\n'
26 |             'rdiff={}\n'
27 |         ).format(adiff, rdiff)
28 |         self.assertTrue(torch.allclose(x, y), message)
29 | 
30 | 


--------------------------------------------------------------------------------
/ghost/network/AADLayer.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | 
 5 | class AADLayer(nn.Module):
 6 |     def __init__(self, c_x, attr_c, c_id):
 7 |         super(AADLayer, self).__init__()
 8 |         self.attr_c = attr_c
 9 |         self.c_id = c_id
10 |         self.c_x = c_x
11 | 
12 |         self.conv1 = nn.Conv2d(attr_c, c_x, kernel_size=1, stride=1, padding=0, bias=True)
13 |         self.conv2 = nn.Conv2d(attr_c, c_x, kernel_size=1, stride=1, padding=0, bias=True)
14 |         self.fc1 = nn.Linear(c_id, c_x)
15 |         self.fc2 = nn.Linear(c_id, c_x)
16 |         self.norm = nn.InstanceNorm2d(c_x, affine=False)
17 | 
18 |         self.conv_h = nn.Conv2d(c_x, 1, kernel_size=1, stride=1, padding=0, bias=True)
19 | 
20 |     def forward(self, h_in, z_attr, z_id):
21 |         # h_in cxnxn
22 |         # zid 256x1x1
23 |         # zattr cxnxn
24 |         h = self.norm(h_in)
25 |         gamma_attr = self.conv1(z_attr)
26 |         beta_attr = self.conv2(z_attr)
27 | 
28 |         gamma_id = self.fc1(z_id)
29 |         beta_id = self.fc2(z_id)
30 |         A = gamma_attr * h + beta_attr
31 |         gamma_id = gamma_id.reshape(h.shape[0], self.c_x, 1, 1).expand_as(h)
32 |         beta_id = beta_id.reshape(h.shape[0], self.c_x, 1, 1).expand_as(h)
33 |         I = gamma_id * h + beta_id
34 | 
35 |         M = torch.sigmoid(self.conv_h(h))
36 | 
37 |         out = (torch.ones_like(M).to(M.device) - M) * A + M * I
38 |         return out
39 | 
40 |     
41 | class AddBlocksSequential(nn.Sequential):
42 |     def forward(self, *inputs):
43 |         h, z_attr, z_id = inputs
44 |         for i, module in enumerate(self._modules.values()):
45 |             if i%3 == 0 and i > 0:
46 |                 inputs = (inputs, z_attr, z_id)
47 |             if type(inputs) == tuple:
48 |                 inputs = module(*inputs)
49 |             else:
50 |                 inputs = module(inputs)
51 |         return inputs
52 |         
53 |         
54 | class AAD_ResBlk(nn.Module):
55 |     def __init__(self, cin, cout, c_attr, c_id, num_blocks):
56 |         super(AAD_ResBlk, self).__init__()
57 |         self.cin = cin
58 |         self.cout = cout
59 |         
60 |         add_blocks = []
61 |         for i in range(num_blocks):
62 |             out = cin if i < (num_blocks-1) else cout
63 |             add_blocks.extend([AADLayer(cin, c_attr, c_id),
64 |                                nn.ReLU(inplace=True),
65 |                                nn.Conv2d(cin, out, kernel_size=3, stride=1, padding=1, bias=False)
66 |                               ])
67 |         self.add_blocks = AddBlocksSequential(*add_blocks)
68 |         
69 |         if cin != cout:
70 |             last_add_block = [AADLayer(cin, c_attr, c_id), 
71 |                              nn.ReLU(inplace=True), 
72 |                              nn.Conv2d(cin, cout, kernel_size=3, stride=1, padding=1, bias=False)]
73 |             self.last_add_block = AddBlocksSequential(*last_add_block)
74 |             
75 | 
76 |     def forward(self, h, z_attr, z_id):
77 |         x =  self.add_blocks(h, z_attr, z_id)
78 |         if self.cin != self.cout:
79 |             h = self.last_add_block(h, z_attr, z_id)
80 |         x = x + h
81 |         return x
82 | 
83 | 
84 | 


--------------------------------------------------------------------------------
/ghost/network/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/ghost/network/__init__.py


--------------------------------------------------------------------------------
/ghost/preprocess_vgg.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import cv2
 4 | import argparse
 5 | from insightface_func.face_detect_crop_single import Face_detect_crop
 6 | from pathlib import Path
 7 | from tqdm import tqdm
 8 | 
 9 | def main(args):
10 |     app = Face_detect_crop(name='antelope', root='./insightface_func/models')
11 |     app.prepare(ctx_id= 0, det_thresh=0.6, det_size=(640,640))
12 |     crop_size = 224
13 | 
14 |     dirs = os.listdir(args.path_to_dataset)
15 |     for i in tqdm(range(len(dirs))):
16 |         d = os.path.join(args.path_to_dataset, dirs[i])
17 |         dir_to_save = os.path.join(args.save_path, dirs[i])
18 |         Path(dir_to_save).mkdir(parents=True, exist_ok=True)
19 |         
20 |         image_names = os.listdir(d)
21 |         for image_name in image_names:
22 |             try:
23 |                 image_path = os.path.join(d, image_name)
24 |                 image = cv2.imread(image_path)
25 |                 cropped_image, _ = app.get(image, crop_size)
26 |                 cv2.imwrite(os.path.join(dir_to_save, image_name), cropped_image[0])
27 |             except:
28 |                 pass
29 |         
30 |     
31 | if __name__ == "__main__":
32 |     parser = argparse.ArgumentParser()
33 |     
34 |     parser.add_argument('--path_to_dataset', default='./VggFace2/VGG-Face2/data/preprocess_train', type=str)
35 |     parser.add_argument('--save_path', default='./VggFace2-crop', type=str)
36 |     
37 |     args = parser.parse_args()
38 |     
39 |     main(args)
40 | 


--------------------------------------------------------------------------------
/ghost/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy
 2 | -f https://download.pytorch.org/whl/torch_stable.html
 3 | torch==1.6.0+cu101
 4 | -f https://download.pytorch.org/whl/torch_stable.html
 5 | torchvision==0.7.0+cu101
 6 | opencv-python
 7 | onnx==1.9.0
 8 | onnxruntime-gpu==1.4.0
 9 | mxnet-cu101mkl
10 | scikit-image
11 | insightface==0.2.1
12 | requests==2.25.1
13 | kornia==0.5.4
14 | dill
15 | wandb


--------------------------------------------------------------------------------
/ghost/utils/inference/faceshifter_run.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | 
 4 | 
 5 | def faceshifter_batch(source_emb: torch.tensor, 
 6 |                       target: torch.tensor,
 7 |                       G: torch.nn.Module) -> np.ndarray:
 8 |     """
 9 |     Apply faceshifter model for batch of target images
10 |     """
11 |     
12 |     bs = target.shape[0]
13 |     assert target.ndim == 4, "target should have 4 dimentions -- B x C x H x W"
14 |     
15 |     if bs > 1:
16 |         source_emb = torch.cat([source_emb]*bs)
17 |     
18 |     with torch.no_grad():
19 |         Y_st, _ = G(target, source_emb)
20 |         Y_st = (Y_st.permute(0, 2, 3, 1)*0.5 + 0.5)*255
21 |         Y_st = Y_st[:, :, :, [2,1,0]].type(torch.uint8)
22 |         Y_st = Y_st.cpu().detach().numpy()    
23 |     return Y_st


--------------------------------------------------------------------------------
/ghost/utils/training/detector.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | import cv2
 4 | from PIL import Image
 5 | import torchvision.transforms as transforms
 6 | from AdaptiveWingLoss.utils.utils import get_preds_fromhm
 7 | from .image_processing import torch2image
 8 | 
 9 | 
10 | transforms_base = transforms.Compose([
11 |             transforms.ColorJitter(0.2, 0.2, 0.2, 0.01),
12 |             transforms.Resize((256, 256)),
13 |             transforms.ToTensor(),
14 |             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
15 |         ])
16 | 
17 | 
18 | def detect_landmarks(inputs, model_ft):
19 |     mean = torch.tensor([0.5, 0.5, 0.5]).unsqueeze(1).unsqueeze(2).to(inputs.device)
20 |     std = torch.tensor([0.5, 0.5, 0.5]).unsqueeze(1).unsqueeze(2).to(inputs.device)
21 |     inputs = (std * inputs) + mean
22 | 
23 |     outputs, boundary_channels = model_ft(inputs)    
24 |     pred_heatmap = outputs[-1][:, :-1, :, :].cpu() 
25 |     pred_landmarks, _ = get_preds_fromhm(pred_heatmap)
26 |     landmarks = pred_landmarks*4.0
27 |     eyes = torch.cat((landmarks[:,96,:], landmarks[:,97,:]), 1)
28 |     return eyes, pred_heatmap[:,96,:,:], pred_heatmap[:,97,:,:]
29 | 
30 | 
31 | def paint_eyes(images, eyes):
32 |     list_eyes = []
33 |     for i in range(len(images)):
34 |         mask = torch2image(images[i])
35 |         mask = cv2.cvtColor(mask, cv2.COLOR_BGR2RGB) 
36 |         
37 |         cv2.circle(mask, (int(eyes[i][0]),int(eyes[i][1])), radius=3, color=(0,255,255), thickness=-1)
38 |         cv2.circle(mask, (int(eyes[i][2]),int(eyes[i][3])), radius=3, color=(0,255,255), thickness=-1)
39 |         
40 |         mask = mask[:, :, ::-1]
41 |         mask = transforms_base(Image.fromarray(mask))
42 |         list_eyes.append(mask)
43 |     tensor_eyes = torch.stack(list_eyes)
44 |     return tensor_eyes


--------------------------------------------------------------------------------
/ghost/utils/training/image_processing.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | import numpy as np
 3 | from PIL import Image
 4 | 
 5 | import torch
 6 | import torchvision.transforms as transforms
 7 | import torch.nn.functional as F
 8 | 
 9 | 
10 | transformer_Arcface = transforms.Compose([
11 |         transforms.ToTensor(),
12 |         transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
13 |     ])
14 | 
15 | 
16 | def torch2image(torch_image: torch.tensor) -> np.ndarray:
17 |     batch = False
18 |     
19 |     if torch_image.dim() == 4:
20 |         torch_image = torch_image[:8]
21 |         batch = True
22 |     
23 |     device = torch_image.device
24 |     # mean = torch.tensor([0.485, 0.456, 0.406]).unsqueeze(1).unsqueeze(2)
25 |     # std = torch.tensor([0.229, 0.224, 0.225]).unsqueeze(1).unsqueeze(2)
26 |     mean = torch.tensor([0.5, 0.5, 0.5]).unsqueeze(1).unsqueeze(2).to(device)
27 |     std = torch.tensor([0.5, 0.5, 0.5]).unsqueeze(1).unsqueeze(2).to(device)
28 |     
29 |     denorm_image = (std * torch_image) + mean
30 |     
31 |     if batch:
32 |         denorm_image = denorm_image.permute(0, 2, 3, 1)
33 |     else:
34 |         denorm_image = denorm_image.permute(1, 2, 0)
35 |     
36 |     np_image = denorm_image.detach().cpu().numpy()
37 |     np_image = np.clip(np_image*255., 0, 255).astype(np.uint8)
38 |     
39 |     if batch:
40 |         return np.concatenate(np_image, axis=1)
41 |     else:
42 |         return np_image
43 | 
44 | 
45 | def make_image_list(images) -> np.ndarray:    
46 |     np_images = []
47 |     
48 |     for torch_image in images:
49 |         np_img = torch2image(torch_image)
50 |         np_images.append(np_img)
51 |     
52 |     return np.concatenate(np_images, axis=0)
53 | 
54 | 
55 | def read_torch_image(path: str) -> torch.tensor:
56 |     
57 |     image = cv2.imread(path)
58 |     image = cv2.resize(image, (256, 256))
59 |     image = Image.fromarray(image[:, :, ::-1])
60 |     image = transformer_Arcface(image)
61 |     image = image.view(-1, image.shape[0], image.shape[1], image.shape[2])
62 |     
63 |     return image
64 | 
65 | 
66 | def get_faceswap(source_path: str, target_path: str, 
67 |                  G: 'generator model', netArc: 'arcface model', 
68 |                  device: 'torch device') -> np.array:
69 |     source = read_torch_image(source_path)
70 |     source = source.to(device)
71 | 
72 |     embeds = netArc(F.interpolate(source, [112, 112], mode='bilinear', align_corners=False))
73 |     # embeds = F.normalize(embeds, p=2, dim=1)
74 | 
75 |     target = read_torch_image(target_path)
76 |     target = target.cuda()
77 | 
78 |     with torch.no_grad():
79 |         Yt, _ = G(target, embeds)
80 |         Yt = torch2image(Yt)
81 | 
82 |     source = torch2image(source)
83 |     target = torch2image(target)
84 | 
85 |     return np.concatenate((cv2.resize(source, (256, 256)), target, Yt), axis=1)  
86 |         


--------------------------------------------------------------------------------
/ghost/utils/training/losses.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | l1_loss = torch.nn.L1Loss()
 4 | l2_loss = torch.nn.MSELoss()
 5 | 
 6 | 
 7 | def hinge_loss(X, positive=True):
 8 |     if positive:
 9 |         return torch.relu(1-X)
10 |     else:
11 |         return torch.relu(X+1)
12 |     
13 |     
14 | def compute_generator_losses(G, Y, Xt, Xt_attr, Di, embed, ZY, eye_heatmaps, loss_adv_accumulated, 
15 |                              diff_person, same_person, args):
16 |     # adversarial loss
17 |     L_adv = 0.
18 |     for di in Di:
19 |         L_adv += hinge_loss(di[0], True).mean(dim=[1, 2, 3])
20 |     L_adv = torch.sum(L_adv * diff_person) / (diff_person.sum() + 1e-4)
21 | 
22 |     # id loss
23 |     L_id =(1 - torch.cosine_similarity(embed, ZY, dim=1)).mean()
24 | 
25 |     # attr loss
26 |     if args.optim_level == "O2" or args.optim_level == "O3":
27 |         Y_attr = G.get_attr(Y.type(torch.half))
28 |     else:
29 |         Y_attr = G.get_attr(Y)
30 |     
31 |     L_attr = 0
32 |     for i in range(len(Xt_attr)):
33 |         L_attr += torch.mean(torch.pow(Xt_attr[i] - Y_attr[i], 2).reshape(args.batch_size, -1), dim=1).mean()
34 |     L_attr /= 2.0
35 | 
36 |     # reconstruction loss
37 |     L_rec = torch.sum(0.5 * torch.mean(torch.pow(Y - Xt, 2).reshape(args.batch_size, -1), dim=1) * same_person) / (same_person.sum() + 1e-6)
38 |     
39 |     # l2 eyes loss
40 |     if args.eye_detector_loss:
41 |         Xt_heatmap_left, Xt_heatmap_right, Y_heatmap_left, Y_heatmap_right = eye_heatmaps
42 |         L_l2_eyes = l2_loss(Xt_heatmap_left, Y_heatmap_left) + l2_loss(Xt_heatmap_right, Y_heatmap_right)
43 |     else:
44 |         L_l2_eyes = 0
45 |         
46 |     # final loss of generator
47 |     lossG = args.weight_adv*L_adv + args.weight_attr*L_attr + args.weight_id*L_id + args.weight_rec*L_rec + args.weight_eyes*L_l2_eyes
48 |     loss_adv_accumulated = loss_adv_accumulated*0.98 + L_adv.item()*0.02
49 |     
50 |     return lossG, loss_adv_accumulated, L_adv, L_attr, L_id, L_rec, L_l2_eyes
51 | 
52 | 
53 | def compute_discriminator_loss(D, Y, Xs, diff_person):
54 |     # fake part
55 |     fake_D = D(Y.detach())
56 |     loss_fake = 0
57 |     for di in fake_D:
58 |         loss_fake += torch.sum(hinge_loss(di[0], False).mean(dim=[1, 2, 3]) * diff_person) / (diff_person.sum() + 1e-4)
59 | 
60 |     # ground truth part
61 |     true_D = D(Xs)
62 |     loss_true = 0
63 |     for di in true_D:
64 |         loss_true += torch.sum(hinge_loss(di[0], True).mean(dim=[1, 2, 3]) * diff_person) / (diff_person.sum() + 1e-4)
65 | 
66 |     lossD = 0.5*(loss_true.mean() + loss_fake.mean())
67 | 
68 |     return lossD


--------------------------------------------------------------------------------
/inference流程.txt:
--------------------------------------------------------------------------------
 1 | 1. 从原图中截取面部轮廓特征；
 2 | 2. 加载视频；
 3 |    + 获取帧率、视频的每一帧列表；
 4 | 3. 获取需要被换脸的目标脸（视频中需要被替换的脸）；
 5 | 4. inference；
 6 |    + 使用原始帧得到换面帧
 7 |    + param: 视频的每一帧列表、源脸、目标脸、人脸emb、人脸生成、人脸裁剪、是否传入目标脸照片、
 8 |    + 1）获取目标图像的Arcface emb；
 9 |    + 2）从原始帧和变换中获取裁剪的面；
10 |    + 3）normalize原图并emb；
11 |    + 4）enumerate frames list：
12 |         + 调整裁剪帧的大小，并获得vector，显示在哪些帧上有面；
13 |         + 有面的帧进行normalize；
14 |         + for 对batch_size目标图像应用换面模型；
15 |         + for 生成转换后的最终帧列表（有面的）；
16 |    + return：转换后的帧列表、需要裁剪的帧、需要裁剪的帧对应的转换；
17 | + 5. if image2video：
18 | 	    + for 根据帧合成视频；
19 | 	    + 从原视频中抽取audio并add；
20 | 
21 | 
22 | inVideo4，1min 4s
23 | batch_size=40，9044 M、total time 3min 7s
24 | batch_size=10，6438 M、total time 2min 25s
25 | 减小batch_size能降低显存，但是生成结果没有明显区别
26 | 
27 | 
28 | 
29 | 
30 | '''
31 | 显存需求
32 | 中间帧率低导致快速动作脸跟不上？
33 | 文件大小？生成分辨率、帧率一样
34 | 调下效果
35 | '''


--------------------------------------------------------------------------------
/tmp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WGS-note/face_swap/fd89df399d764dc3b6ea7b638adc57b2ae4442d7/tmp.jpg


--------------------------------------------------------------------------------