├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── configs ├── campus │ ├── prn32_cpn80x80x20.yaml │ └── prn64_cpn80x80x20.yaml ├── panoptic │ └── resnet50 │ │ ├── prn32_cpn48x48x12_960x512_cam5.yaml │ │ └── prn64_cpn80x80x20_960x512_cam5.yaml └── shelf │ ├── prn32_cpn48x48x12.yaml │ └── prn64_cpn80x80x20.yaml ├── data ├── CampusSeq1 │ ├── calibration_campus.json │ └── pred_campus_maskrcnn_hrnet_coco.pkl ├── Shelf │ ├── calibration_shelf.json │ └── pred_shelf_maskrcnn_hrnet_coco.pkl ├── panoptic.gif ├── panoptic2.gif └── panoptic_training_pose.pkl ├── lib ├── core │ ├── __init__.py │ ├── config.py │ ├── function.py │ ├── loss.py │ └── proposal.py ├── dataset │ ├── JointsDataset.py │ ├── __init__.py │ ├── campus.py │ ├── campus_synthetic.py │ ├── panoptic.py │ ├── shelf.py │ └── shelf_synthetic.py ├── models │ ├── __init__.py │ ├── cuboid_proposal_net.py │ ├── multi_person_posenet.py │ ├── pose_regression_net.py │ ├── pose_resnet.py │ ├── project_layer.py │ └── v2v_net.py └── utils │ ├── __init__.py │ ├── cameras.py │ ├── cameras_cpu.py │ ├── transforms.py │ ├── utils.py │ ├── vis.py │ └── zipreader.py ├── requirements.txt ├── run ├── _init_paths.py ├── train_3d.py └── validate_3d.py └── test ├── _init_paths.py └── evaluate.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VoxelPose 2 | 3 | 4 | 5 | This is the official implementation for: 6 | > [**VoxelPose: Towards Multi-Camera 3D Human Pose Estimation in Wild Environment**](https://arxiv.org/abs/2004.06239), 7 | > Hanyue Tu, Chunyu Wang, Wenjun Zeng 8 | > *ECCV 2020 (Oral) ([arXiv 2004.06239](https://arxiv.org/abs/2004.06239))* 9 | 10 | 11 | 12 | 13 | 14 | ## Installation 15 | 1. Clone this repo, and we'll call the directory that you cloned multiview-multiperson-pose as ${POSE_ROOT}. 16 | 2. Install dependencies. 17 | 18 | ## Data preparation 19 | 20 | ### Shelf/Campus datasets 21 | 1. Download the datasets from http://campar.in.tum.de/Chair/MultiHumanPose and extract them under `${POSE_ROOT}/data/Shelf` and `${POSE_ROOT}/data/CampusSeq1`, respectively. 22 | 23 | 2. 
We have converted the camera parameters into our format, and you can download them from this repository. They lie in `${POSE_ROOT}/data/Shelf/` and `${POSE_ROOT}/data/CampusSeq1/`, respectively. 24 | 25 | 3. Due to the limited and incomplete annotations of these two datasets, we do not train our model on them. Instead, we directly use a 2D pose estimator trained on COCO, and use independent 3D human poses from the Panoptic dataset to train our 3D model. These poses lie in `${POSE_ROOT}/data/panoptic_training_pose.pkl`. See our paper for more details. 26 | 27 | 4. For testing, we first estimate 2D poses and generate 2D heatmaps for these two datasets. The predicted poses can also be downloaded from this repository. They lie in `${POSE_ROOT}/data/Shelf/` and `${POSE_ROOT}/data/CampusSeq1/`, respectively. You can also use models trained on the COCO dataset (such as HigherHRNet) to generate 2D heatmaps directly. 28 | 29 | The directory tree should look like this: 30 | ``` 31 | ${POSE_ROOT} 32 | |-- data 33 | |-- Shelf 34 | | |-- Camera0 35 | | |-- ... 36 | | |-- Camera4 37 | | |-- actorsGT.mat 38 | | |-- calibration_shelf.json 39 | | |-- pred_shelf_maskrcnn_hrnet_coco.pkl 40 | |-- CampusSeq1 41 | | |-- Camera0 42 | | |-- Camera1 43 | | |-- Camera2 44 | | |-- actorsGT.mat 45 | | |-- calibration_campus.json 46 | | |-- pred_campus_maskrcnn_hrnet_coco.pkl 47 | |-- panoptic_training_pose.pkl 48 | ``` 49 | 50 | 51 | ### CMU Panoptic dataset 52 | 1. Download the dataset by following the instructions in [panoptic-toolbox](https://github.com/CMU-Perceptual-Computing-Lab/panoptic-toolbox) and extract it under `${POSE_ROOT}/data/panoptic-toolbox/data`. 53 | - You can download only the sequences you need. You can also download just a subset of camera views by specifying the number of views (HD_Video_Number) and changing the camera order in `./scripts/getData.sh`. The sequences and camera views used in our project are listed in our paper. 54 | - Note that we only use the HD videos, calibration data, and 3D body keypoints in our code. You can comment out the irrelevant parts, such as downloading the 3D face data, in `./scripts/getData.sh`. 55 | 2. Download the pretrained backbone model from [pretrained backbone](https://1drv.ms/u/s!AjX41AtnTHeTjn3H9PGSLcbSC0bl?e=cw7SQg) and place it here: `${POSE_ROOT}/models/pose_resnet50_panoptic.pth.tar` (ResNet-50 pretrained on the COCO dataset and finetuned jointly on the Panoptic dataset and MPII). 56 | 57 | The directory tree should look like this: 58 | ``` 59 | ${POSE_ROOT} 60 | |-- models 61 | | |-- pose_resnet50_panoptic.pth.tar 62 | |-- data 63 | |-- panoptic-toolbox 64 | |-- data 65 | |-- 160224_haggling1 66 | | |-- hdImgs 67 | | |-- hdvideos 68 | | |-- hdPose3d_stage1_coco19 69 | | |-- calibration_160224_haggling1.json 70 | |-- 160226_haggling1 71 | |-- ... 72 | ``` 73 | 74 | ## Training 75 | ### CMU Panoptic dataset 76 | 77 | Train and validate on the five selected camera views. You can specify the GPU devices and batch size per GPU in the config file. We trained our models on two GPUs. 78 | ``` 79 | python run/train_3d.py --cfg configs/panoptic/resnet50/prn64_cpn80x80x20_960x512_cam5.yaml 80 | ``` 81 | ### Shelf/Campus datasets 82 | ``` 83 | python run/train_3d.py --cfg configs/shelf/prn64_cpn80x80x20.yaml 84 | python run/train_3d.py --cfg configs/campus/prn64_cpn80x80x20.yaml 85 | ``` 86 | 87 | ## Evaluation 88 | ### CMU Panoptic dataset 89 | 90 | Evaluate the models.
It will print evaluation results to the screen./ 91 | ``` 92 | python test/evaluate.py --cfg configs/panoptic/resnet50/prn64_cpn80x80x20_960x512_cam5.yaml 93 | ``` 94 | ### Shelf/Campus datasets 95 | 96 | It will print the PCP results to the screen. 97 | ``` 98 | python test/evaluate.py --cfg configs/shelf/prn64_cpn80x80x20.yaml 99 | python test/evaluate.py --cfg configs/campus/prn64_cpn80x80x20.yaml 100 | ``` 101 | 102 | ## Citation 103 | If you use our code or models in your research, please cite with: 104 | ``` 105 | @inproceedings{voxelpose, 106 | author={Tu, Hanyue and Wang, Chunyu and Zeng, Wenjun}, 107 | title={VoxelPose: Towards Multi-Camera 3D Human Pose Estimation in Wild Environment}, 108 | booktitle = {European Conference on Computer Vision (ECCV)}, 109 | year = {2020} 110 | } 111 | ``` 112 | 113 | 114 | # Contributing 115 | 116 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 117 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 118 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 119 | 120 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 121 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 122 | provided by the bot. You will only need to do this once across all repos using our CLA. 123 | 124 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 125 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 126 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 127 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. 
Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /configs/campus/prn32_cpn80x80x20.yaml: -------------------------------------------------------------------------------- 1 | CUDNN: 2 | BENCHMARK: true 3 | DETERMINISTIC: false 4 | ENABLED: true 5 | BACKBONE_MODEL: '' 6 | MODEL: 'multi_person_posenet' 7 | DATA_DIR: '' 8 | GPUS: '0' 9 | OUTPUT_DIR: 'output' 10 | LOG_DIR: 'log' 11 | WORKERS: 4 12 | PRINT_FREQ: 100 13 | 14 | DATASET: 15 | COLOR_RGB: True 16 | TRAIN_DATASET: 'campus_synthetic' 17 | TEST_DATASET: 'campus' 18 | DATA_FORMAT: png 19 | DATA_AUGMENTATION: False 20 | FLIP: False 21 | ROOT: 'data/CampusSeq1' 22 | ROT_FACTOR: 45 23 | SCALE_FACTOR: 0.35 24 | TEST_SUBSET: 'validation' 25 | TRAIN_SUBSET: 'train' 26 | ROOTIDX: 27 | - 2 28 | - 3 29 | CAMERA_NUM: 3 30 | NETWORK: 31 | PRETRAINED_BACKBONE: '' 32 | PRETRAINED: '' # 'models/pytorch/imagenet/resnet50-19c8e357.pth' 33 | TARGET_TYPE: gaussian 34 | IMAGE_SIZE: 35 | - 800 36 | - 640 37 | HEATMAP_SIZE: 38 | - 200 39 | - 160 40 | SIGMA: 3 41 | NUM_JOINTS: 17 42 | USE_GT: False 43 | LOSS: 44 | USE_TARGET_WEIGHT: true 45 | TRAIN: 46 | BATCH_SIZE: 4 47 | SHUFFLE: true 48 | BEGIN_EPOCH: 0 49 | END_EPOCH: 30 50 | RESUME: true 51 | OPTIMIZER: adam 52 | LR: 0.0001 53 | TEST: 54 | MODEL_FILE: "model_best.pth.tar" 55 | BATCH_SIZE: 4 56 | DEBUG: 57 | DEBUG: true 58 | SAVE_HEATMAPS_GT: true 59 | SAVE_HEATMAPS_PRED: true 60 | MULTI_PERSON: 61 | SPACE_SIZE: 62 | - 12000.0 63 | - 12000.0 64 | - 2000.0 65 | SPACE_CENTER: 66 | - 3000.0 67 | - 4500.0 68 | - 1000.0 69 | INITIAL_CUBE_SIZE: 70 | - 80 71 | - 80 72 | - 20 73 | MAX_PEOPLE_NUM: 10 74 | THRESHOLD: 0.1 75 | PICT_STRUCT: 76 | GRID_SIZE: 77 | - 2000.0 78 | - 2000.0 79 | - 2000.0 80 | CUBE_SIZE: 81 | - 32 82 | - 32 83 | - 32 84 | -------------------------------------------------------------------------------- /configs/campus/prn64_cpn80x80x20.yaml: -------------------------------------------------------------------------------- 1 | CUDNN: 2 | BENCHMARK: true 3 | DETERMINISTIC: false 4 | ENABLED: true 5 | BACKBONE_MODEL: '' 6 | MODEL: 'multi_person_posenet' 7 | DATA_DIR: '' 8 | GPUS: '0' 9 | OUTPUT_DIR: 'output' 10 | LOG_DIR: 'log' 11 | WORKERS: 4 12 | PRINT_FREQ: 
100 13 | 14 | DATASET: 15 | COLOR_RGB: True 16 | TRAIN_DATASET: 'campus_synthetic' 17 | TEST_DATASET: 'campus' 18 | DATA_FORMAT: png 19 | DATA_AUGMENTATION: False 20 | FLIP: False 21 | ROOT: 'data/CampusSeq1' 22 | ROT_FACTOR: 45 23 | SCALE_FACTOR: 0.35 24 | TEST_SUBSET: 'validation' 25 | TRAIN_SUBSET: 'train' 26 | ROOTIDX: 27 | - 2 28 | - 3 29 | CAMERA_NUM: 3 30 | NETWORK: 31 | PRETRAINED_BACKBONE: '' 32 | PRETRAINED: '' # 'models/pytorch/imagenet/resnet50-19c8e357.pth' 33 | TARGET_TYPE: gaussian 34 | IMAGE_SIZE: 35 | - 800 36 | - 640 37 | HEATMAP_SIZE: 38 | - 200 39 | - 160 40 | SIGMA: 3 41 | NUM_JOINTS: 17 42 | USE_GT: False 43 | LOSS: 44 | USE_TARGET_WEIGHT: true 45 | TRAIN: 46 | BATCH_SIZE: 1 47 | SHUFFLE: true 48 | BEGIN_EPOCH: 0 49 | END_EPOCH: 30 50 | RESUME: true 51 | OPTIMIZER: adam 52 | LR: 0.0001 53 | TEST: 54 | MODEL_FILE: "model_best.pth.tar" 55 | BATCH_SIZE: 4 56 | DEBUG: 57 | DEBUG: true 58 | SAVE_HEATMAPS_GT: true 59 | SAVE_HEATMAPS_PRED: true 60 | MULTI_PERSON: 61 | SPACE_SIZE: 62 | - 12000.0 63 | - 12000.0 64 | - 2000.0 65 | SPACE_CENTER: 66 | - 3000.0 67 | - 4500.0 68 | - 1000.0 69 | INITIAL_CUBE_SIZE: 70 | - 80 71 | - 80 72 | - 20 73 | MAX_PEOPLE_NUM: 10 74 | THRESHOLD: 0.1 75 | PICT_STRUCT: 76 | GRID_SIZE: 77 | - 2000.0 78 | - 2000.0 79 | - 2000.0 80 | CUBE_SIZE: 81 | - 64 82 | - 64 83 | - 64 84 | -------------------------------------------------------------------------------- /configs/panoptic/resnet50/prn32_cpn48x48x12_960x512_cam5.yaml: -------------------------------------------------------------------------------- 1 | CUDNN: 2 | BENCHMARK: true 3 | DETERMINISTIC: false 4 | ENABLED: true 5 | BACKBONE_MODEL: 'pose_resnet' 6 | MODEL: 'multi_person_posenet' 7 | DATA_DIR: '' 8 | GPUS: '0' 9 | OUTPUT_DIR: 'output' 10 | LOG_DIR: 'log' 11 | WORKERS: 4 12 | PRINT_FREQ: 100 13 | 14 | DATASET: 15 | COLOR_RGB: True 16 | TRAIN_DATASET: 'panoptic' 17 | TEST_DATASET: 'panoptic' 18 | DATA_FORMAT: jpg 19 | DATA_AUGMENTATION: False 20 | FLIP: False 21 | ROOT: 'data/panoptic-toolbox/data/' # 'data/panoptic/' 22 | ROT_FACTOR: 45 23 | SCALE_FACTOR: 0.35 24 | TEST_SUBSET: 'validation' 25 | TRAIN_SUBSET: 'train' 26 | ROOTIDX: 2 27 | CAMERA_NUM: 5 28 | NETWORK: 29 | PRETRAINED_BACKBONE: "models/pose_resnet50_panoptic.pth.tar" 30 | PRETRAINED: '' # 'models/pytorch/imagenet/resnet50-19c8e357.pth' 31 | TARGET_TYPE: gaussian 32 | IMAGE_SIZE: 33 | - 960 34 | - 512 35 | HEATMAP_SIZE: 36 | - 240 37 | - 128 38 | SIGMA: 3 39 | NUM_JOINTS: 15 40 | USE_GT: False 41 | POSE_RESNET: 42 | FINAL_CONV_KERNEL: 1 43 | DECONV_WITH_BIAS: False 44 | NUM_DECONV_LAYERS: 3 45 | NUM_DECONV_FILTERS: 46 | - 256 47 | - 256 48 | - 256 49 | NUM_DECONV_KERNELS: 50 | - 4 51 | - 4 52 | - 4 53 | NUM_LAYERS: 50 54 | LOSS: 55 | USE_TARGET_WEIGHT: true 56 | TRAIN: 57 | BATCH_SIZE: 2 58 | SHUFFLE: true 59 | BEGIN_EPOCH: 0 60 | END_EPOCH: 10 61 | RESUME: true 62 | OPTIMIZER: adam 63 | LR: 0.0001 64 | TEST: 65 | MODEL_FILE: 'model_best.pth.tar' 66 | BATCH_SIZE: 4 67 | DEBUG: 68 | DEBUG: true 69 | SAVE_HEATMAPS_GT: true 70 | SAVE_HEATMAPS_PRED: true 71 | MULTI_PERSON: 72 | SPACE_SIZE: 73 | - 8000.0 74 | - 8000.0 75 | - 2000.0 76 | SPACE_CENTER: 77 | - 0.0 # 120.0 78 | - -500.0 # -600.0 79 | - 800.0 80 | INITIAL_CUBE_SIZE: 81 | - 48 82 | - 48 83 | - 12 84 | MAX_PEOPLE_NUM: 10 85 | THRESHOLD: 0.3 86 | PICT_STRUCT: 87 | GRID_SIZE: 88 | - 2000.0 89 | - 2000.0 90 | - 2000.0 91 | CUBE_SIZE: 92 | - 32 93 | - 32 94 | - 32 95 | -------------------------------------------------------------------------------- 
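The `cpn...`/`prn...` parts of these config names mirror `MULTI_PERSON.INITIAL_CUBE_SIZE` (the coarse grid scanned by the cuboid proposal network) and `PICT_STRUCT.CUBE_SIZE` (the per-person grid used by the pose regression network). A quick sketch of the voxel resolution implied by the settings above — the helper below is illustrative and not part of this repository:

```
# Illustrative only: edge length (mm) of one voxel along each axis.
import numpy as np

def voxel_size_mm(space_size_mm, cube_size):
    return np.asarray(space_size_mm, dtype=float) / np.asarray(cube_size, dtype=float)

# prn32_cpn48x48x12_960x512_cam5.yaml
print(voxel_size_mm([8000.0, 8000.0, 2000.0], [48, 48, 12]))  # ~166.7 mm proposal voxels
print(voxel_size_mm([2000.0, 2000.0, 2000.0], [32, 32, 32]))  # 62.5 mm per-person voxels
```

The prn64_cpn80x80x20 variant in the next file keeps the same capture space but uses finer grids: 100 mm proposal voxels and 31.25 mm per-person voxels.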
/configs/panoptic/resnet50/prn64_cpn80x80x20_960x512_cam5.yaml: -------------------------------------------------------------------------------- 1 | CUDNN: 2 | BENCHMARK: true 3 | DETERMINISTIC: false 4 | ENABLED: true 5 | BACKBONE_MODEL: 'pose_resnet' 6 | MODEL: 'multi_person_posenet' 7 | DATA_DIR: '' 8 | GPUS: '0' 9 | OUTPUT_DIR: 'output' 10 | LOG_DIR: 'log' 11 | WORKERS: 4 12 | PRINT_FREQ: 100 13 | 14 | DATASET: 15 | COLOR_RGB: True 16 | TRAIN_DATASET: 'panoptic' 17 | TEST_DATASET: 'panoptic' 18 | DATA_FORMAT: jpg 19 | DATA_AUGMENTATION: False 20 | FLIP: False 21 | ROOT: 'data/panoptic-toolbox/data/' # 'data/panoptic/' 22 | ROT_FACTOR: 45 23 | SCALE_FACTOR: 0.35 24 | TEST_SUBSET: 'validation' 25 | TRAIN_SUBSET: 'train' 26 | ROOTIDX: 2 27 | CAMERA_NUM: 5 28 | NETWORK: 29 | PRETRAINED_BACKBONE: "models/pose_resnet50_panoptic.pth.tar" 30 | PRETRAINED: '' # 'models/pytorch/imagenet/resnet50-19c8e357.pth' 31 | TARGET_TYPE: gaussian 32 | IMAGE_SIZE: 33 | - 960 34 | - 512 35 | HEATMAP_SIZE: 36 | - 240 37 | - 128 38 | SIGMA: 3 39 | NUM_JOINTS: 15 40 | USE_GT: False 41 | POSE_RESNET: 42 | FINAL_CONV_KERNEL: 1 43 | DECONV_WITH_BIAS: False 44 | NUM_DECONV_LAYERS: 3 45 | NUM_DECONV_FILTERS: 46 | - 256 47 | - 256 48 | - 256 49 | NUM_DECONV_KERNELS: 50 | - 4 51 | - 4 52 | - 4 53 | NUM_LAYERS: 50 54 | LOSS: 55 | USE_TARGET_WEIGHT: true 56 | TRAIN: 57 | BATCH_SIZE: 1 58 | SHUFFLE: true 59 | BEGIN_EPOCH: 0 60 | END_EPOCH: 10 61 | RESUME: true 62 | OPTIMIZER: adam 63 | LR: 0.0001 64 | TEST: 65 | MODEL_FILE: 'model_best.pth.tar' 66 | BATCH_SIZE: 4 67 | DEBUG: 68 | DEBUG: true 69 | SAVE_HEATMAPS_GT: true 70 | SAVE_HEATMAPS_PRED: true 71 | MULTI_PERSON: 72 | SPACE_SIZE: 73 | - 8000.0 74 | - 8000.0 75 | - 2000.0 76 | SPACE_CENTER: 77 | - 0.0 # 120.0 78 | - -500.0 # -600.0 79 | - 800.0 80 | INITIAL_CUBE_SIZE: 81 | - 80 82 | - 80 83 | - 20 84 | MAX_PEOPLE_NUM: 10 85 | THRESHOLD: 0.3 86 | PICT_STRUCT: 87 | GRID_SIZE: 88 | - 2000.0 89 | - 2000.0 90 | - 2000.0 91 | CUBE_SIZE: 92 | - 64 93 | - 64 94 | - 64 95 | -------------------------------------------------------------------------------- /configs/shelf/prn32_cpn48x48x12.yaml: -------------------------------------------------------------------------------- 1 | CUDNN: 2 | BENCHMARK: true 3 | DETERMINISTIC: false 4 | ENABLED: true 5 | BACKBONE_MODEL: '' 6 | MODEL: 'multi_person_posenet' 7 | DATA_DIR: '' 8 | GPUS: '0' 9 | OUTPUT_DIR: 'output' 10 | LOG_DIR: 'log' 11 | WORKERS: 4 12 | PRINT_FREQ: 100 13 | 14 | DATASET: 15 | COLOR_RGB: True 16 | TRAIN_DATASET: 'shelf_synthetic' 17 | TEST_DATASET: 'shelf' 18 | DATA_FORMAT: jpg 19 | DATA_AUGMENTATION: False 20 | FLIP: False 21 | ROOT: 'data/Shelf' 22 | ROT_FACTOR: 45 23 | SCALE_FACTOR: 0.35 24 | TEST_SUBSET: 'validation' 25 | TRAIN_SUBSET: 'train' 26 | ROOTIDX: 27 | - 2 28 | - 3 29 | CAMERA_NUM: 5 30 | NETWORK: 31 | PRETRAINED_BACKBONE: '' 32 | PRETRAINED: '' # 'models/pytorch/imagenet/resnet50-19c8e357.pth' 33 | TARGET_TYPE: gaussian 34 | IMAGE_SIZE: 35 | - 800 36 | - 608 37 | HEATMAP_SIZE: 38 | - 200 39 | - 152 40 | SIGMA: 3 41 | NUM_JOINTS: 17 42 | USE_GT: False 43 | LOSS: 44 | USE_TARGET_WEIGHT: true 45 | TRAIN: 46 | BATCH_SIZE: 2 47 | SHUFFLE: true 48 | BEGIN_EPOCH: 0 49 | END_EPOCH: 30 50 | RESUME: true 51 | OPTIMIZER: adam 52 | LR: 0.0001 53 | TEST: 54 | MODEL_FILE: "model_best.pth.tar" 55 | BATCH_SIZE: 4 56 | DEBUG: 57 | DEBUG: true 58 | SAVE_HEATMAPS_GT: true 59 | SAVE_HEATMAPS_PRED: true 60 | MULTI_PERSON: 61 | SPACE_SIZE: 62 | - 8000.0 63 | - 8000.0 64 | - 2000.0 65 | SPACE_CENTER: 66 | - 450.0 # 120.0 67 | - 
-320.0 # -600.0 68 | - 800.0 69 | INITIAL_CUBE_SIZE: 70 | - 48 71 | - 48 72 | - 12 73 | MAX_PEOPLE_NUM: 10 74 | THRESHOLD: 0.1 75 | PICT_STRUCT: 76 | GRID_SIZE: 77 | - 2000.0 78 | - 2000.0 79 | - 2000.0 80 | CUBE_SIZE: 81 | - 32 82 | - 32 83 | - 32 84 | -------------------------------------------------------------------------------- /configs/shelf/prn64_cpn80x80x20.yaml: -------------------------------------------------------------------------------- 1 | CUDNN: 2 | BENCHMARK: true 3 | DETERMINISTIC: false 4 | ENABLED: true 5 | BACKBONE_MODEL: '' 6 | MODEL: 'multi_person_posenet' 7 | DATA_DIR: '' 8 | GPUS: '0' 9 | OUTPUT_DIR: 'output' 10 | LOG_DIR: 'log' 11 | WORKERS: 4 12 | PRINT_FREQ: 100 13 | 14 | DATASET: 15 | COLOR_RGB: True 16 | TRAIN_DATASET: 'shelf_synthetic' 17 | TEST_DATASET: 'shelf' 18 | DATA_FORMAT: jpg 19 | DATA_AUGMENTATION: False 20 | FLIP: False 21 | ROOT: 'data/Shelf' # 'data/panoptic/' 22 | ROT_FACTOR: 45 23 | SCALE_FACTOR: 0.35 24 | TEST_SUBSET: 'validation' 25 | TRAIN_SUBSET: 'train' 26 | ROOTIDX: 27 | - 2 28 | - 3 29 | CAMERA_NUM: 5 30 | NETWORK: 31 | PRETRAINED_BACKBONE: '' 32 | PRETRAINED: '' # 'models/pytorch/imagenet/resnet50-19c8e357.pth' 33 | TARGET_TYPE: gaussian 34 | IMAGE_SIZE: 35 | - 800 36 | - 608 37 | HEATMAP_SIZE: 38 | - 200 39 | - 152 40 | SIGMA: 3 41 | NUM_JOINTS: 17 42 | USE_GT: False 43 | LOSS: 44 | USE_TARGET_WEIGHT: true 45 | TRAIN: 46 | BATCH_SIZE: 1 47 | SHUFFLE: true 48 | BEGIN_EPOCH: 0 49 | END_EPOCH: 30 50 | RESUME: true 51 | OPTIMIZER: adam 52 | LR: 0.0001 53 | TEST: 54 | MODEL_FILE: "model_best.pth.tar" 55 | BATCH_SIZE: 4 56 | DEBUG: 57 | DEBUG: true 58 | SAVE_HEATMAPS_GT: true 59 | SAVE_HEATMAPS_PRED: true 60 | MULTI_PERSON: 61 | SPACE_SIZE: 62 | - 8000.0 63 | - 8000.0 64 | - 2000.0 65 | SPACE_CENTER: 66 | - 450.0 # 120.0 67 | - -320.0 # -600.0 68 | - 800.0 69 | INITIAL_CUBE_SIZE: 70 | - 80 71 | - 80 72 | - 20 73 | MAX_PEOPLE_NUM: 10 74 | THRESHOLD: 0.1 75 | PICT_STRUCT: 76 | GRID_SIZE: 77 | - 2000.0 78 | - 2000.0 79 | - 2000.0 80 | CUBE_SIZE: 81 | - 64 82 | - 64 83 | - 64 84 | -------------------------------------------------------------------------------- /data/CampusSeq1/calibration_campus.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "R": [ 4 | [ 5 | 0.9998819135498813, 6 | -0.007627303394110196, 7 | -0.013341034396255802 8 | ], 9 | [ 10 | -0.01412240122676837, 11 | -0.11375390190151916, 12 | -0.9934085803866252 13 | ], 14 | [ 15 | 0.00605943391894462, 16 | 0.9934796797343738, 17 | -0.11384818494586636 18 | ] 19 | ], 20 | "T": [ 21 | [ 22 | 1774.8953318252247 23 | ], 24 | [ 25 | -5051.695948238737 26 | ], 27 | [ 28 | 1923.3559877015355 29 | ] 30 | ], 31 | "fx": 437.9852173913044, 32 | "fy": 437.9852173913044, 33 | "cx": 185.3596, 34 | "cy": 139.2537, 35 | "k": [ 36 | [ 37 | 0.0 38 | ], 39 | [ 40 | 0.0 41 | ], 42 | [ 43 | 0.0 44 | ] 45 | ], 46 | "p": [ 47 | [ 48 | 0.0 49 | ], 50 | [ 51 | 0.0 52 | ] 53 | ] 54 | }, 55 | "1": { 56 | "R": [ 57 | [ 58 | -0.04633107785835382, 59 | -0.9988140384937536, 60 | 0.014964883303310195 61 | ], 62 | [ 63 | -0.13065076504992335, 64 | -0.008793265243184023, 65 | -0.9913894573164639 66 | ], 67 | [ 68 | 0.9903452977706073, 69 | -0.04788731558734052, 70 | -0.1300884168152014 71 | ] 72 | ], 73 | "T": [ 74 | [ 75 | -6240.579909342256 76 | ], 77 | [ 78 | 5247.348264374987 79 | ], 80 | [ 81 | 1947.3802148598609 82 | ] 83 | ], 84 | "fx": 430.03326086956525, 85 | "fy": 430.03326086956525, 86 | "cx": 184.0583, 87 | "cy": 130.7467, 88 | "k": [ 89 | [ 90 | 
0.0 91 | ], 92 | [ 93 | 0.0 94 | ], 95 | [ 96 | 0.0 97 | ] 98 | ], 99 | "p": [ 100 | [ 101 | 0.0 102 | ], 103 | [ 104 | 0.0 105 | ] 106 | ] 107 | }, 108 | "2": { 109 | "R": [ 110 | [ 111 | 0.5386991962445586, 112 | 0.8424723621738047, 113 | -0.006595069276080057 114 | ], 115 | [ 116 | 0.10782367722838201, 117 | -0.07670471706694504, 118 | -0.9912065581949252 119 | ], 120 | [ 121 | -0.835570003407504, 122 | 0.5332510715910186, 123 | -0.13215923748499042 124 | ] 125 | ], 126 | "T": [ 127 | [ 128 | 11943.56106545541 129 | ], 130 | [ 131 | -1803.8527374133198 132 | ], 133 | [ 134 | 1973.3939116534714 135 | ] 136 | ], 137 | "fx": 700.9856521739131, 138 | "fy": 700.9856521739131, 139 | "cx": 167.59475, 140 | "cy": 142.0545, 141 | "k": [ 142 | [ 143 | 0.0 144 | ], 145 | [ 146 | 0.0 147 | ], 148 | [ 149 | 0.0 150 | ] 151 | ], 152 | "p": [ 153 | [ 154 | 0.0 155 | ], 156 | [ 157 | 0.0 158 | ] 159 | ] 160 | } 161 | } -------------------------------------------------------------------------------- /data/CampusSeq1/pred_campus_maskrcnn_hrnet_coco.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/voxelpose-pytorch/9ef5d407a597c9647b2c8f6c0a246b725a87a054/data/CampusSeq1/pred_campus_maskrcnn_hrnet_coco.pkl -------------------------------------------------------------------------------- /data/Shelf/calibration_shelf.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "k": [ 4 | [ 5 | 0.0 6 | ], 7 | [ 8 | 0.0 9 | ], 10 | [ 11 | 0.0 12 | ] 13 | ], 14 | "p": [ 15 | [ 16 | 0.0 17 | ], 18 | [ 19 | 0.0 20 | ] 21 | ], 22 | "R": [ 23 | [ 24 | 0.650977, 25 | -0.758717, 26 | 0.024027 27 | ], 28 | [ 29 | -0.018862, 30 | -0.04781, 31 | -0.998678 32 | ], 33 | [ 34 | 0.758863, 35 | 0.649664, 36 | -0.045434 37 | ] 38 | ], 39 | "T": [ 40 | [ 41 | -1586.4496077989998 42 | ], 43 | [ 44 | -2109.46905869 45 | ], 46 | [ 47 | 1104.209800652 48 | ] 49 | ], 50 | "fx": 1063.512085, 51 | "fy": 1071.863647, 52 | "cx": 511.738251, 53 | "cy": 350.088287 54 | }, 55 | "1": { 56 | "k": [ 57 | [ 58 | 0.0 59 | ], 60 | [ 61 | 0.0 62 | ], 63 | [ 64 | 0.0 65 | ] 66 | ], 67 | "p": [ 68 | [ 69 | 0.0 70 | ], 71 | [ 72 | 0.0 73 | ] 74 | ], 75 | "R": [ 76 | [ 77 | -0.016771, 78 | -0.999835, 79 | 0.006926 80 | ], 81 | [ 82 | -0.029435, 83 | -0.006431, 84 | -0.999546 85 | ], 86 | [ 87 | 0.999426, 88 | -0.016967, 89 | -0.029322 90 | ] 91 | ], 92 | "T": [ 93 | [ 94 | -3512.391424833 95 | ], 96 | [ 97 | 311.47771461800005 98 | ], 99 | [ 100 | 964.5481307480001 101 | ] 102 | ], 103 | "fx": 1097.697754, 104 | "fy": 1086.668457, 105 | "cx": 521.652161, 106 | "cy": 376.587067 107 | }, 108 | "2": { 109 | "k": [ 110 | [ 111 | 0.0 112 | ], 113 | [ 114 | 0.0 115 | ], 116 | [ 117 | 0.0 118 | ] 119 | ], 120 | "p": [ 121 | [ 122 | 0.0 123 | ], 124 | [ 125 | 0.0 126 | ] 127 | ], 128 | "R": [ 129 | [ 130 | -0.789986, 131 | -0.610527, 132 | 0.05638 133 | ], 134 | [ 135 | -0.370413, 136 | 0.401962, 137 | -0.837389 138 | ], 139 | [ 140 | 0.488586, 141 | -0.68241, 142 | -0.543691 143 | ] 144 | ], 145 | "T": [ 146 | [ 147 | -1420.944211509 148 | ], 149 | [ 150 | 2546.574076866 151 | ], 152 | [ 153 | 2688.8728944060003 154 | ] 155 | ], 156 | "fx": 1130.065552, 157 | "fy": 1112.470337, 158 | "cx": 566.884338, 159 | "cy": 375.212708 160 | }, 161 | "3": { 162 | "k": [ 163 | [ 164 | 0.0 165 | ], 166 | [ 167 | 0.0 168 | ], 169 | [ 170 | 0.0 171 | ] 172 | ], 173 | "p": [ 174 | [ 175 | 0.0 176 | ], 177 | [ 178 | 0.0 179 | ] 180 | ], 181 | "R": 
[ 182 | [ 183 | -0.970568, 184 | 0.235647, 185 | -0.049676 186 | ], 187 | [ 188 | 0.09763, 189 | 0.196438, 190 | -0.975644 191 | ], 192 | [ 193 | -0.22015, 194 | -0.951779, 195 | -0.213663 196 | ] 197 | ], 198 | "T": [ 199 | [ 200 | 963.489306486 201 | ], 202 | [ 203 | 3408.674914882 204 | ], 205 | [ 206 | 1422.035001899 207 | ] 208 | ], 209 | "fx": 1056.162598, 210 | "fy": 1059.639648, 211 | "cx": 552.43573, 212 | "cy": 393.180389 213 | }, 214 | "4": { 215 | "k": [ 216 | [ 217 | 0.0 218 | ], 219 | [ 220 | 0.0 221 | ], 222 | [ 223 | 0.0 224 | ] 225 | ], 226 | "p": [ 227 | [ 228 | 0.0 229 | ], 230 | [ 231 | 0.0 232 | ] 233 | ], 234 | "R": [ 235 | [ 236 | -0.194109, 237 | 0.980554, 238 | -0.028888 239 | ], 240 | [ 241 | 0.233045, 242 | 0.017488, 243 | -0.972309 244 | ], 245 | [ 246 | -0.952896, 247 | -0.195466, 248 | -0.231908 249 | ] 250 | ], 251 | "T": [ 252 | [ 253 | 3832.020978729 254 | ], 255 | [ 256 | 273.55271850000014 257 | ], 258 | [ 259 | 1439.4616998990002 260 | ] 261 | ], 262 | "fx": 1089.654175, 263 | "fy": 1080.99939, 264 | "cx": 498.32962, 265 | "cy": 359.514832 266 | } 267 | } -------------------------------------------------------------------------------- /data/Shelf/pred_shelf_maskrcnn_hrnet_coco.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/voxelpose-pytorch/9ef5d407a597c9647b2c8f6c0a246b725a87a054/data/Shelf/pred_shelf_maskrcnn_hrnet_coco.pkl -------------------------------------------------------------------------------- /data/panoptic.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/voxelpose-pytorch/9ef5d407a597c9647b2c8f6c0a246b725a87a054/data/panoptic.gif -------------------------------------------------------------------------------- /data/panoptic2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/voxelpose-pytorch/9ef5d407a597c9647b2c8f6c0a246b725a87a054/data/panoptic2.gif -------------------------------------------------------------------------------- /data/panoptic_training_pose.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/voxelpose-pytorch/9ef5d407a597c9647b2c8f6c0a246b725a87a054/data/panoptic_training_pose.pkl -------------------------------------------------------------------------------- /lib/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/voxelpose-pytorch/9ef5d407a597c9647b2c8f6c0a246b725a87a054/lib/core/__init__.py -------------------------------------------------------------------------------- /lib/core/config.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
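# This module defines the global default configuration as an EasyDict named `config`
# (cuDNN, network, dataset, training, testing, and multi-person voxel-space settings).
# update_config() merges an experiment YAML file over these defaults and raises a
# ValueError for keys that do not exist here. Illustrative usage (assuming lib/ is on
# sys.path, which run/_init_paths.py takes care of):
#     from core.config import config, update_config
#     update_config('configs/shelf/prn64_cpn80x80x20.yaml')
#     print(config.MULTI_PERSON.INITIAL_CUBE_SIZE)  # [80, 80, 20]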
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os 11 | import yaml 12 | 13 | import numpy as np 14 | from easydict import EasyDict as edict 15 | 16 | config = edict() 17 | 18 | config.OUTPUT_DIR = 'output' 19 | config.LOG_DIR = 'log' 20 | config.DATA_DIR = '' 21 | config.BACKBONE_MODEL = 'pose_resnet' 22 | config.MODEL = 'multi_person_posenet' 23 | config.GPUS = '0,1' 24 | config.WORKERS = 8 25 | config.PRINT_FREQ = 100 26 | 27 | # higherhrnet definition 28 | config.MODEL_EXTRA = edict() 29 | config.MODEL_EXTRA.PRETRAINED_LAYERS = ['*'] 30 | config.MODEL_EXTRA.FINAL_CONV_KERNEL = 1 31 | config.MODEL_EXTRA.STEM_INPLANES = 64 32 | 33 | config.MODEL_EXTRA.STAGE2 = edict() 34 | config.MODEL_EXTRA.STAGE2.NUM_MODULES = 1 35 | config.MODEL_EXTRA.STAGE2.NUM_BRANCHES= 2 36 | config.MODEL_EXTRA.STAGE2.BLOCK = 'BASIC' 37 | config.MODEL_EXTRA.STAGE2.NUM_BLOCKS = [4, 4] 38 | config.MODEL_EXTRA.STAGE2.NUM_CHANNELS = [48, 96] 39 | config.MODEL_EXTRA.STAGE2.FUSE_METHOD = 'SUM' 40 | 41 | config.MODEL_EXTRA.STAGE3 = edict() 42 | config.MODEL_EXTRA.STAGE3.NUM_MODULES = 4 43 | config.MODEL_EXTRA.STAGE3.NUM_BRANCHES = 3 44 | config.MODEL_EXTRA.STAGE3.BLOCK = 'BASIC' 45 | config.MODEL_EXTRA.STAGE3.NUM_BLOCKS = [4, 4, 4] 46 | config.MODEL_EXTRA.STAGE3.NUM_CHANNELS = [48, 96, 192] 47 | config.MODEL_EXTRA.STAGE3.FUSE_METHOD = 'SUM' 48 | 49 | config.MODEL_EXTRA.STAGE4 = edict() 50 | config.MODEL_EXTRA.STAGE4.NUM_MODULES = 3 51 | config.MODEL_EXTRA.STAGE4.NUM_BRANCHES = 4 52 | config.MODEL_EXTRA.STAGE4.BLOCK = 'BASIC' 53 | config.MODEL_EXTRA.STAGE4.NUM_BLOCKS = [4, 4, 4, 4] 54 | config.MODEL_EXTRA.STAGE4.NUM_CHANNELS = [48, 96, 192, 384] 55 | config.MODEL_EXTRA.STAGE4.FUSE_METHOD = 'SUM' 56 | 57 | config.MODEL_EXTRA.DECONV = edict() 58 | config.MODEL_EXTRA.DECONV.NUM_DECONVS = 1 59 | config.MODEL_EXTRA.DECONV.NUM_CHANNELS = 32 60 | config.MODEL_EXTRA.DECONV.KERNEL_SIZE = 4 61 | config.MODEL_EXTRA.DECONV.NUM_BASIC_BLOCKS = 4 62 | config.MODEL_EXTRA.DECONV.CAT_OUTPUT = True 63 | 64 | # Cudnn related params 65 | config.CUDNN = edict() 66 | config.CUDNN.BENCHMARK = True 67 | config.CUDNN.DETERMINISTIC = False 68 | config.CUDNN.ENABLED = True 69 | 70 | # common params for NETWORK 71 | config.NETWORK = edict() 72 | config.NETWORK.PRETRAINED = 'models/pytorch/imagenet/resnet50-19c8e357.pth' 73 | config.NETWORK.PRETRAINED_BACKBONE = '' 74 | config.NETWORK.NUM_JOINTS = 20 75 | config.NETWORK.INPUT_SIZE = 512 76 | config.NETWORK.HEATMAP_SIZE = np.array([80, 80]) 77 | config.NETWORK.IMAGE_SIZE = np.array([320, 320]) 78 | config.NETWORK.SIGMA = 2 79 | config.NETWORK.TARGET_TYPE = 'gaussian' 80 | config.NETWORK.AGGRE = True 81 | config.NETWORK.USE_GT = False 82 | config.NETWORK.BETA = 100.0 83 | 84 | # pose_resnet related params 85 | config.POSE_RESNET = edict() 86 | config.POSE_RESNET.NUM_LAYERS = 50 87 | config.POSE_RESNET.DECONV_WITH_BIAS = False 88 | config.POSE_RESNET.NUM_DECONV_LAYERS = 3 89 | config.POSE_RESNET.NUM_DECONV_FILTERS = [256, 256, 256] 90 | config.POSE_RESNET.NUM_DECONV_KERNELS = [4, 4, 4] 91 | config.POSE_RESNET.FINAL_CONV_KERNEL = 1 92 | 93 | config.LOSS = edict() 94 | config.LOSS.USE_TARGET_WEIGHT = True 95 | config.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False 96 | 97 | # DATASET related params 98 | config.DATASET = edict() 99 | config.DATASET.ROOT = '../data/h36m/' 100 | config.DATASET.TRAIN_DATASET = 'mixed_dataset' 101 | 
config.DATASET.TEST_DATASET = 'multi_view_h36m' 102 | config.DATASET.TRAIN_SUBSET = 'train' 103 | config.DATASET.TEST_SUBSET = 'validation' 104 | config.DATASET.ROOTIDX = 2 105 | config.DATASET.DATA_FORMAT = 'jpg' 106 | config.DATASET.BBOX = 2000 107 | config.DATASET.CROP = True 108 | config.DATASET.COLOR_RGB = False 109 | config.DATASET.FLIP = True 110 | config.DATASET.DATA_AUGMENTATION = True 111 | config.DATASET.CAMERA_NUM = 5 112 | 113 | # training data augmentation 114 | config.DATASET.SCALE_FACTOR = 0 115 | config.DATASET.ROT_FACTOR = 0 116 | 117 | # train 118 | config.TRAIN = edict() 119 | config.TRAIN.LR_FACTOR = 0.1 120 | config.TRAIN.LR_STEP = [90, 110] 121 | config.TRAIN.LR = 0.001 122 | 123 | config.TRAIN.OPTIMIZER = 'adam' 124 | config.TRAIN.MOMENTUM = 0.9 125 | config.TRAIN.WD = 0.0001 126 | config.TRAIN.NESTEROV = False 127 | config.TRAIN.GAMMA1 = 0.99 128 | config.TRAIN.GAMMA2 = 0.0 129 | 130 | config.TRAIN.BEGIN_EPOCH = 0 131 | config.TRAIN.END_EPOCH = 140 132 | 133 | config.TRAIN.RESUME = False 134 | 135 | config.TRAIN.BATCH_SIZE = 8 136 | config.TRAIN.SHUFFLE = True 137 | 138 | # testing 139 | config.TEST = edict() 140 | config.TEST.BATCH_SIZE = 8 141 | config.TEST.STATE = 'best' 142 | config.TEST.FLIP_TEST = False 143 | config.TEST.POST_PROCESS = False 144 | config.TEST.SHIFT_HEATMAP = False 145 | config.TEST.USE_GT_BBOX = False 146 | config.TEST.IMAGE_THRE = 0.1 147 | config.TEST.NMS_THRE = 0.6 148 | config.TEST.OKS_THRE = 0.5 149 | config.TEST.IN_VIS_THRE = 0.0 150 | config.TEST.BBOX_FILE = '' 151 | config.TEST.BBOX_THRE = 1.0 152 | config.TEST.MATCH_IOU_THRE = 0.3 153 | config.TEST.DETECTOR = 'fpn_dcn' 154 | config.TEST.DETECTOR_DIR = '' 155 | config.TEST.MODEL_FILE = '' 156 | config.TEST.HEATMAP_LOCATION_FILE = 'predicted_heatmaps.h5' 157 | 158 | # debug 159 | config.DEBUG = edict() 160 | config.DEBUG.DEBUG = True 161 | config.DEBUG.SAVE_BATCH_IMAGES_GT = True 162 | config.DEBUG.SAVE_BATCH_IMAGES_PRED = True 163 | config.DEBUG.SAVE_HEATMAPS_GT = True 164 | config.DEBUG.SAVE_HEATMAPS_PRED = True 165 | 166 | # pictorial structure 167 | config.PICT_STRUCT = edict() 168 | config.PICT_STRUCT.FIRST_NBINS = 16 169 | config.PICT_STRUCT.PAIRWISE_FILE = '' 170 | config.PICT_STRUCT.RECUR_NBINS = 2 171 | config.PICT_STRUCT.RECUR_DEPTH = 10 172 | config.PICT_STRUCT.LIMB_LENGTH_TOLERANCE = 150 173 | config.PICT_STRUCT.GRID_SIZE = np.array([2000.0, 2000.0, 2000.0]) 174 | config.PICT_STRUCT.CUBE_SIZE = np.array([64, 64, 64]) 175 | config.PICT_STRUCT.DEBUG = False 176 | config.PICT_STRUCT.TEST_PAIRWISE = False 177 | config.PICT_STRUCT.SHOW_ORIIMG = False 178 | config.PICT_STRUCT.SHOW_CROPIMG = False 179 | config.PICT_STRUCT.SHOW_HEATIMG = False 180 | 181 | config.MULTI_PERSON = edict() 182 | config.MULTI_PERSON.SPACE_SIZE = np.array([4000.0, 5200.0, 2400.0]) 183 | config.MULTI_PERSON.SPACE_CENTER = np.array([300.0, 300.0, 300.0]) 184 | config.MULTI_PERSON.INITIAL_CUBE_SIZE = np.array([24, 32, 16]) 185 | config.MULTI_PERSON.MAX_PEOPLE_NUM = 10 186 | config.MULTI_PERSON.THRESHOLD = 0.1 187 | 188 | 189 | def _update_dict(k, v): 190 | if k == 'DATASET': 191 | if 'MEAN' in v and v['MEAN']: 192 | v['MEAN'] = np.array( 193 | [eval(x) if isinstance(x, str) else x for x in v['MEAN']]) 194 | if 'STD' in v and v['STD']: 195 | v['STD'] = np.array( 196 | [eval(x) if isinstance(x, str) else x for x in v['STD']]) 197 | if k == 'NETWORK': 198 | if 'HEATMAP_SIZE' in v: 199 | if isinstance(v['HEATMAP_SIZE'], int): 200 | v['HEATMAP_SIZE'] = np.array( 201 | [v['HEATMAP_SIZE'], v['HEATMAP_SIZE']]) 202 | 
else: 203 | v['HEATMAP_SIZE'] = np.array(v['HEATMAP_SIZE']) 204 | if 'IMAGE_SIZE' in v: 205 | if isinstance(v['IMAGE_SIZE'], int): 206 | v['IMAGE_SIZE'] = np.array([v['IMAGE_SIZE'], v['IMAGE_SIZE']]) 207 | else: 208 | v['IMAGE_SIZE'] = np.array(v['IMAGE_SIZE']) 209 | for vk, vv in v.items(): 210 | if vk in config[k]: 211 | config[k][vk] = vv 212 | else: 213 | raise ValueError("{}.{} not exist in config.py".format(k, vk)) 214 | 215 | 216 | def update_config(config_file): 217 | exp_config = None 218 | with open(config_file) as f: 219 | exp_config = edict(yaml.load(f, Loader=yaml.FullLoader)) 220 | for k, v in exp_config.items(): 221 | if k in config: 222 | if isinstance(v, dict): 223 | _update_dict(k, v) 224 | else: 225 | if k == 'SCALES': 226 | config[k][0] = (tuple(v)) 227 | else: 228 | config[k] = v 229 | else: 230 | raise ValueError("{} not exist in config.py".format(k)) 231 | 232 | 233 | def gen_config(config_file): 234 | cfg = dict(config) 235 | for k, v in cfg.items(): 236 | if isinstance(v, edict): 237 | cfg[k] = dict(v) 238 | 239 | with open(config_file, 'w') as f: 240 | yaml.dump(dict(cfg), f, default_flow_style=False) 241 | 242 | 243 | def update_dir(model_dir, log_dir, data_dir): 244 | if model_dir: 245 | config.OUTPUT_DIR = model_dir 246 | 247 | if log_dir: 248 | config.LOG_DIR = log_dir 249 | 250 | if data_dir: 251 | config.DATA_DIR = data_dir 252 | 253 | config.DATASET.ROOT = os.path.join(config.DATA_DIR, config.DATASET.ROOT) 254 | 255 | config.TEST.BBOX_FILE = os.path.join(config.DATA_DIR, config.TEST.BBOX_FILE) 256 | 257 | config.NETWORK.PRETRAINED = os.path.join(config.DATA_DIR, 258 | config.NETWORK.PRETRAINED) 259 | 260 | 261 | def get_model_name(cfg): 262 | name = '{model}_{num_layers}'.format( 263 | model=cfg.MODEL, num_layers=cfg.POSE_RESNET.NUM_LAYERS) 264 | deconv_suffix = ''.join( 265 | 'd{}'.format(num_filters) 266 | for num_filters in cfg.POSE_RESNET.NUM_DECONV_FILTERS) 267 | full_name = '{height}x{width}_{name}_{deconv_suffix}'.format( 268 | height=cfg.NETWORK.IMAGE_SIZE[1], 269 | width=cfg.NETWORK.IMAGE_SIZE[0], 270 | name=name, 271 | deconv_suffix=deconv_suffix) 272 | 273 | return name, full_name 274 | 275 | 276 | if __name__ == '__main__': 277 | import sys 278 | gen_config(sys.argv[1]) 279 | -------------------------------------------------------------------------------- /lib/core/function.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import time 6 | import logging 7 | import os 8 | import copy 9 | 10 | import torch 11 | import numpy as np 12 | 13 | from utils.vis import save_debug_images_multi 14 | from utils.vis import save_debug_3d_images 15 | from utils.vis import save_debug_3d_cubes 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def train_3d(config, model, optimizer, loader, epoch, output_dir, writer_dict, device=torch.device('cuda'), dtype=torch.float): 21 | batch_time = AverageMeter() 22 | data_time = AverageMeter() 23 | losses = AverageMeter() 24 | losses_2d = AverageMeter() 25 | losses_3d = AverageMeter() 26 | losses_cord = AverageMeter() 27 | 28 | model.train() 29 | 30 | if model.module.backbone is not None: 31 | model.module.backbone.eval() # Comment out this line if you want to train 2D backbone jointly 32 | 33 | accumulation_steps = 4 34 | accu_loss_3d = 0 35 | 36 | end = time.time() 37 | for i, (inputs, targets_2d, weights_2d, targets_3d, meta, input_heatmap) in 
enumerate(loader): 38 | data_time.update(time.time() - end) 39 | 40 | if 'panoptic' in config.DATASET.TEST_DATASET: 41 | pred, heatmaps, grid_centers, loss_2d, loss_3d, loss_cord = model(views=inputs, meta=meta, 42 | targets_2d=targets_2d, 43 | weights_2d=weights_2d, 44 | targets_3d=targets_3d[0]) 45 | elif 'campus' in config.DATASET.TEST_DATASET or 'shelf' in config.DATASET.TEST_DATASET: 46 | pred, heatmaps, grid_centers, loss_2d, loss_3d, loss_cord = model(meta=meta, targets_3d=targets_3d[0], 47 | input_heatmaps=input_heatmap) 48 | 49 | loss_2d = loss_2d.mean() 50 | loss_3d = loss_3d.mean() 51 | loss_cord = loss_cord.mean() 52 | 53 | losses_2d.update(loss_2d.item()) 54 | losses_3d.update(loss_3d.item()) 55 | losses_cord.update(loss_cord.item()) 56 | loss = loss_2d + loss_3d + loss_cord 57 | losses.update(loss.item()) 58 | 59 | if loss_cord > 0: 60 | optimizer.zero_grad() 61 | (loss_2d + loss_cord).backward() 62 | optimizer.step() 63 | 64 | if accu_loss_3d > 0 and (i + 1) % accumulation_steps == 0: 65 | optimizer.zero_grad() 66 | accu_loss_3d.backward() 67 | optimizer.step() 68 | accu_loss_3d = 0.0 69 | else: 70 | accu_loss_3d += loss_3d / accumulation_steps 71 | 72 | batch_time.update(time.time() - end) 73 | end = time.time() 74 | 75 | if i % config.PRINT_FREQ == 0: 76 | gpu_memory_usage = torch.cuda.memory_allocated(0) 77 | msg = 'Epoch: [{0}][{1}/{2}]\t' \ 78 | 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \ 79 | 'Speed: {speed:.1f} samples/s\t' \ 80 | 'Data: {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \ 81 | 'Loss: {loss.val:.6f} ({loss.avg:.6f})\t' \ 82 | 'Loss_2d: {loss_2d.val:.7f} ({loss_2d.avg:.7f})\t' \ 83 | 'Loss_3d: {loss_3d.val:.7f} ({loss_3d.avg:.7f})\t' \ 84 | 'Loss_cord: {loss_cord.val:.6f} ({loss_cord.avg:.6f})\t' \ 85 | 'Memory {memory:.1f}'.format( 86 | epoch, i, len(loader), batch_time=batch_time, 87 | speed=len(inputs) * inputs[0].size(0) / batch_time.val, 88 | data_time=data_time, loss=losses, loss_2d=losses_2d, loss_3d=losses_3d, 89 | loss_cord=losses_cord, memory=gpu_memory_usage) 90 | logger.info(msg) 91 | 92 | writer = writer_dict['writer'] 93 | global_steps = writer_dict['train_global_steps'] 94 | writer.add_scalar('train_loss_3d', losses_3d.val, global_steps) 95 | writer.add_scalar('train_loss_cord', losses_cord.val, global_steps) 96 | writer.add_scalar('train_loss', losses.val, global_steps) 97 | writer_dict['train_global_steps'] = global_steps + 1 98 | 99 | for k in range(len(inputs)): 100 | view_name = 'view_{}'.format(k + 1) 101 | prefix = '{}_{:08}_{}'.format( 102 | os.path.join(output_dir, 'train'), i, view_name) 103 | save_debug_images_multi(config, inputs[k], meta[k], targets_2d[k], heatmaps[k], prefix) 104 | prefix2 = '{}_{:08}'.format( 105 | os.path.join(output_dir, 'train'), i) 106 | 107 | save_debug_3d_cubes(config, meta[0], grid_centers, prefix2) 108 | save_debug_3d_images(config, meta[0], pred, prefix2) 109 | 110 | 111 | def validate_3d(config, model, loader, output_dir): 112 | batch_time = AverageMeter() 113 | data_time = AverageMeter() 114 | model.eval() 115 | 116 | preds = [] 117 | with torch.no_grad(): 118 | end = time.time() 119 | for i, (inputs, targets_2d, weights_2d, targets_3d, meta, input_heatmap) in enumerate(loader): 120 | data_time.update(time.time() - end) 121 | if 'panoptic' in config.DATASET.TEST_DATASET: 122 | pred, heatmaps, grid_centers, _, _, _ = model(views=inputs, meta=meta, targets_2d=targets_2d, 123 | weights_2d=weights_2d, targets_3d=targets_3d[0]) 124 | elif 'campus' in config.DATASET.TEST_DATASET or 'shelf' 
in config.DATASET.TEST_DATASET: 125 | pred, heatmaps, grid_centers, _, _, _ = model(meta=meta, targets_3d=targets_3d[0], 126 | input_heatmaps=input_heatmap) 127 | pred = pred.detach().cpu().numpy() 128 | for b in range(pred.shape[0]): 129 | preds.append(pred[b]) 130 | 131 | batch_time.update(time.time() - end) 132 | end = time.time() 133 | if i % config.PRINT_FREQ == 0 or i == len(loader) - 1: 134 | gpu_memory_usage = torch.cuda.memory_allocated(0) 135 | msg = 'Test: [{0}/{1}]\t' \ 136 | 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \ 137 | 'Speed: {speed:.1f} samples/s\t' \ 138 | 'Data: {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \ 139 | 'Memory {memory:.1f}'.format( 140 | i, len(loader), batch_time=batch_time, 141 | speed=len(inputs) * inputs[0].size(0) / batch_time.val, 142 | data_time=data_time, memory=gpu_memory_usage) 143 | logger.info(msg) 144 | 145 | for k in range(len(inputs)): 146 | view_name = 'view_{}'.format(k + 1) 147 | prefix = '{}_{:08}_{}'.format( 148 | os.path.join(output_dir, 'validation'), i, view_name) 149 | save_debug_images_multi(config, inputs[k], meta[k], targets_2d[k], heatmaps[k], prefix) 150 | prefix2 = '{}_{:08}'.format( 151 | os.path.join(output_dir, 'validation'), i) 152 | 153 | save_debug_3d_cubes(config, meta[0], grid_centers, prefix2) 154 | save_debug_3d_images(config, meta[0], pred, prefix2) 155 | 156 | metric = None 157 | if 'panoptic' in config.DATASET.TEST_DATASET: 158 | aps, _, mpjpe, recall = loader.dataset.evaluate(preds) 159 | msg = 'ap@25: {aps_25:.4f}\tap@50: {aps_50:.4f}\tap@75: {aps_75:.4f}\t' \ 160 | 'ap@100: {aps_100:.4f}\tap@125: {aps_125:.4f}\tap@150: {aps_150:.4f}\t' \ 161 | 'recall@500mm: {recall:.4f}\tmpjpe@500mm: {mpjpe:.3f}'.format( 162 | aps_25=aps[0], aps_50=aps[1], aps_75=aps[2], aps_100=aps[3], 163 | aps_125=aps[4], aps_150=aps[5], recall=recall, mpjpe=mpjpe 164 | ) 165 | logger.info(msg) 166 | metric = np.mean(aps) 167 | elif 'campus' in config.DATASET.TEST_DATASET or 'shelf' in config.DATASET.TEST_DATASET: 168 | actor_pcp, avg_pcp, _, recall = loader.dataset.evaluate(preds) 169 | msg = ' | Actor 1 | Actor 2 | Actor 3 | Average | \n' \ 170 | ' PCP | {pcp_1:.2f} | {pcp_2:.2f} | {pcp_3:.2f} | {pcp_avg:.2f} |\t Recall@500mm: {recall:.4f}'.format( 171 | pcp_1=actor_pcp[0]*100, pcp_2=actor_pcp[1]*100, pcp_3=actor_pcp[2]*100, pcp_avg=avg_pcp*100, recall=recall) 172 | logger.info(msg) 173 | metric = np.mean(avg_pcp) 174 | 175 | return metric 176 | 177 | 178 | class AverageMeter(object): 179 | """Computes and stores the average and current value""" 180 | 181 | def __init__(self): 182 | self.reset() 183 | 184 | def reset(self): 185 | self.val = 0 186 | self.avg = 0 187 | self.sum = 0 188 | self.count = 0 189 | 190 | def update(self, val, n=1): 191 | self.val = val 192 | self.sum += val * n 193 | self.count += n 194 | self.avg = self.sum / self.count 195 | -------------------------------------------------------------------------------- /lib/core/loss.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
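# Loss functions used by VoxelPose:
#   - JointsMSELoss: per-joint MSE on 2D heatmaps, optionally masked by target_weight.
#   - PerJointMSELoss: MSE on heatmaps/volumes with optional per-joint weighting.
#   - PerJointL1Loss: L1 loss on regressed joint coordinates with optional weighting.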
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import torch.nn as nn 11 | 12 | 13 | class JointsMSELoss(nn.Module): 14 | def __init__(self, use_target_weight): 15 | super(JointsMSELoss, self).__init__() 16 | self.criterion = nn.MSELoss(reduction='mean') 17 | self.use_target_weight = use_target_weight 18 | 19 | def forward(self, output, target, target_weight): 20 | batch_size = output.size(0) 21 | num_joints = output.size(1) 22 | heatmaps_pred = output.reshape((batch_size, num_joints, -1)).split(1, 1) 23 | heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) 24 | loss = 0 25 | 26 | for idx in range(num_joints): 27 | heatmap_pred = heatmaps_pred[idx].squeeze() 28 | heatmap_gt = heatmaps_gt[idx].squeeze() 29 | if self.use_target_weight: 30 | loss += self.criterion(heatmap_pred.mul(target_weight[:, idx]), 31 | heatmap_gt.mul(target_weight[:, idx])) 32 | else: 33 | loss += self.criterion(heatmap_pred, heatmap_gt) 34 | 35 | return loss 36 | 37 | 38 | class PerJointMSELoss(nn.Module): 39 | def __init__(self): 40 | super(PerJointMSELoss, self).__init__() 41 | self.criterion = nn.MSELoss(reduction='mean') 42 | 43 | def forward(self, output, target, use_target_weight = False, target_weight=None): 44 | if use_target_weight: 45 | batch_size = output.size(0) 46 | num_joints = output.size(1) 47 | 48 | heatmap_pred = output.reshape((batch_size, num_joints, -1)) 49 | heatmap_gt = target.reshape((batch_size, num_joints, -1)) 50 | loss = self.criterion(heatmap_pred.mul(target_weight), heatmap_gt.mul(target_weight)) 51 | else: 52 | loss = self.criterion(output, target) 53 | 54 | return loss 55 | 56 | 57 | class PerJointL1Loss(nn.Module): 58 | def __init__(self): 59 | super(PerJointL1Loss, self).__init__() 60 | self.criterion = nn.L1Loss(reduction='mean') 61 | 62 | def forward(self, output, target, use_target_weight=False, target_weight=None): 63 | if use_target_weight: 64 | batch_size = output.size(0) 65 | num_joints = output.size(1) 66 | 67 | pred = output.reshape((batch_size, num_joints, -1)) 68 | gt = target.reshape((batch_size, num_joints, -1)) 69 | loss = self.criterion(pred.mul(target_weight), gt.mul(target_weight)) 70 | else: 71 | loss = self.criterion(output, target) 72 | 73 | return loss 74 | -------------------------------------------------------------------------------- /lib/core/proposal.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
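# Utilities for extracting person-centre proposals from the 3D root heatmap (root_cubes):
#   - max_pool(): keeps only local maxima via a 3D max-pool equality trick.
#   - nms(): suppresses non-peak voxels and returns the top-`max_num` scores.
#   - get_index(): unravels the flat top-k indices into (x, y, z) voxel coordinates.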
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | import torch 12 | import torch.nn.functional as F 13 | from scipy.ndimage import maximum_filter 14 | 15 | 16 | def get_index(indices, shape): 17 | batch_size = indices.shape[0] 18 | num_people = indices.shape[1] 19 | indices_x = (indices // (shape[1] * shape[2])).reshape(batch_size, num_people, -1) 20 | indices_y = ((indices % (shape[1] * shape[2])) // shape[2]).reshape(batch_size, num_people, -1) 21 | indices_z = (indices % shape[2]).reshape(batch_size, num_people, -1) 22 | indices = torch.cat([indices_x, indices_y, indices_z], dim=2) 23 | return indices 24 | 25 | 26 | def max_pool(inputs, kernel=3): 27 | padding = (kernel - 1) // 2 28 | max = F.max_pool3d(inputs, kernel_size=kernel, stride=1, padding=padding) 29 | keep = (inputs == max).float() 30 | return keep * inputs 31 | 32 | 33 | def nms(root_cubes, max_num): 34 | batch_size = root_cubes.shape[0] 35 | # root_cubes_nms = torch.zeros_like(root_cubes, device=root_cubes.device) 36 | # 37 | # for b in range(batch_size): 38 | # mx = torch.as_tensor(maximum_filter(root_cubes[b].detach().cpu().numpy(), size=3), 39 | # dtype=torch.float, device=root_cubes.device) 40 | # root_cubes_nms[b] = (mx == root_cubes[b]).float() * root_cubes[b] 41 | root_cubes_nms = max_pool(root_cubes) 42 | root_cubes_nms_reshape = root_cubes_nms.reshape(batch_size, -1) 43 | topk_values, topk_index = root_cubes_nms_reshape.topk(max_num) 44 | topk_unravel_index = get_index(topk_index, root_cubes[0].shape) 45 | 46 | return topk_values, topk_unravel_index 47 | -------------------------------------------------------------------------------- /lib/dataset/JointsDataset.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
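# Base dataset class shared by the Panoptic, Shelf and Campus loaders. For a single
# camera view, __getitem__ loads the image, warps it to the network input size with an
# affine transform, builds the 2D target heatmaps and weights, a 3D target volume, and
# (when predicted 2D poses are provided, as for Shelf/Campus) the input heatmaps that
# are fed to the model in place of backbone predictions.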
4 | # ------------------------------------------------------------------------------ 5 | 6 | import copy 7 | import logging 8 | 9 | import cv2 10 | import numpy as np 11 | import torch 12 | from torch.utils.data import Dataset 13 | import os 14 | 15 | from utils.transforms import get_affine_transform 16 | from utils.transforms import affine_transform, get_scale 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class JointsDataset(Dataset): 22 | 23 | def __init__(self, cfg, image_set, is_train, transform=None): 24 | self.cfg = cfg 25 | self.num_joints = 0 26 | self.pixel_std = 200 27 | self.flip_pairs = [] 28 | self.maximum_person = cfg.MULTI_PERSON.MAX_PEOPLE_NUM 29 | 30 | self.is_train = is_train 31 | 32 | this_dir = os.path.dirname(__file__) 33 | dataset_root = os.path.join(this_dir, '../..', cfg.DATASET.ROOT) 34 | self.dataset_root = os.path.abspath(dataset_root) 35 | self.root_id = cfg.DATASET.ROOTIDX 36 | self.image_set = image_set 37 | self.dataset_name = cfg.DATASET.TEST_DATASET 38 | 39 | self.data_format = cfg.DATASET.DATA_FORMAT 40 | self.data_augmentation = cfg.DATASET.DATA_AUGMENTATION 41 | 42 | self.num_views = cfg.DATASET.CAMERA_NUM 43 | 44 | self.scale_factor = cfg.DATASET.SCALE_FACTOR 45 | self.rotation_factor = cfg.DATASET.ROT_FACTOR 46 | self.flip = cfg.DATASET.FLIP 47 | self.color_rgb = cfg.DATASET.COLOR_RGB 48 | 49 | self.target_type = cfg.NETWORK.TARGET_TYPE 50 | self.image_size = np.array(cfg.NETWORK.IMAGE_SIZE) 51 | self.heatmap_size = np.array(cfg.NETWORK.HEATMAP_SIZE) 52 | self.sigma = cfg.NETWORK.SIGMA 53 | self.use_different_joints_weight = cfg.LOSS.USE_DIFFERENT_JOINTS_WEIGHT 54 | self.joints_weight = 1 55 | 56 | self.transform = transform 57 | self.db = [] 58 | 59 | self.space_size = np.array(cfg.MULTI_PERSON.SPACE_SIZE) 60 | self.space_center = np.array(cfg.MULTI_PERSON.SPACE_CENTER) 61 | self.initial_cube_size = np.array(cfg.MULTI_PERSON.INITIAL_CUBE_SIZE) 62 | 63 | 64 | def _get_db(self): 65 | raise NotImplementedError 66 | 67 | def evaluate(self, cfg, preds, output_dir, *args, **kwargs): 68 | raise NotImplementedError 69 | 70 | def __len__(self,): 71 | return len(self.db) 72 | 73 | def __getitem__(self, idx): 74 | db_rec = copy.deepcopy(self.db[idx]) 75 | 76 | image_file = db_rec['image'] 77 | 78 | if self.data_format == 'zip': 79 | from utils import zipreader 80 | data_numpy = zipreader.imread( 81 | image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION) 82 | else: 83 | data_numpy = cv2.imread( 84 | image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION) 85 | 86 | if data_numpy is None: 87 | # logger.error('=> fail to read {}'.format(image_file)) 88 | # raise ValueError('Fail to read {}'.format(image_file)) 89 | return None, None, None, None, None, None 90 | 91 | if self.color_rgb: 92 | data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB) 93 | 94 | joints = db_rec['joints_2d'] 95 | joints_vis = db_rec['joints_2d_vis'] 96 | joints_3d = db_rec['joints_3d'] 97 | joints_3d_vis = db_rec['joints_3d_vis'] 98 | 99 | nposes = len(joints) 100 | assert nposes <= self.maximum_person, 'too many persons' 101 | 102 | height, width, _ = data_numpy.shape 103 | c = np.array([width / 2.0, height / 2.0]) 104 | s = get_scale((width, height), self.image_size) 105 | r = 0 106 | 107 | trans = get_affine_transform(c, s, r, self.image_size) 108 | input = cv2.warpAffine( 109 | data_numpy, 110 | trans, (int(self.image_size[0]), int(self.image_size[1])), 111 | flags=cv2.INTER_LINEAR) 112 | 113 | if self.transform: 114 | input = self.transform(input) 115 | 
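        # Apply the same affine warp to every 2D joint and mark joints that land
        # outside the network input as invisible.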
116 | for n in range(nposes): 117 | for i in range(len(joints[0])): 118 | if joints_vis[n][i, 0] > 0.0: 119 | joints[n][i, 0:2] = affine_transform( 120 | joints[n][i, 0:2], trans) 121 | if (np.min(joints[n][i, :2]) < 0 or 122 | joints[n][i, 0] >= self.image_size[0] or 123 | joints[n][i, 1] >= self.image_size[1]): 124 | joints_vis[n][i, :] = 0 125 | 126 | if 'pred_pose2d' in db_rec and db_rec['pred_pose2d'] != None: 127 | # For convenience, we use predicted poses and corresponding values at the original heatmaps 128 | # to generate 2d heatmaps for Campus and Shelf dataset. 129 | # You can also use other 2d backbone trained on COCO to generate 2d heatmaps directly. 130 | pred_pose2d = db_rec['pred_pose2d'] 131 | for n in range(len(pred_pose2d)): 132 | for i in range(len(pred_pose2d[n])): 133 | pred_pose2d[n][i, 0:2] = affine_transform(pred_pose2d[n][i, 0:2], trans) 134 | 135 | input_heatmap = self.generate_input_heatmap(pred_pose2d) 136 | input_heatmap = torch.from_numpy(input_heatmap) 137 | else: 138 | input_heatmap = torch.zeros(self.cfg.NETWORK.NUM_JOINTS, self.heatmap_size[1], self.heatmap_size[0]) 139 | 140 | target_heatmap, target_weight = self.generate_target_heatmap( 141 | joints, joints_vis) 142 | target_heatmap = torch.from_numpy(target_heatmap) 143 | target_weight = torch.from_numpy(target_weight) 144 | 145 | # make joints and joints_vis having same shape 146 | joints_u = np.zeros((self.maximum_person, self.num_joints, 2)) 147 | joints_vis_u = np.zeros((self.maximum_person, self.num_joints, 2)) 148 | for i in range(nposes): 149 | joints_u[i] = joints[i] 150 | joints_vis_u[i] = joints_vis[i] 151 | 152 | joints_3d_u = np.zeros((self.maximum_person, self.num_joints, 3)) 153 | joints_3d_vis_u = np.zeros((self.maximum_person, self.num_joints, 3)) 154 | for i in range(nposes): 155 | joints_3d_u[i] = joints_3d[i][:, 0:3] 156 | joints_3d_vis_u[i] = joints_3d_vis[i][:, 0:3] 157 | 158 | target_3d = self.generate_3d_target(joints_3d) 159 | target_3d = torch.from_numpy(target_3d) 160 | 161 | if isinstance(self.root_id, int): 162 | roots_3d = joints_3d_u[:, self.root_id] 163 | elif isinstance(self.root_id, list): 164 | roots_3d = np.mean([joints_3d_u[:, j] for j in self.root_id], axis=0) 165 | meta = { 166 | 'image': image_file, 167 | 'num_person': nposes, 168 | 'joints_3d': joints_3d_u, 169 | 'joints_3d_vis': joints_3d_vis_u, 170 | 'roots_3d': roots_3d, 171 | 'joints': joints_u, 172 | 'joints_vis': joints_vis_u, 173 | 'center': c, 174 | 'scale': s, 175 | 'rotation': r, 176 | 'camera': db_rec['camera'] 177 | } 178 | 179 | return input, target_heatmap, target_weight, target_3d, meta, input_heatmap 180 | 181 | def compute_human_scale(self, pose, joints_vis): 182 | idx = joints_vis[:, 0] == 1 183 | if np.sum(idx) == 0: 184 | return 0 185 | minx, maxx = np.min(pose[idx, 0]), np.max(pose[idx, 0]) 186 | miny, maxy = np.min(pose[idx, 1]), np.max(pose[idx, 1]) 187 | # return np.clip((maxy - miny) * (maxx - minx), 1.0 / 4 * 256**2, 188 | # 4 * 256**2) 189 | return np.clip(np.maximum(maxy - miny, maxx - minx)**2, 1.0 / 4 * 96**2, 4 * 96**2) 190 | 191 | def generate_target_heatmap(self, joints, joints_vis): 192 | ''' 193 | :param joints: [[num_joints, 3]] 194 | :param joints_vis: [num_joints, 3] 195 | :return: target, target_weight(1: visible, 0: invisible) 196 | ''' 197 | nposes = len(joints) 198 | num_joints = self.num_joints 199 | target_weight = np.zeros((num_joints, 1), dtype=np.float32) 200 | for i in range(num_joints): 201 | for n in range(nposes): 202 | if joints_vis[n][i, 0] == 1: 203 | 
target_weight[i, 0] = 1 204 | 205 | assert self.target_type == 'gaussian', \ 206 | 'Only support gaussian map now!' 207 | 208 | if self.target_type == 'gaussian': 209 | target = np.zeros( 210 | (num_joints, self.heatmap_size[1], self.heatmap_size[0]), 211 | dtype=np.float32) 212 | feat_stride = self.image_size / self.heatmap_size 213 | 214 | for n in range(nposes): 215 | human_scale = 2 * self.compute_human_scale(joints[n] / feat_stride, joints_vis[n]) 216 | if human_scale == 0: 217 | continue 218 | 219 | cur_sigma = self.sigma * np.sqrt((human_scale / (96.0 * 96.0))) 220 | tmp_size = cur_sigma * 3 221 | for joint_id in range(num_joints): 222 | feat_stride = self.image_size / self.heatmap_size 223 | mu_x = int(joints[n][joint_id][0] / feat_stride[0]) 224 | mu_y = int(joints[n][joint_id][1] / feat_stride[1]) 225 | ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] 226 | br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] 227 | if joints_vis[n][joint_id, 0] == 0 or \ 228 | ul[0] >= self.heatmap_size[0] or \ 229 | ul[1] >= self.heatmap_size[1] \ 230 | or br[0] < 0 or br[1] < 0: 231 | continue 232 | 233 | size = 2 * tmp_size + 1 234 | x = np.arange(0, size, 1, np.float32) 235 | y = x[:, np.newaxis] 236 | x0 = y0 = size // 2 237 | g = np.exp( 238 | -((x - x0)**2 + (y - y0)**2) / (2 * cur_sigma**2)) 239 | 240 | # Usable gaussian range 241 | g_x = max(0, 242 | -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0] 243 | g_y = max(0, 244 | -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1] 245 | # Image range 246 | img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0]) 247 | img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1]) 248 | 249 | target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum(target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]], 250 | g[g_y[0]:g_y[1], g_x[0]:g_x[1]]) 251 | target = np.clip(target, 0, 1) 252 | 253 | if self.use_different_joints_weight: 254 | target_weight = np.multiply(target_weight, self.joints_weight) 255 | 256 | return target, target_weight 257 | 258 | def generate_3d_target(self, joints_3d): 259 | num_people = len(joints_3d) 260 | 261 | space_size = self.space_size 262 | space_center = self.space_center 263 | cube_size = self.initial_cube_size 264 | grid1Dx = np.linspace(-space_size[0] / 2, space_size[0] / 2, cube_size[0]) + space_center[0] 265 | grid1Dy = np.linspace(-space_size[1] / 2, space_size[1] / 2, cube_size[1]) + space_center[1] 266 | grid1Dz = np.linspace(-space_size[2] / 2, space_size[2] / 2, cube_size[2]) + space_center[2] 267 | 268 | target = np.zeros((cube_size[0], cube_size[1], cube_size[2]), dtype=np.float32) 269 | cur_sigma = 200.0 270 | 271 | for n in range(num_people): 272 | joint_id = self.root_id # mid-hip 273 | if isinstance(joint_id, int): 274 | mu_x = joints_3d[n][joint_id][0] 275 | mu_y = joints_3d[n][joint_id][1] 276 | mu_z = joints_3d[n][joint_id][2] 277 | elif isinstance(joint_id, list): 278 | mu_x = (joints_3d[n][joint_id[0]][0] + joints_3d[n][joint_id[1]][0]) / 2.0 279 | mu_y = (joints_3d[n][joint_id[0]][1] + joints_3d[n][joint_id[1]][1]) / 2.0 280 | mu_z = (joints_3d[n][joint_id[0]][2] + joints_3d[n][joint_id[1]][2]) / 2.0 281 | i_x = [np.searchsorted(grid1Dx, mu_x - 3 * cur_sigma), 282 | np.searchsorted(grid1Dx, mu_x + 3 * cur_sigma, 'right')] 283 | i_y = [np.searchsorted(grid1Dy, mu_y - 3 * cur_sigma), 284 | np.searchsorted(grid1Dy, mu_y + 3 * cur_sigma, 'right')] 285 | i_z = [np.searchsorted(grid1Dz, mu_z - 3 * cur_sigma), 286 | np.searchsorted(grid1Dz, mu_z + 3 * cur_sigma, 'right')] 287 | if i_x[0] >= i_x[1] 
or i_y[0] >= i_y[1] or i_z[0] >= i_z[1]: 288 | continue 289 | 290 | gridx, gridy, gridz = np.meshgrid(grid1Dx[i_x[0]:i_x[1]], grid1Dy[i_y[0]:i_y[1]], grid1Dz[i_z[0]:i_z[1]], indexing='ij') 291 | g = np.exp(-((gridx - mu_x) ** 2 + (gridy - mu_y) ** 2 + (gridz - mu_z) ** 2) / (2 * cur_sigma ** 2)) 292 | target[i_x[0]:i_x[1], i_y[0]:i_y[1], i_z[0]:i_z[1]] = np.maximum(target[i_x[0]:i_x[1], i_y[0]:i_y[1], i_z[0]:i_z[1]], g) 293 | 294 | target = np.clip(target, 0, 1) 295 | return target 296 | 297 | def generate_input_heatmap(self, joints): 298 | ''' 299 | :param joints: [[num_joints, 3]] 300 | :param joints_vis: [num_joints, 3] 301 | :return: input_heatmap 302 | ''' 303 | nposes = len(joints) 304 | num_joints = self.cfg.NETWORK.NUM_JOINTS 305 | 306 | assert self.target_type == 'gaussian', \ 307 | 'Only support gaussian map now!' 308 | 309 | if self.target_type == 'gaussian': 310 | target = np.zeros( 311 | (num_joints, self.heatmap_size[1], self.heatmap_size[0]), 312 | dtype=np.float32) 313 | feat_stride = self.image_size / self.heatmap_size 314 | 315 | for n in range(nposes): 316 | human_scale = 2 * self.compute_human_scale(joints[n][:, 0:2] / feat_stride, np.ones((num_joints, 1))) 317 | if human_scale == 0: 318 | continue 319 | 320 | cur_sigma = self.sigma * np.sqrt((human_scale / (96.0 * 96.0))) 321 | tmp_size = cur_sigma * 3 322 | for joint_id in range(num_joints): 323 | feat_stride = self.image_size / self.heatmap_size 324 | mu_x = int(joints[n][joint_id][0] / feat_stride[0]) 325 | mu_y = int(joints[n][joint_id][1] / feat_stride[1]) 326 | ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] 327 | br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] 328 | if ul[0] >= self.heatmap_size[0] or \ 329 | ul[1] >= self.heatmap_size[1] \ 330 | or br[0] < 0 or br[1] < 0: 331 | continue 332 | 333 | size = 2 * tmp_size + 1 334 | x = np.arange(0, size, 1, np.float32) 335 | y = x[:, np.newaxis] 336 | x0 = y0 = size // 2 337 | if 'campus' in self.dataset_name: 338 | max_value = 1.0 339 | else: 340 | max_value = joints[n][joint_id][2] if len(joints[n][joint_id]) == 3 else 1.0 341 | # max_value = max_value**0.5 342 | g = np.exp( 343 | -((x - x0)**2 + (y - y0)**2) / (2 * cur_sigma**2)) * max_value 344 | 345 | # Usable gaussian range 346 | g_x = max(0, 347 | -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0] 348 | g_y = max(0, 349 | -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1] 350 | # Image range 351 | img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0]) 352 | img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1]) 353 | 354 | target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum(target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]], 355 | g[g_y[0]:g_y[1], g_x[0]:g_x[1]]) 356 | target = np.clip(target, 0, 1) 357 | 358 | return target 359 | 360 | 361 | 362 | 363 | -------------------------------------------------------------------------------- /lib/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
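#
# The lowercase aliases imported below let a config string select the dataset class.
# A minimal sketch of how a training script might resolve them (illustrative only;
# the actual call site is not part of this file):
#
#   import dataset
#   train_ds = getattr(dataset, cfg.DATASET.TRAIN_DATASET)(cfg, 'train', True, transform)
#   test_ds = getattr(dataset, cfg.DATASET.TEST_DATASET)(cfg, 'validation', False, transform)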
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from dataset.panoptic import Panoptic as panoptic 11 | from dataset.shelf_synthetic import ShelfSynthetic as shelf_synthetic 12 | from dataset.campus_synthetic import CampusSynthetic as campus_synthetic 13 | from dataset.shelf import Shelf as shelf 14 | from dataset.campus import Campus as campus 15 | -------------------------------------------------------------------------------- /lib/dataset/campus.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os.path as osp 11 | import numpy as np 12 | import json_tricks as json 13 | import pickle 14 | import scipy.io as scio 15 | import logging 16 | import copy 17 | import os 18 | from collections import OrderedDict 19 | 20 | from dataset.JointsDataset import JointsDataset 21 | from utils.cameras_cpu import project_pose 22 | 23 | CAMPUS_JOINTS_DEF = { 24 | 'Right-Ankle': 0, 25 | 'Right-Knee': 1, 26 | 'Right-Hip': 2, 27 | 'Left-Hip': 3, 28 | 'Left-Knee': 4, 29 | 'Left-Ankle': 5, 30 | 'Right-Wrist': 6, 31 | 'Right-Elbow': 7, 32 | 'Right-Shoulder': 8, 33 | 'Left-Shoulder': 9, 34 | 'Left-Elbow': 10, 35 | 'Left-Wrist': 11, 36 | 'Bottom-Head': 12, 37 | 'Top-Head': 13 38 | } 39 | 40 | LIMBS = [ 41 | [0, 1], 42 | [1, 2], 43 | [3, 4], 44 | [4, 5], 45 | [2, 3], 46 | [6, 7], 47 | [7, 8], 48 | [9, 10], 49 | [10, 11], 50 | [2, 8], 51 | [3, 9], 52 | [8, 12], 53 | [9, 12], 54 | [12, 13] 55 | ] 56 | 57 | 58 | class Campus(JointsDataset): 59 | def __init__(self, cfg, image_set, is_train, transform=None): 60 | self.pixel_std = 200.0 61 | self.joints_def = CAMPUS_JOINTS_DEF 62 | super().__init__(cfg, image_set, is_train, transform) 63 | self.limbs = LIMBS 64 | self.num_joints = len(CAMPUS_JOINTS_DEF) 65 | self.cam_list = [0, 1, 2] 66 | self.num_views = len(self.cam_list) 67 | self.frame_range = list(range(350, 471)) + list(range(650, 751)) 68 | 69 | self.pred_pose2d = self._get_pred_pose2d() 70 | self.db = self._get_db() 71 | 72 | self.db_size = len(self.db) 73 | 74 | def _get_pred_pose2d(self): 75 | file = os.path.join(self.dataset_root, "pred_campus_maskrcnn_hrnet_coco.pkl") 76 | with open(file, "rb") as pfile: 77 | logging.info("=> load {}".format(file)) 78 | pred_2d = pickle.load(pfile) 79 | 80 | return pred_2d 81 | 82 | def _get_db(self): 83 | width = 360 84 | height = 288 85 | 86 | db = [] 87 | cameras = self._get_cam() 88 | 89 | datafile = os.path.join(self.dataset_root, 'actorsGT.mat') 90 | data = scio.loadmat(datafile) 91 | actor_3d = np.array(np.array(data['actor3D'].tolist()).tolist()).squeeze() # num_person * num_frame 92 | 93 | num_person = len(actor_3d) 94 | num_frames = len(actor_3d[0]) 95 | 96 | for i in self.frame_range: 97 | for k, cam in cameras.items(): 98 | image = osp.join("Camera" + k, "campus4-c{0}-{1:05d}.png".format(k, i)) 99 | 100 | all_poses_3d = [] 101 | all_poses_vis_3d = [] 102 | all_poses = [] 103 | all_poses_vis = [] 104 | for person in range(num_person): 105 | pose3d = 
actor_3d[person][i] * 1000.0 106 | if len(pose3d[0]) > 0: 107 | all_poses_3d.append(pose3d) 108 | all_poses_vis_3d.append(np.ones((self.num_joints, 3))) 109 | 110 | pose2d = project_pose(pose3d, cam) 111 | 112 | x_check = np.bitwise_and(pose2d[:, 0] >= 0, 113 | pose2d[:, 0] <= width - 1) 114 | y_check = np.bitwise_and(pose2d[:, 1] >= 0, 115 | pose2d[:, 1] <= height - 1) 116 | check = np.bitwise_and(x_check, y_check) 117 | 118 | joints_vis = np.ones((len(pose2d), 1)) 119 | joints_vis[np.logical_not(check)] = 0 120 | all_poses.append(pose2d) 121 | all_poses_vis.append( 122 | np.repeat( 123 | np.reshape(joints_vis, (-1, 1)), 2, axis=1)) 124 | 125 | pred_index = '{}_{}'.format(k, i) 126 | preds = self.pred_pose2d[pred_index] 127 | preds = [np.array(p["pred"]) for p in preds] 128 | 129 | db.append({ 130 | 'image': osp.join(self.dataset_root, image), 131 | 'joints_3d': all_poses_3d, 132 | 'joints_3d_vis': all_poses_vis_3d, 133 | 'joints_2d': all_poses, 134 | 'joints_2d_vis': all_poses_vis, 135 | 'camera': cam, 136 | 'pred_pose2d': preds 137 | }) 138 | return db 139 | 140 | def _get_cam(self): 141 | cam_file = osp.join(self.dataset_root, "calibration_campus.json") 142 | with open(cam_file) as cfile: 143 | cameras = json.load(cfile) 144 | 145 | for id, cam in cameras.items(): 146 | for k, v in cam.items(): 147 | cameras[id][k] = np.array(v) 148 | 149 | return cameras 150 | 151 | def __getitem__(self, idx): 152 | input, target_heatmap, target_weight, target_3d, meta, input_heatmap = [], [], [], [], [], [] 153 | for k in range(self.num_views): 154 | i, th, tw, t3, m, ih = super().__getitem__(self.num_views * idx + k) 155 | input.append(i) 156 | target_heatmap.append(th) 157 | target_weight.append(tw) 158 | input_heatmap.append(ih) 159 | target_3d.append(t3) 160 | meta.append(m) 161 | return input, target_heatmap, target_weight, target_3d, meta, input_heatmap 162 | 163 | def __len__(self): 164 | return self.db_size // self.num_views 165 | 166 | def evaluate(self, preds, recall_threshold=500): 167 | datafile = os.path.join(self.dataset_root, 'actorsGT.mat') 168 | data = scio.loadmat(datafile) 169 | actor_3d = np.array(np.array(data['actor3D'].tolist()).tolist()).squeeze() # num_person * num_frame 170 | num_person = len(actor_3d) 171 | total_gt = 0 172 | match_gt = 0 173 | 174 | limbs = [[0, 1], [1, 2], [3, 4], [4, 5], [6, 7], [7, 8], [9, 10], [10, 11], [12, 13]] 175 | correct_parts = np.zeros(num_person) 176 | total_parts = np.zeros(num_person) 177 | alpha = 0.5 178 | bone_correct_parts = np.zeros((num_person, 10)) 179 | 180 | for i, fi in enumerate(self.frame_range): 181 | pred_coco = preds[i].copy() 182 | pred_coco = pred_coco[pred_coco[:, 0, 3] >= 0, :, :3] 183 | pred = np.stack([self.coco2campus3D(p) for p in copy.deepcopy(pred_coco[:, :, :3])]) 184 | 185 | for person in range(num_person): 186 | gt = actor_3d[person][fi] * 1000.0 187 | if len(gt[0]) == 0: 188 | continue 189 | 190 | mpjpes = np.mean(np.sqrt(np.sum((gt[np.newaxis] - pred) ** 2, axis=-1)), axis=-1) 191 | min_n = np.argmin(mpjpes) 192 | min_mpjpe = np.min(mpjpes) 193 | if min_mpjpe < recall_threshold: 194 | match_gt += 1 195 | total_gt += 1 196 | 197 | for j, k in enumerate(limbs): 198 | total_parts[person] += 1 199 | error_s = np.linalg.norm(pred[min_n, k[0], 0:3] - gt[k[0]]) 200 | error_e = np.linalg.norm(pred[min_n, k[1], 0:3] - gt[k[1]]) 201 | limb_length = np.linalg.norm(gt[k[0]] - gt[k[1]]) 202 | if (error_s + error_e) / 2.0 <= alpha * limb_length: 203 | correct_parts[person] += 1 204 | bone_correct_parts[person, j] += 1 205 | 
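                # The torso segment (hip midpoint to bottom head) is not part of
                # `limbs`, so its PCP contribution is accumulated separately below
                # as bone index 9.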
pred_hip = (pred[min_n, 2, 0:3] + pred[min_n, 3, 0:3]) / 2.0 206 | gt_hip = (gt[2] + gt[3]) / 2.0 207 | total_parts[person] += 1 208 | error_s = np.linalg.norm(pred_hip - gt_hip) 209 | error_e = np.linalg.norm(pred[min_n, 12, 0:3] - gt[12]) 210 | limb_length = np.linalg.norm(gt_hip - gt[12]) 211 | if (error_s + error_e) / 2.0 <= alpha * limb_length: 212 | correct_parts[person] += 1 213 | bone_correct_parts[person, 9] += 1 214 | 215 | actor_pcp = correct_parts / (total_parts + 1e-8) 216 | avg_pcp = np.mean(actor_pcp[:3]) 217 | 218 | bone_group = OrderedDict( 219 | [('Head', [8]), ('Torso', [9]), ('Upper arms', [5, 6]), 220 | ('Lower arms', [4, 7]), ('Upper legs', [1, 2]), ('Lower legs', [0, 3])]) 221 | bone_person_pcp = OrderedDict() 222 | for k, v in bone_group.items(): 223 | bone_person_pcp[k] = np.sum(bone_correct_parts[:, v], axis=-1) / (total_parts / 10 * len(v) + 1e-8) 224 | 225 | return actor_pcp, avg_pcp, bone_person_pcp, match_gt / (total_gt + 1e-8) 226 | 227 | @staticmethod 228 | def coco2campus3D(coco_pose): 229 | """ 230 | transform coco order(our method output) 3d pose to shelf dataset order with interpolation 231 | :param coco_pose: np.array with shape 17x3 232 | :return: 3D pose in campus order with shape 14x3 233 | """ 234 | campus_pose = np.zeros((14, 3)) 235 | coco2campus = np.array([16, 14, 12, 11, 13, 15, 10, 8, 6, 5, 7, 9]) 236 | campus_pose[0: 12] += coco_pose[coco2campus] 237 | 238 | mid_sho = (coco_pose[5] + coco_pose[6]) / 2 # L and R shoulder 239 | head_center = (coco_pose[3] + coco_pose[4]) / 2 # middle of two ear 240 | 241 | head_bottom = (mid_sho + head_center) / 2 # nose and head center 242 | head_top = head_bottom + (head_center - head_bottom) * 2 243 | campus_pose[12] += head_bottom 244 | campus_pose[13] += head_top 245 | 246 | return campus_pose 247 | -------------------------------------------------------------------------------- /lib/dataset/campus_synthetic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
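#
# Overview (illustrative summary of the class below): this synthetic training set
# does not read real Campus frames. It samples 3D poses from
# panoptic_training_pose.pkl, drops them at random positions and rotations inside
# the capture space, projects them into the three calibrated Campus views, and
# renders Gaussian heatmaps as the network input; the image tensor itself is a
# constant placeholder.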
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os.path as osp 11 | import numpy as np 12 | import torch 13 | from torch.utils.data import Dataset 14 | 15 | import json_tricks as json 16 | import pickle 17 | import logging 18 | import copy 19 | import random 20 | import cv2 21 | 22 | import os 23 | 24 | from utils.transforms import get_affine_transform 25 | from utils.transforms import affine_transform 26 | from utils.transforms import rotate_points, get_scale 27 | from utils.cameras_cpu import project_pose 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | coco_joints_def = {0: 'nose', 32 | 1: 'Leye', 2: 'Reye', 3: 'Lear', 4: 'Rear', 33 | 5: 'Lsho', 6: 'Rsho', 34 | 7: 'Lelb', 8: 'Relb', 35 | 9: 'Lwri', 10: 'Rwri', 36 | 11: 'Lhip', 12: 'Rhip', 37 | 13: 'Lkne', 14: 'Rkne', 38 | 15: 'Lank', 16: 'Rank'} 39 | 40 | LIMBS = [[0, 1], [0, 2], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], [11, 13], [13, 15], 41 | [6, 12], [12, 14], [14, 16], [5, 6], [11, 12]] 42 | 43 | 44 | class CampusSynthetic(Dataset): 45 | def __init__(self, cfg, image_set, is_train, transform=None): 46 | super().__init__() 47 | self.pixel_std = 200.0 48 | self.joints_def = coco_joints_def 49 | self.limbs = LIMBS 50 | self.num_joints = len(coco_joints_def) 51 | self.cam_list = [0, 1, 2] 52 | self.num_views = len(self.cam_list) 53 | self.maximum_person = cfg.MULTI_PERSON.MAX_PEOPLE_NUM 54 | 55 | self.is_train = is_train 56 | 57 | this_dir = os.path.dirname(__file__) 58 | dataset_root = os.path.join(this_dir, '../..', cfg.DATASET.ROOT) 59 | self.dataset_root = dataset_root 60 | self.image_set = image_set 61 | self.dataset_name = cfg.DATASET.TEST_DATASET 62 | 63 | self.data_format = cfg.DATASET.DATA_FORMAT 64 | self.data_augmentation = cfg.DATASET.DATA_AUGMENTATION 65 | 66 | self.color_rgb = cfg.DATASET.COLOR_RGB 67 | 68 | self.target_type = cfg.NETWORK.TARGET_TYPE 69 | self.image_size = np.array(cfg.NETWORK.IMAGE_SIZE) 70 | self.heatmap_size = np.array(cfg.NETWORK.HEATMAP_SIZE) 71 | self.sigma = cfg.NETWORK.SIGMA 72 | self.use_different_joints_weight = cfg.LOSS.USE_DIFFERENT_JOINTS_WEIGHT 73 | self.joints_weight = 1 74 | 75 | self.transform = transform 76 | 77 | self.space_size = np.array(cfg.MULTI_PERSON.SPACE_SIZE) 78 | self.space_center = np.array(cfg.MULTI_PERSON.SPACE_CENTER) 79 | self.initial_cube_size = np.array(cfg.MULTI_PERSON.INITIAL_CUBE_SIZE) 80 | 81 | pose_db_file = os.path.join(self.dataset_root, "..", "panoptic_training_pose.pkl") 82 | self.pose_db = pickle.load(open(pose_db_file, "rb")) 83 | self.cameras = self._get_cam() 84 | 85 | def _get_cam(self): 86 | cam_file = osp.join(self.dataset_root, "calibration_campus.json") 87 | with open(cam_file) as cfile: 88 | cameras = json.load(cfile) 89 | 90 | for id, cam in cameras.items(): 91 | for k, v in cam.items(): 92 | cameras[id][k] = np.array(v) 93 | 94 | return cameras 95 | 96 | def __getitem__(self, idx): 97 | # nposes = np.random.choice([1, 2, 3, 4, 5], p=[0.1, 0.1, 0.2, 0.4, 0.2]) 98 | nposes = np.random.choice(range(1, 10)) 99 | bbox_list = [] 100 | center_list = [] 101 | 102 | select_poses = np.random.choice(self.pose_db, nposes) 103 | joints_3d = np.array([p['pose'] for p in select_poses]) 104 | joints_3d_vis = np.array([p['vis'] for p in select_poses]) 105 | 106 | for n in range(0, nposes): 107 | points = joints_3d[n][:, :2].copy() 108 | center 
= (points[11, :2] + points[12, :2]) / 2 109 | rot_rad = np.random.uniform(-180, 180) 110 | 111 | new_center = self.get_new_center(center_list) 112 | new_xy = rotate_points(points, center, rot_rad) - center + new_center 113 | 114 | loop_count = 0 115 | while not self.isvalid(new_center, self.calc_bbox(new_xy, joints_3d_vis[n]), bbox_list): 116 | loop_count += 1 117 | if loop_count >= 100: 118 | break 119 | new_center = self.get_new_center(center_list) 120 | new_xy = rotate_points(points, center, rot_rad) - center + new_center 121 | 122 | if loop_count >= 100: 123 | nposes = n 124 | joints_3d = joints_3d[:n] 125 | joints_3d_vis = joints_3d_vis[:n] 126 | else: 127 | center_list.append(new_center) 128 | bbox_list.append(self.calc_bbox(new_xy, joints_3d_vis[n])) 129 | joints_3d[n][:, :2] = new_xy 130 | 131 | input, target_heatmap, target_weight, target_3d, meta, input_heatmap = [], [], [], [], [], [] 132 | for k, cam in self.cameras.items(): 133 | i, th, tw, t3, m, ih = self._get_single_view_item(joints_3d, joints_3d_vis, cam) 134 | input.append(i) 135 | target_heatmap.append(th) 136 | target_weight.append(tw) 137 | input_heatmap.append(ih) 138 | target_3d.append(t3) 139 | meta.append(m) 140 | return input, target_heatmap, target_weight, target_3d, meta, input_heatmap 141 | 142 | def __len__(self): 143 | return 3000 144 | # return self.db_size // self.num_views 145 | 146 | def _get_single_view_item(self, joints_3d, joints_3d_vis, cam): 147 | joints_3d = copy.deepcopy(joints_3d) 148 | joints_3d_vis = copy.deepcopy(joints_3d_vis) 149 | nposes = len(joints_3d) 150 | 151 | width = 360 152 | height = 288 153 | c = np.array([width / 2.0, height / 2.0], dtype=np.float32) 154 | # s = np.array( 155 | # [width / self.pixel_std, height / self.pixel_std], dtype=np.float32) 156 | s = get_scale((width, height), self.image_size) 157 | r = 0 158 | 159 | joints = [] 160 | joints_vis = [] 161 | for n in range(nposes): 162 | pose2d = project_pose(joints_3d[n], cam) 163 | 164 | x_check = np.bitwise_and(pose2d[:, 0] >= 0, 165 | pose2d[:, 0] <= width - 1) 166 | y_check = np.bitwise_and(pose2d[:, 1] >= 0, 167 | pose2d[:, 1] <= height - 1) 168 | check = np.bitwise_and(x_check, y_check) 169 | vis = joints_3d_vis[n][:, 0] > 0 170 | vis[np.logical_not(check)] = 0 171 | 172 | joints.append(pose2d) 173 | joints_vis.append(np.repeat(np.reshape(vis, (-1, 1)), 2, axis=1)) 174 | 175 | trans = get_affine_transform(c, s, r, self.image_size) 176 | input = np.ones((height, width, 3), dtype=np.float32) 177 | input = cv2.warpAffine( 178 | input, 179 | trans, (int(self.image_size[0]), int(self.image_size[1])), 180 | flags=cv2.INTER_LINEAR) 181 | 182 | if self.transform: 183 | input = self.transform(input) 184 | 185 | for n in range(nposes): 186 | for i in range(len(joints[0])): 187 | if joints_vis[n][i, 0] > 0.0: 188 | joints[n][i, 0:2] = affine_transform( 189 | joints[n][i, 0:2], trans) 190 | if (np.min(joints[n][i, :2]) < 0 or 191 | joints[n][i, 0] >= self.image_size[0] or 192 | joints[n][i, 1] >= self.image_size[1]): 193 | joints_vis[n][i, :] = 0 194 | 195 | input_heatmap, _ = self.generate_input_heatmap( 196 | joints, joints_vis) 197 | input_heatmap = torch.from_numpy(input_heatmap) 198 | target_heatmap = torch.zeros_like(input_heatmap) 199 | target_weight = torch.zeros(len(target_heatmap), 1) 200 | 201 | # make joints and joints_vis having same shape 202 | joints_u = np.zeros((self.maximum_person, len(joints[0]), 2)) 203 | joints_vis_u = np.zeros((self.maximum_person, len(joints[0]), 2)) 204 | for i in range(nposes): 205 | 
joints_u[i] = joints[i] 206 | joints_vis_u[i] = joints_vis[i] 207 | 208 | joints_3d_u = np.zeros((self.maximum_person, len(joints[0]), 3)) 209 | joints_3d_vis_u = np.zeros((self.maximum_person, len(joints[0]), 3)) 210 | for i in range(nposes): 211 | joints_3d_u[i] = joints_3d[i][:, 0:3] 212 | joints_3d_vis_u[i] = joints_3d_vis[i][:, 0:3] 213 | 214 | target_3d = self.generate_3d_target(joints_3d) 215 | target_3d = torch.from_numpy(target_3d) 216 | 217 | meta = { 218 | 'image': '', 219 | 'num_person': nposes, 220 | 'joints_3d': joints_3d_u, 221 | 'roots_3d': (joints_3d_u[:, 11] + joints_3d_u[:, 12]) / 2.0, 222 | 'joints_3d_vis': joints_3d_vis_u, 223 | 'joints': joints_u, 224 | 'joints_vis': joints_vis_u, 225 | 'center': c, 226 | 'scale': s, 227 | 'rotation': r, 228 | 'camera': cam 229 | } 230 | 231 | return input, target_heatmap, target_weight, target_3d, meta, input_heatmap 232 | 233 | @staticmethod 234 | def compute_human_scale(pose, joints_vis): 235 | idx = joints_vis[:, 0] == 1 236 | if np.sum(idx) == 0: 237 | return 0 238 | minx, maxx = np.min(pose[idx, 0]), np.max(pose[idx, 0]) 239 | miny, maxy = np.min(pose[idx, 1]), np.max(pose[idx, 1]) 240 | return np.clip(np.maximum(maxy - miny, maxx - minx) ** 2, 1.0 / 4 * 96 ** 2, 4 * 96 ** 2) 241 | 242 | def generate_input_heatmap(self, joints, joints_vis): 243 | ''' 244 | :param joints: [[num_joints, 3]] 245 | :param joints_vis: [num_joints, 3] 246 | :return: input_heatmap 247 | ''' 248 | nposes = len(joints) 249 | num_joints = joints[0].shape[0] 250 | target_weight = np.zeros((num_joints, 1), dtype=np.float32) 251 | for i in range(num_joints): 252 | for n in range(nposes): 253 | if joints_vis[n][i, 0] == 1: 254 | target_weight[i, 0] = 1 255 | 256 | assert self.target_type == 'gaussian', \ 257 | 'Only support gaussian map now!' 
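        # The rendered peak value of each joint is randomly jittered (and sometimes
        # shrunk, most aggressively for wrists and elbows) so the synthetic heatmaps
        # mimic the confidence profile of an imperfect 2D detector.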
258 | 259 | if self.target_type == 'gaussian': 260 | target = np.zeros( 261 | (num_joints, self.heatmap_size[1], self.heatmap_size[0]), 262 | dtype=np.float32) 263 | feat_stride = self.image_size / self.heatmap_size 264 | 265 | for n in range(nposes): 266 | # obscured = random.random() < 0.05 267 | # if obscured: 268 | # continue 269 | human_scale = 2 * self.compute_human_scale(joints[n] / feat_stride, joints_vis[n]) 270 | if human_scale == 0: 271 | continue 272 | 273 | cur_sigma = self.sigma * np.sqrt((human_scale / (96.0 * 96.0))) 274 | tmp_size = cur_sigma * 3 275 | for joint_id in range(num_joints): 276 | feat_stride = self.image_size / self.heatmap_size 277 | mu_x = int(joints[n][joint_id][0] / feat_stride[0]) 278 | mu_y = int(joints[n][joint_id][1] / feat_stride[1]) 279 | ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] 280 | br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] 281 | if joints_vis[n][joint_id, 0] == 0 or \ 282 | ul[0] >= self.heatmap_size[0] or \ 283 | ul[1] >= self.heatmap_size[1] \ 284 | or br[0] < 0 or br[1] < 0: 285 | continue 286 | 287 | size = 2 * tmp_size + 1 288 | x = np.arange(0, size, 1, np.float32) 289 | y = x[:, np.newaxis] 290 | x0 = y0 = size // 2 291 | scale = 0.9 + np.random.randn(1) * 0.03 if random.random() < 0.6 else 1.0 292 | if joint_id in [7, 8]: 293 | scale = scale * 0.5 if random.random() < 0.1 else scale 294 | elif joint_id in [9, 10]: 295 | scale = scale * 0.2 if random.random() < 0.1 else scale 296 | else: 297 | scale = scale * 0.5 if random.random() < 0.05 else scale 298 | g = np.exp( 299 | -((x - x0) ** 2 + (y - y0) ** 2) / (2 * cur_sigma ** 2)) * scale 300 | 301 | # Usable gaussian range 302 | g_x = max(0, 303 | -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0] 304 | g_y = max(0, 305 | -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1] 306 | # Image range 307 | img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0]) 308 | img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1]) 309 | 310 | target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum( 311 | target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]], 312 | g[g_y[0]:g_y[1], g_x[0]:g_x[1]]) 313 | target = np.clip(target, 0, 1) 314 | 315 | if self.use_different_joints_weight: 316 | target_weight = np.multiply(target_weight, self.joints_weight) 317 | 318 | return target, target_weight 319 | 320 | def generate_3d_target(self, joints_3d): 321 | num_people = len(joints_3d) 322 | 323 | space_size = self.space_size 324 | space_center = self.space_center 325 | cube_size = self.initial_cube_size 326 | grid1Dx = np.linspace(-space_size[0] / 2, space_size[0] / 2, cube_size[0]) + space_center[0] 327 | grid1Dy = np.linspace(-space_size[1] / 2, space_size[1] / 2, cube_size[1]) + space_center[1] 328 | grid1Dz = np.linspace(-space_size[2] / 2, space_size[2] / 2, cube_size[2]) + space_center[2] 329 | 330 | target = np.zeros((cube_size[0], cube_size[1], cube_size[2]), dtype=np.float32) 331 | cur_sigma = 200.0 332 | 333 | for n in range(num_people): 334 | joint_id = [11, 12] # mid-hip 335 | mu_x = (joints_3d[n][joint_id[0]][0] + joints_3d[n][joint_id[1]][0]) / 2.0 336 | mu_y = (joints_3d[n][joint_id[0]][1] + joints_3d[n][joint_id[1]][1]) / 2.0 337 | mu_z = (joints_3d[n][joint_id[0]][2] + joints_3d[n][joint_id[1]][2]) / 2.0 338 | 339 | i_x = [np.searchsorted(grid1Dx, mu_x - 3 * cur_sigma), 340 | np.searchsorted(grid1Dx, mu_x + 3 * cur_sigma, 'right')] 341 | i_y = [np.searchsorted(grid1Dy, mu_y - 3 * cur_sigma), 342 | np.searchsorted(grid1Dy, mu_y + 3 * cur_sigma, 'right')] 343 | i_z = 
[np.searchsorted(grid1Dz, mu_z - 3 * cur_sigma), 344 | np.searchsorted(grid1Dz, mu_z + 3 * cur_sigma, 'right')] 345 | if i_x[0] >= i_x[1] or i_y[0] >= i_y[1] or i_z[0] >= i_z[1]: 346 | continue 347 | 348 | gridx, gridy, gridz = np.meshgrid(grid1Dx[i_x[0]:i_x[1]], grid1Dy[i_y[0]:i_y[1]], grid1Dz[i_z[0]:i_z[1]], 349 | indexing='ij') 350 | g = np.exp(-((gridx - mu_x) ** 2 + (gridy - mu_y) ** 2 + (gridz - mu_z) ** 2) / (2 * cur_sigma ** 2)) 351 | target[i_x[0]:i_x[1], i_y[0]:i_y[1], i_z[0]:i_z[1]] = np.maximum( 352 | target[i_x[0]:i_x[1], i_y[0]:i_y[1], i_z[0]:i_z[1]], g) 353 | 354 | target = np.clip(target, 0, 1) 355 | return target 356 | 357 | def evaluate(self): 358 | pass 359 | 360 | @staticmethod 361 | def get_new_center(center_list): 362 | if len(center_list) == 0 or random.random() < 0.7: 363 | new_center = np.array([np.random.uniform(-2500.0, 8500.0), np.random.uniform(-1000.0, 10000.0)]) 364 | else: 365 | xy = center_list[np.random.choice(range(len(center_list)))] 366 | new_center = xy + np.random.normal(500, 50, 2) * np.random.choice([1, -1], 2) 367 | 368 | return new_center 369 | 370 | def isvalid(self, new_center, bbox, bbox_list): 371 | new_center_us = new_center.reshape(1, -1) 372 | vis = 0 373 | for k, cam in self.cameras.items(): 374 | width = 360 375 | height = 288 376 | loc_2d = project_pose(np.hstack((new_center_us, [[1000.0]])), cam) 377 | if 10 < loc_2d[0, 0] < width - 10 and 10 < loc_2d[0, 1] < height - 10: 378 | vis += 1 379 | 380 | if len(bbox_list) == 0: 381 | return vis >= 2 382 | 383 | bbox_list = np.array(bbox_list) 384 | x0 = np.maximum(bbox[0], bbox_list[:, 0]) 385 | y0 = np.maximum(bbox[1], bbox_list[:, 1]) 386 | x1 = np.minimum(bbox[2], bbox_list[:, 2]) 387 | y1 = np.minimum(bbox[3], bbox_list[:, 3]) 388 | 389 | intersection = np.maximum(0, (x1 - x0) * (y1 - y0)) 390 | area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) 391 | area_list = (bbox_list[:, 2] - bbox_list[:, 0]) * (bbox_list[:, 3] - bbox_list[:, 1]) 392 | iou_list = intersection / (area + area_list - intersection) 393 | 394 | return vis >= 2 and np.max(iou_list) < 0.01 395 | 396 | @staticmethod 397 | def calc_bbox(pose, pose_vis): 398 | index = pose_vis[:, 0] > 0 399 | bbox = [np.min(pose[index, 0]), np.min(pose[index, 1]), 400 | np.max(pose[index, 0]), np.max(pose[index, 1])] 401 | 402 | return np.array(bbox) 403 | -------------------------------------------------------------------------------- /lib/dataset/panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
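#
# Overview (summary of the loader below): it parses hdPose3d_stage1_coco19
# annotations for the selected HD cameras, rotates the world frame with the fixed
# matrix M, scales the Panoptic coordinates from centimetres to millimetres (x10),
# and caches the assembled sample list in group_{image_set}_cam{num_views}.pkl so
# that subsequent runs skip the expensive JSON parsing.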
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import glob 11 | import os.path as osp 12 | import numpy as np 13 | import json_tricks as json 14 | import pickle 15 | import logging 16 | import os 17 | import copy 18 | 19 | from dataset.JointsDataset import JointsDataset 20 | from utils.transforms import projectPoints 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | TRAIN_LIST = [ 25 | '160422_ultimatum1', 26 | '160224_haggling1', 27 | '160226_haggling1', 28 | '161202_haggling1', 29 | '160906_ian1', 30 | '160906_ian2', 31 | '160906_ian3', 32 | '160906_band1', 33 | '160906_band2', 34 | '160906_band3', 35 | ] 36 | VAL_LIST = ['160906_pizza1', '160422_haggling1', '160906_ian5', '160906_band4'] 37 | 38 | JOINTS_DEF = { 39 | 'neck': 0, 40 | 'nose': 1, 41 | 'mid-hip': 2, 42 | 'l-shoulder': 3, 43 | 'l-elbow': 4, 44 | 'l-wrist': 5, 45 | 'l-hip': 6, 46 | 'l-knee': 7, 47 | 'l-ankle': 8, 48 | 'r-shoulder': 9, 49 | 'r-elbow': 10, 50 | 'r-wrist': 11, 51 | 'r-hip': 12, 52 | 'r-knee': 13, 53 | 'r-ankle': 14, 54 | # 'l-eye': 15, 55 | # 'l-ear': 16, 56 | # 'r-eye': 17, 57 | # 'r-ear': 18, 58 | } 59 | 60 | LIMBS = [[0, 1], 61 | [0, 2], 62 | [0, 3], 63 | [3, 4], 64 | [4, 5], 65 | [0, 9], 66 | [9, 10], 67 | [10, 11], 68 | [2, 6], 69 | [2, 12], 70 | [6, 7], 71 | [7, 8], 72 | [12, 13], 73 | [13, 14]] 74 | 75 | 76 | class Panoptic(JointsDataset): 77 | def __init__(self, cfg, image_set, is_train, transform=None): 78 | super().__init__(cfg, image_set, is_train, transform) 79 | self.pixel_std = 200.0 80 | self.joints_def = JOINTS_DEF 81 | self.limbs = LIMBS 82 | self.num_joints = len(JOINTS_DEF) 83 | 84 | if self.image_set == 'train': 85 | self.sequence_list = TRAIN_LIST 86 | self._interval = 3 87 | self.cam_list = [(0, 12), (0, 6), (0, 23), (0, 13), (0, 3)][:self.num_views] 88 | # self.cam_list = list(set([(0, n) for n in range(0, 31)]) - {(0, 12), (0, 6), (0, 23), (0, 13), (0, 3)}) 89 | # self.cam_list.sort() 90 | self.num_views = len(self.cam_list) 91 | elif self.image_set == 'validation': 92 | self.sequence_list = VAL_LIST 93 | self._interval = 12 94 | self.cam_list = [(0, 12), (0, 6), (0, 23), (0, 13), (0, 3)][:self.num_views] 95 | self.num_views = len(self.cam_list) 96 | 97 | self.db_file = 'group_{}_cam{}.pkl'.format(self.image_set, self.num_views) 98 | self.db_file = os.path.join(self.dataset_root, self.db_file) 99 | 100 | if osp.exists(self.db_file): 101 | info = pickle.load(open(self.db_file, 'rb')) 102 | assert info['sequence_list'] == self.sequence_list 103 | assert info['interval'] == self._interval 104 | assert info['cam_list'] == self.cam_list 105 | self.db = info['db'] 106 | else: 107 | self.db = self._get_db() 108 | info = { 109 | 'sequence_list': self.sequence_list, 110 | 'interval': self._interval, 111 | 'cam_list': self.cam_list, 112 | 'db': self.db 113 | } 114 | pickle.dump(info, open(self.db_file, 'wb')) 115 | # self.db = self._get_db() 116 | self.db_size = len(self.db) 117 | 118 | def _get_db(self): 119 | width = 1920 120 | height = 1080 121 | db = [] 122 | for seq in self.sequence_list: 123 | 124 | cameras = self._get_cam(seq) 125 | 126 | curr_anno = osp.join(self.dataset_root, seq, 'hdPose3d_stage1_coco19') 127 | anno_files = sorted(glob.iglob('{:s}/*.json'.format(curr_anno))) 128 | 129 | for i, file in enumerate(anno_files): 130 | if i % self._interval == 0: 131 | with open(file) as dfile: 132 | bodies = 
json.load(dfile)['bodies'] 133 | if len(bodies) == 0: 134 | continue 135 | 136 | for k, v in cameras.items(): 137 | postfix = osp.basename(file).replace('body3DScene', '') 138 | prefix = '{:02d}_{:02d}'.format(k[0], k[1]) 139 | image = osp.join(seq, 'hdImgs', prefix, 140 | prefix + postfix) 141 | image = image.replace('json', 'jpg') 142 | 143 | all_poses_3d = [] 144 | all_poses_vis_3d = [] 145 | all_poses = [] 146 | all_poses_vis = [] 147 | for body in bodies: 148 | pose3d = np.array(body['joints19']).reshape((-1, 4)) 149 | pose3d = pose3d[:self.num_joints] 150 | 151 | joints_vis = pose3d[:, -1] > 0.1 152 | 153 | if not joints_vis[self.root_id]: 154 | continue 155 | 156 | # Coordinate transformation 157 | M = np.array([[1.0, 0.0, 0.0], 158 | [0.0, 0.0, -1.0], 159 | [0.0, 1.0, 0.0]]) 160 | pose3d[:, 0:3] = pose3d[:, 0:3].dot(M) 161 | 162 | all_poses_3d.append(pose3d[:, 0:3] * 10.0) 163 | all_poses_vis_3d.append( 164 | np.repeat( 165 | np.reshape(joints_vis, (-1, 1)), 3, axis=1)) 166 | 167 | pose2d = np.zeros((pose3d.shape[0], 2)) 168 | pose2d[:, :2] = projectPoints( 169 | pose3d[:, 0:3].transpose(), v['K'], v['R'], 170 | v['t'], v['distCoef']).transpose()[:, :2] 171 | x_check = np.bitwise_and(pose2d[:, 0] >= 0, 172 | pose2d[:, 0] <= width - 1) 173 | y_check = np.bitwise_and(pose2d[:, 1] >= 0, 174 | pose2d[:, 1] <= height - 1) 175 | check = np.bitwise_and(x_check, y_check) 176 | joints_vis[np.logical_not(check)] = 0 177 | 178 | all_poses.append(pose2d) 179 | all_poses_vis.append( 180 | np.repeat( 181 | np.reshape(joints_vis, (-1, 1)), 2, axis=1)) 182 | 183 | if len(all_poses_3d) > 0: 184 | our_cam = {} 185 | our_cam['R'] = v['R'] 186 | our_cam['T'] = -np.dot(v['R'].T, v['t']) * 10.0 # cm to mm 187 | our_cam['fx'] = np.array(v['K'][0, 0]) 188 | our_cam['fy'] = np.array(v['K'][1, 1]) 189 | our_cam['cx'] = np.array(v['K'][0, 2]) 190 | our_cam['cy'] = np.array(v['K'][1, 2]) 191 | our_cam['k'] = v['distCoef'][[0, 1, 4]].reshape(3, 1) 192 | our_cam['p'] = v['distCoef'][[2, 3]].reshape(2, 1) 193 | 194 | db.append({ 195 | 'key': "{}_{}{}".format(seq, prefix, postfix.split('.')[0]), 196 | 'image': osp.join(self.dataset_root, image), 197 | 'joints_3d': all_poses_3d, 198 | 'joints_3d_vis': all_poses_vis_3d, 199 | 'joints_2d': all_poses, 200 | 'joints_2d_vis': all_poses_vis, 201 | 'camera': our_cam 202 | }) 203 | return db 204 | 205 | def _get_cam(self, seq): 206 | cam_file = osp.join(self.dataset_root, seq, 'calibration_{:s}.json'.format(seq)) 207 | with open(cam_file) as cfile: 208 | calib = json.load(cfile) 209 | 210 | M = np.array([[1.0, 0.0, 0.0], 211 | [0.0, 0.0, -1.0], 212 | [0.0, 1.0, 0.0]]) 213 | cameras = {} 214 | for cam in calib['cameras']: 215 | if (cam['panel'], cam['node']) in self.cam_list: 216 | sel_cam = {} 217 | sel_cam['K'] = np.array(cam['K']) 218 | sel_cam['distCoef'] = np.array(cam['distCoef']) 219 | sel_cam['R'] = np.array(cam['R']).dot(M) 220 | sel_cam['t'] = np.array(cam['t']).reshape((3, 1)) 221 | cameras[(cam['panel'], cam['node'])] = sel_cam 222 | return cameras 223 | 224 | def __getitem__(self, idx): 225 | input, target, weight, target_3d, meta, input_heatmap = [], [], [], [], [], [] 226 | 227 | # if self.image_set == 'train': 228 | # # camera_num = np.random.choice([5], size=1) 229 | # select_cam = np.random.choice(self.num_views, size=5, replace=False) 230 | # elif self.image_set == 'validation': 231 | # select_cam = list(range(self.num_views)) 232 | 233 | for k in range(self.num_views): 234 | i, t, w, t3, m, ih = super().__getitem__(self.num_views * idx + k) 235 | if i 
is None: 236 | continue 237 | input.append(i) 238 | target.append(t) 239 | weight.append(w) 240 | target_3d.append(t3) 241 | meta.append(m) 242 | input_heatmap.append(ih) 243 | return input, target, weight, target_3d, meta, input_heatmap 244 | 245 | def __len__(self): 246 | return self.db_size // self.num_views 247 | 248 | def evaluate(self, preds): 249 | eval_list = [] 250 | gt_num = self.db_size // self.num_views 251 | assert len(preds) == gt_num, 'number mismatch' 252 | 253 | total_gt = 0 254 | for i in range(gt_num): 255 | index = self.num_views * i 256 | db_rec = copy.deepcopy(self.db[index]) 257 | joints_3d = db_rec['joints_3d'] 258 | joints_3d_vis = db_rec['joints_3d_vis'] 259 | 260 | if len(joints_3d) == 0: 261 | continue 262 | 263 | pred = preds[i].copy() 264 | pred = pred[pred[:, 0, 3] >= 0] 265 | for pose in pred: 266 | mpjpes = [] 267 | for (gt, gt_vis) in zip(joints_3d, joints_3d_vis): 268 | vis = gt_vis[:, 0] > 0 269 | mpjpe = np.mean(np.sqrt(np.sum((pose[vis, 0:3] - gt[vis]) ** 2, axis=-1))) 270 | mpjpes.append(mpjpe) 271 | min_gt = np.argmin(mpjpes) 272 | min_mpjpe = np.min(mpjpes) 273 | score = pose[0, 4] 274 | eval_list.append({ 275 | "mpjpe": float(min_mpjpe), 276 | "score": float(score), 277 | "gt_id": int(total_gt + min_gt) 278 | }) 279 | 280 | total_gt += len(joints_3d) 281 | 282 | mpjpe_threshold = np.arange(25, 155, 25) 283 | aps = [] 284 | recs = [] 285 | for t in mpjpe_threshold: 286 | ap, rec = self._eval_list_to_ap(eval_list, total_gt, t) 287 | aps.append(ap) 288 | recs.append(rec) 289 | 290 | return aps, recs, self._eval_list_to_mpjpe(eval_list), self._eval_list_to_recall(eval_list, total_gt) 291 | 292 | @staticmethod 293 | def _eval_list_to_ap(eval_list, total_gt, threshold): 294 | eval_list.sort(key=lambda k: k["score"], reverse=True) 295 | total_num = len(eval_list) 296 | 297 | tp = np.zeros(total_num) 298 | fp = np.zeros(total_num) 299 | gt_det = [] 300 | for i, item in enumerate(eval_list): 301 | if item["mpjpe"] < threshold and item["gt_id"] not in gt_det: 302 | tp[i] = 1 303 | gt_det.append(item["gt_id"]) 304 | else: 305 | fp[i] = 1 306 | tp = np.cumsum(tp) 307 | fp = np.cumsum(fp) 308 | recall = tp / (total_gt + 1e-5) 309 | precise = tp / (tp + fp + 1e-5) 310 | for n in range(total_num - 2, -1, -1): 311 | precise[n] = max(precise[n], precise[n + 1]) 312 | 313 | precise = np.concatenate(([0], precise, [0])) 314 | recall = np.concatenate(([0], recall, [1])) 315 | index = np.where(recall[1:] != recall[:-1])[0] 316 | ap = np.sum((recall[index + 1] - recall[index]) * precise[index + 1]) 317 | 318 | return ap, recall[-2] 319 | 320 | @staticmethod 321 | def _eval_list_to_mpjpe(eval_list, threshold=500): 322 | eval_list.sort(key=lambda k: k["score"], reverse=True) 323 | gt_det = [] 324 | 325 | mpjpes = [] 326 | for i, item in enumerate(eval_list): 327 | if item["mpjpe"] < threshold and item["gt_id"] not in gt_det: 328 | mpjpes.append(item["mpjpe"]) 329 | gt_det.append(item["gt_id"]) 330 | 331 | return np.mean(mpjpes) if len(mpjpes) > 0 else np.inf 332 | 333 | @staticmethod 334 | def _eval_list_to_recall(eval_list, total_gt, threshold=500): 335 | gt_ids = [e["gt_id"] for e in eval_list if e["mpjpe"] < threshold] 336 | 337 | return len(np.unique(gt_ids)) / total_gt 338 | 339 | 340 | 341 | 342 | -------------------------------------------------------------------------------- /lib/dataset/shelf.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | 
# Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os.path as osp 11 | import numpy as np 12 | import json_tricks as json 13 | import pickle 14 | import scipy.io as scio 15 | import logging 16 | import copy 17 | import os 18 | from collections import OrderedDict 19 | 20 | from dataset.JointsDataset import JointsDataset 21 | from utils.cameras_cpu import project_pose 22 | 23 | SHELF_JOINTS_DEF = { 24 | 'Right-Ankle': 0, 25 | 'Right-Knee': 1, 26 | 'Right-Hip': 2, 27 | 'Left-Hip': 3, 28 | 'Left-Knee': 4, 29 | 'Left-Ankle': 5, 30 | 'Right-Wrist': 6, 31 | 'Right-Elbow': 7, 32 | 'Right-Shoulder': 8, 33 | 'Left-Shoulder': 9, 34 | 'Left-Elbow': 10, 35 | 'Left-Wrist': 11, 36 | 'Bottom-Head': 12, 37 | 'Top-Head': 13 38 | } 39 | 40 | LIMBS = [ 41 | [0, 1], 42 | [1, 2], 43 | [3, 4], 44 | [4, 5], 45 | [2, 3], 46 | [6, 7], 47 | [7, 8], 48 | [9, 10], 49 | [10, 11], 50 | [2, 8], 51 | [3, 9], 52 | [8, 12], 53 | [9, 12], 54 | [12, 13] 55 | ] 56 | 57 | 58 | class Shelf(JointsDataset): 59 | def __init__(self, cfg, image_set, is_train, transform=None): 60 | self.pixel_std = 200.0 61 | self.joints_def = SHELF_JOINTS_DEF 62 | super().__init__(cfg, image_set, is_train, transform) 63 | self.limbs = LIMBS 64 | self.num_joints = len(SHELF_JOINTS_DEF) 65 | self.cam_list = [0, 1, 2, 3, 4] 66 | self.num_views = len(self.cam_list) 67 | self.frame_range = list(range(300, 601)) 68 | 69 | self.pred_pose2d = self._get_pred_pose2d() 70 | self.db = self._get_db() 71 | 72 | self.db_size = len(self.db) 73 | 74 | def _get_pred_pose2d(self): 75 | file = os.path.join(self.dataset_root, "pred_shelf_maskrcnn_hrnet_coco.pkl") 76 | with open(file, "rb") as pfile: 77 | logging.info("=> load {}".format(file)) 78 | pred_2d = pickle.load(pfile) 79 | 80 | return pred_2d 81 | 82 | def _get_db(self): 83 | width = 1032 84 | height = 776 85 | 86 | db = [] 87 | cameras = self._get_cam() 88 | 89 | datafile = os.path.join(self.dataset_root, 'actorsGT.mat') 90 | data = scio.loadmat(datafile) 91 | actor_3d = np.array(np.array(data['actor3D'].tolist()).tolist()).squeeze() # num_person * num_frame 92 | 93 | num_person = len(actor_3d) 94 | num_frames = len(actor_3d[0]) 95 | 96 | for i in self.frame_range: 97 | for k, cam in cameras.items(): 98 | image = osp.join("Camera" + k, "img_{:06d}.png".format(i)) 99 | 100 | all_poses_3d = [] 101 | all_poses_vis_3d = [] 102 | all_poses = [] 103 | all_poses_vis = [] 104 | for person in range(num_person): 105 | pose3d = actor_3d[person][i] * 1000.0 106 | if len(pose3d[0]) > 0: 107 | all_poses_3d.append(pose3d) 108 | all_poses_vis_3d.append(np.ones((self.num_joints, 3))) 109 | 110 | pose2d = project_pose(pose3d, cam) 111 | 112 | x_check = np.bitwise_and(pose2d[:, 0] >= 0, 113 | pose2d[:, 0] <= width - 1) 114 | y_check = np.bitwise_and(pose2d[:, 1] >= 0, 115 | pose2d[:, 1] <= height - 1) 116 | check = np.bitwise_and(x_check, y_check) 117 | 118 | joints_vis = np.ones((len(pose2d), 1)) 119 | joints_vis[np.logical_not(check)] = 0 120 | all_poses.append(pose2d) 121 | all_poses_vis.append( 122 | np.repeat( 123 | np.reshape(joints_vis, (-1, 1)), 2, axis=1)) 124 | 125 | pred_index = '{}_{}'.format(k, i) 126 | preds = self.pred_pose2d[pred_index] 127 | preds = [np.array(p["pred"]) for p in preds] 128 | db.append({ 129 | 'image': osp.join(self.dataset_root, 
image), 130 | 'joints_3d': all_poses_3d, 131 | 'joints_3d_vis': all_poses_vis_3d, 132 | 'joints_2d': all_poses, 133 | 'joints_2d_vis': all_poses_vis, 134 | 'camera': cam, 135 | 'pred_pose2d': preds 136 | }) 137 | 138 | return db 139 | 140 | def _get_cam(self): 141 | cam_file = osp.join(self.dataset_root, "calibration_shelf.json") 142 | with open(cam_file) as cfile: 143 | cameras = json.load(cfile) 144 | 145 | for id, cam in cameras.items(): 146 | for k, v in cam.items(): 147 | cameras[id][k] = np.array(v) 148 | 149 | return cameras 150 | 151 | def __getitem__(self, idx): 152 | input, target_heatmap, target_weight, target_3d, meta, input_heatmap = [], [], [], [], [], [] 153 | for k in range(self.num_views): 154 | i, th, tw, t3, m, ih = super().__getitem__(self.num_views * idx + k) 155 | input.append(i) 156 | target_heatmap.append(th) 157 | target_weight.append(tw) 158 | input_heatmap.append(ih) 159 | target_3d.append(t3) 160 | meta.append(m) 161 | return input, target_heatmap, target_weight, target_3d, meta, input_heatmap 162 | 163 | def __len__(self): 164 | return self.db_size // self.num_views 165 | 166 | def evaluate(self, preds, recall_threshold=500): 167 | datafile = os.path.join(self.dataset_root, 'actorsGT.mat') 168 | data = scio.loadmat(datafile) 169 | actor_3d = np.array(np.array(data['actor3D'].tolist()).tolist()).squeeze() # num_person * num_frame 170 | num_person = len(actor_3d) 171 | total_gt = 0 172 | match_gt = 0 173 | 174 | limbs = [[0, 1], [1, 2], [3, 4], [4, 5], [6, 7], [7, 8], [9, 10], [10, 11], [12, 13]] 175 | correct_parts = np.zeros(num_person) 176 | total_parts = np.zeros(num_person) 177 | alpha = 0.5 178 | bone_correct_parts = np.zeros((num_person, 10)) 179 | 180 | for i, fi in enumerate(self.frame_range): 181 | pred_coco = preds[i].copy() 182 | pred_coco = pred_coco[pred_coco[:, 0, 3] >= 0, :, :3] 183 | pred = np.stack([self.coco2shelf3D(p) for p in copy.deepcopy(pred_coco[:, :, :3])]) 184 | 185 | for person in range(num_person): 186 | gt = actor_3d[person][fi] * 1000.0 187 | if len(gt[0]) == 0: 188 | continue 189 | 190 | mpjpes = np.mean(np.sqrt(np.sum((gt[np.newaxis] - pred) ** 2, axis=-1)), axis=-1) 191 | min_n = np.argmin(mpjpes) 192 | min_mpjpe = np.min(mpjpes) 193 | if min_mpjpe < recall_threshold: 194 | match_gt += 1 195 | total_gt += 1 196 | 197 | for j, k in enumerate(limbs): 198 | total_parts[person] += 1 199 | error_s = np.linalg.norm(pred[min_n, k[0], 0:3] - gt[k[0]]) 200 | error_e = np.linalg.norm(pred[min_n, k[1], 0:3] - gt[k[1]]) 201 | limb_length = np.linalg.norm(gt[k[0]] - gt[k[1]]) 202 | if (error_s + error_e) / 2.0 <= alpha * limb_length: 203 | correct_parts[person] += 1 204 | bone_correct_parts[person, j] += 1 205 | pred_hip = (pred[min_n, 2, 0:3] + pred[min_n, 3, 0:3]) / 2.0 206 | gt_hip = (gt[2] + gt[3]) / 2.0 207 | total_parts[person] += 1 208 | error_s = np.linalg.norm(pred_hip - gt_hip) 209 | error_e = np.linalg.norm(pred[min_n, 12, 0:3] - gt[12]) 210 | limb_length = np.linalg.norm(gt_hip - gt[12]) 211 | if (error_s + error_e) / 2.0 <= alpha * limb_length: 212 | correct_parts[person] += 1 213 | bone_correct_parts[person, 9] += 1 214 | 215 | actor_pcp = correct_parts / (total_parts + 1e-8) 216 | avg_pcp = np.mean(actor_pcp[:3]) 217 | 218 | bone_group = OrderedDict( 219 | [('Head', [8]), ('Torso', [9]), ('Upper arms', [5, 6]), 220 | ('Lower arms', [4, 7]), ('Upper legs', [1, 2]), ('Lower legs', [0, 3])]) 221 | bone_person_pcp = OrderedDict() 222 | for k, v in bone_group.items(): 223 | bone_person_pcp[k] = np.sum(bone_correct_parts[:, v], 
axis=-1) / (total_parts / 10 * len(v) + 1e-8) 224 | 225 | return actor_pcp, avg_pcp, bone_person_pcp, match_gt / (total_gt + 1e-8) 226 | 227 | @staticmethod 228 | def coco2shelf3D(coco_pose): 229 | """ 230 | transform coco order(our method output) 3d pose to shelf dataset order with interpolation 231 | :param coco_pose: np.array with shape 17x3 232 | :return: 3D pose in shelf order with shape 14x3 233 | """ 234 | shelf_pose = np.zeros((14, 3)) 235 | coco2shelf = np.array([16, 14, 12, 11, 13, 15, 10, 8, 6, 5, 7, 9]) 236 | shelf_pose[0: 12] += coco_pose[coco2shelf] 237 | 238 | mid_sho = (coco_pose[5] + coco_pose[6]) / 2 # L and R shoulder 239 | head_center = (coco_pose[3] + coco_pose[4]) / 2 # middle of two ear 240 | 241 | head_bottom = (mid_sho + head_center) / 2 # nose and head center 242 | head_top = head_bottom + (head_center - head_bottom) * 2 243 | # shelf_pose[12] += head_bottom 244 | # shelf_pose[13] += head_top 245 | 246 | shelf_pose[12] = (shelf_pose[8] + shelf_pose[9]) / 2 # Use middle of shoulder to init 247 | shelf_pose[13] = coco_pose[0] # use nose to init 248 | 249 | shelf_pose[13] = shelf_pose[12] + (shelf_pose[13] - shelf_pose[12]) * np.array([0.75, 0.75, 1.5]) 250 | shelf_pose[12] = shelf_pose[12] + (coco_pose[0] - shelf_pose[12]) * np.array([0.5, 0.5, 0.5]) 251 | 252 | alpha = 0.75 253 | shelf_pose[13] = shelf_pose[13] * alpha + head_top * (1 - alpha) 254 | shelf_pose[12] = shelf_pose[12] * alpha + head_bottom * (1 - alpha) 255 | 256 | return shelf_pose 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | -------------------------------------------------------------------------------- /lib/dataset/shelf_synthetic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
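#
# Overview (illustrative summary): this mirrors campus_synthetic.py but uses the
# Shelf calibration -- five cameras, 1032x776 frames, and up to five synthesized
# people per sample -- again rendering input heatmaps from projected Panoptic
# poses instead of reading real images.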
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os.path as osp 11 | import numpy as np 12 | import torch 13 | from torch.utils.data import Dataset 14 | 15 | import json_tricks as json 16 | import pickle 17 | import logging 18 | import copy 19 | import random 20 | import cv2 21 | 22 | import os 23 | 24 | from utils.transforms import get_affine_transform 25 | from utils.transforms import affine_transform 26 | from utils.transforms import rotate_points, get_scale 27 | from utils.cameras_cpu import project_pose 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | coco_joints_def = {0: 'nose', 32 | 1: 'Leye', 2: 'Reye', 3: 'Lear', 4: 'Rear', 33 | 5: 'Lsho', 6: 'Rsho', 34 | 7: 'Lelb', 8: 'Relb', 35 | 9: 'Lwri', 10: 'Rwri', 36 | 11: 'Lhip', 12: 'Rhip', 37 | 13: 'Lkne', 14: 'Rkne', 38 | 15: 'Lank', 16: 'Rank'} 39 | 40 | LIMBS = [[0, 1], [0, 2], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], [11, 13], [13, 15], 41 | [6, 12], [12, 14], [14, 16], [5, 6], [11, 12]] 42 | 43 | 44 | class ShelfSynthetic(Dataset): 45 | def __init__(self, cfg, image_set, is_train, transform=None): 46 | super().__init__() 47 | self.pixel_std = 200.0 48 | self.joints_def = coco_joints_def 49 | self.limbs = LIMBS 50 | self.num_joints = len(coco_joints_def) 51 | self.cam_list = [0, 1, 2, 3, 4] 52 | self.num_views = len(self.cam_list) 53 | self.maximum_person = cfg.MULTI_PERSON.MAX_PEOPLE_NUM 54 | 55 | self.is_train = is_train 56 | 57 | this_dir = os.path.dirname(__file__) 58 | dataset_root = os.path.join(this_dir, '../..', cfg.DATASET.ROOT) 59 | self.dataset_root = dataset_root 60 | self.image_set = image_set 61 | self.dataset_name = cfg.DATASET.TEST_DATASET 62 | 63 | self.data_format = cfg.DATASET.DATA_FORMAT 64 | self.data_augmentation = cfg.DATASET.DATA_AUGMENTATION 65 | 66 | self.color_rgb = cfg.DATASET.COLOR_RGB 67 | 68 | self.target_type = cfg.NETWORK.TARGET_TYPE 69 | self.image_size = np.array(cfg.NETWORK.IMAGE_SIZE) 70 | self.heatmap_size = np.array(cfg.NETWORK.HEATMAP_SIZE) 71 | self.sigma = cfg.NETWORK.SIGMA 72 | self.use_different_joints_weight = cfg.LOSS.USE_DIFFERENT_JOINTS_WEIGHT 73 | self.joints_weight = 1 74 | 75 | self.transform = transform 76 | 77 | self.space_size = np.array(cfg.MULTI_PERSON.SPACE_SIZE) 78 | self.space_center = np.array(cfg.MULTI_PERSON.SPACE_CENTER) 79 | self.initial_cube_size = np.array(cfg.MULTI_PERSON.INITIAL_CUBE_SIZE) 80 | 81 | pose_db_file = os.path.join(self.dataset_root, "..", "panoptic_training_pose.pkl") 82 | self.pose_db = pickle.load(open(pose_db_file, "rb")) 83 | self.cameras = self._get_cam() 84 | 85 | def _get_cam(self): 86 | cam_file = osp.join(self.dataset_root, "calibration_shelf.json") 87 | with open(cam_file) as cfile: 88 | cameras = json.load(cfile) 89 | 90 | for id, cam in cameras.items(): 91 | for k, v in cam.items(): 92 | cameras[id][k] = np.array(v) 93 | 94 | return cameras 95 | 96 | def __getitem__(self, idx): 97 | # nposes = np.random.choice([1, 2, 3, 4, 5], p=[0.1, 0.1, 0.2, 0.4, 0.2]) 98 | nposes = np.random.choice(range(1, 6)) 99 | bbox_list = [] 100 | center_list = [] 101 | 102 | select_poses = np.random.choice(self.pose_db, nposes) 103 | joints_3d = np.array([p['pose'] for p in select_poses]) 104 | joints_3d_vis = np.array([p['vis'] for p in select_poses]) 105 | 106 | for n in range(0, nposes): 107 | points = joints_3d[n][:, :2].copy() 108 | 
center = (points[11, :2] + points[12, :2]) / 2 109 | rot_rad = np.random.uniform(-180, 180) 110 | 111 | new_center = self.get_new_center(center_list) 112 | new_xy = rotate_points(points, center, rot_rad) - center + new_center 113 | 114 | loop_count = 0 115 | while not self.isvalid(self.calc_bbox(new_xy, joints_3d_vis[n]), bbox_list): 116 | loop_count += 1 117 | if loop_count >= 100: 118 | break 119 | new_center = self.get_new_center(center_list) 120 | new_xy = rotate_points(points, center, rot_rad) - center + new_center 121 | 122 | if loop_count >= 100: 123 | nposes = n 124 | joints_3d = joints_3d[:n] 125 | joints_3d_vis = joints_3d_vis[:n] 126 | else: 127 | center_list.append(new_center) 128 | bbox_list.append(self.calc_bbox(new_xy, joints_3d_vis[n])) 129 | joints_3d[n][:, :2] = new_xy 130 | 131 | input, target_heatmap, target_weight, target_3d, meta, input_heatmap = [], [], [], [], [], [] 132 | for k, cam in self.cameras.items(): 133 | i, th, tw, t3, m, ih = self._get_single_view_item(joints_3d, joints_3d_vis, cam) 134 | input.append(i) 135 | target_heatmap.append(th) 136 | target_weight.append(tw) 137 | input_heatmap.append(ih) 138 | target_3d.append(t3) 139 | meta.append(m) 140 | return input, target_heatmap, target_weight, target_3d, meta, input_heatmap 141 | 142 | def __len__(self): 143 | return 3000 144 | # return self.db_size // self.num_views 145 | 146 | def _get_single_view_item(self, joints_3d, joints_3d_vis, cam): 147 | joints_3d = copy.deepcopy(joints_3d) 148 | joints_3d_vis = copy.deepcopy(joints_3d_vis) 149 | nposes = len(joints_3d) 150 | 151 | width = 1032 152 | height = 776 153 | c = np.array([width / 2.0, height / 2.0], dtype=np.float32) 154 | # s = np.array( 155 | # [width / self.pixel_std, height / self.pixel_std], dtype=np.float32) 156 | s = get_scale((width, height), self.image_size) 157 | r = 0 158 | 159 | joints = [] 160 | joints_vis = [] 161 | for n in range(nposes): 162 | pose2d = project_pose(joints_3d[n], cam) 163 | 164 | x_check = np.bitwise_and(pose2d[:, 0] >= 0, 165 | pose2d[:, 0] <= width - 1) 166 | y_check = np.bitwise_and(pose2d[:, 1] >= 0, 167 | pose2d[:, 1] <= height - 1) 168 | check = np.bitwise_and(x_check, y_check) 169 | vis = joints_3d_vis[n][:, 0] > 0 170 | vis[np.logical_not(check)] = 0 171 | 172 | joints.append(pose2d) 173 | joints_vis.append(np.repeat(np.reshape(vis, (-1, 1)), 2, axis=1)) 174 | 175 | trans = get_affine_transform(c, s, r, self.image_size) 176 | input = np.ones((height, width, 3), dtype=np.float32) 177 | input = cv2.warpAffine( 178 | input, 179 | trans, (int(self.image_size[0]), int(self.image_size[1])), 180 | flags=cv2.INTER_LINEAR) 181 | 182 | if self.transform: 183 | input = self.transform(input) 184 | 185 | for n in range(nposes): 186 | for i in range(len(joints[0])): 187 | if joints_vis[n][i, 0] > 0.0: 188 | joints[n][i, 0:2] = affine_transform( 189 | joints[n][i, 0:2], trans) 190 | if (np.min(joints[n][i, :2]) < 0 or 191 | joints[n][i, 0] >= self.image_size[0] or 192 | joints[n][i, 1] >= self.image_size[1]): 193 | joints_vis[n][i, :] = 0 194 | 195 | input_heatmap, _ = self.generate_input_heatmap( 196 | joints, joints_vis) 197 | input_heatmap = torch.from_numpy(input_heatmap) 198 | target_heatmap = torch.zeros_like(input_heatmap) 199 | target_weight = torch.zeros(len(target_heatmap), 1) 200 | 201 | # make joints and joints_vis having same shape 202 | joints_u = np.zeros((self.maximum_person, len(joints[0]), 2)) 203 | joints_vis_u = np.zeros((self.maximum_person, len(joints[0]), 2)) 204 | for i in range(nposes): 205 | 
joints_u[i] = joints[i] 206 | joints_vis_u[i] = joints_vis[i] 207 | 208 | joints_3d_u = np.zeros((self.maximum_person, len(joints[0]), 3)) 209 | joints_3d_vis_u = np.zeros((self.maximum_person, len(joints[0]), 3)) 210 | for i in range(nposes): 211 | joints_3d_u[i] = joints_3d[i][:, 0:3] 212 | joints_3d_vis_u[i] = joints_3d_vis[i][:, 0:3] 213 | 214 | target_3d = self.generate_3d_target(joints_3d) 215 | target_3d = torch.from_numpy(target_3d) 216 | 217 | meta = { 218 | 'image': '', 219 | 'num_person': nposes, 220 | 'joints_3d': joints_3d_u, 221 | 'roots_3d': (joints_3d_u[:, 11] + joints_3d_u[:, 12]) / 2.0, 222 | 'joints_3d_vis': joints_3d_vis_u, 223 | 'joints': joints_u, 224 | 'joints_vis': joints_vis_u, 225 | 'center': c, 226 | 'scale': s, 227 | 'rotation': r, 228 | 'camera': cam 229 | } 230 | 231 | return input, target_heatmap, target_weight, target_3d, meta, input_heatmap 232 | 233 | @staticmethod 234 | def compute_human_scale(pose, joints_vis): 235 | idx = joints_vis[:, 0] == 1 236 | if np.sum(idx) == 0: 237 | return 0 238 | minx, maxx = np.min(pose[idx, 0]), np.max(pose[idx, 0]) 239 | miny, maxy = np.min(pose[idx, 1]), np.max(pose[idx, 1]) 240 | return np.clip(np.maximum(maxy - miny, maxx - minx) ** 2, 1.0 / 4 * 96 ** 2, 4 * 96 ** 2) 241 | 242 | def generate_input_heatmap(self, joints, joints_vis): 243 | ''' 244 | :param joints: [[num_joints, 3]] 245 | :param joints_vis: [num_joints, 3] 246 | :return: input_heatmap 247 | ''' 248 | nposes = len(joints) 249 | num_joints = joints[0].shape[0] 250 | target_weight = np.zeros((num_joints, 1), dtype=np.float32) 251 | for i in range(num_joints): 252 | for n in range(nposes): 253 | if joints_vis[n][i, 0] == 1: 254 | target_weight[i, 0] = 1 255 | 256 | assert self.target_type == 'gaussian', \ 257 | 'Only support gaussian map now!' 
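        # Note (added): the branch below renders one Gaussian blob per visible joint of every
        # synthetic person. The blob width follows the person's image-space size (cur_sigma is
        # scaled by compute_human_scale), whole people are occasionally dropped ("obscured"),
        # and the peak value is randomly attenuated -- more aggressively for elbows/knees and
        # wrists/ankles -- so these synthetic input heatmaps imitate the noisy, sometimes-missed
        # responses of a real 2D detector rather than perfect ground truth:
        #   g(x, y) = scale * exp(-((x - x0)^2 + (y - y0)^2) / (2 * cur_sigma^2))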
258 | 259 | if self.target_type == 'gaussian': 260 | target = np.zeros( 261 | (num_joints, self.heatmap_size[1], self.heatmap_size[0]), 262 | dtype=np.float32) 263 | feat_stride = self.image_size / self.heatmap_size 264 | 265 | for n in range(nposes): 266 | obscured = random.random() < 0.05 267 | if obscured: 268 | continue 269 | human_scale = 2 * self.compute_human_scale(joints[n] / feat_stride, joints_vis[n]) 270 | if human_scale == 0: 271 | continue 272 | 273 | cur_sigma = self.sigma * np.sqrt((human_scale / (96.0 * 96.0))) 274 | tmp_size = cur_sigma * 3 275 | for joint_id in range(num_joints): 276 | feat_stride = self.image_size / self.heatmap_size 277 | mu_x = int(joints[n][joint_id][0] / feat_stride[0]) 278 | mu_y = int(joints[n][joint_id][1] / feat_stride[1]) 279 | ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] 280 | br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] 281 | if joints_vis[n][joint_id, 0] == 0 or \ 282 | ul[0] >= self.heatmap_size[0] or \ 283 | ul[1] >= self.heatmap_size[1] \ 284 | or br[0] < 0 or br[1] < 0: 285 | continue 286 | 287 | size = 2 * tmp_size + 1 288 | x = np.arange(0, size, 1, np.float32) 289 | y = x[:, np.newaxis] 290 | x0 = y0 = size // 2 291 | # scale = 1 - np.abs(np.random.randn(1) * 0.25) 292 | scale = 0.9 + np.random.randn(1) * 0.03 if random.random() < 0.6 else 1.0 293 | if joint_id in [7, 8, 13, 14]: 294 | scale = scale * 0.5 if random.random() < 0.1 else scale 295 | elif joint_id in [9, 10, 15, 16]: 296 | scale = scale * 0.2 if random.random() < 0.1 else scale 297 | else: 298 | scale = scale * 0.5 if random.random() < 0.05 else scale 299 | g = np.exp( 300 | -((x - x0) ** 2 + (y - y0) ** 2) / (2 * cur_sigma ** 2)) * scale 301 | 302 | # Usable gaussian range 303 | g_x = max(0, 304 | -ul[0]), min(br[0], self.heatmap_size[0]) - ul[0] 305 | g_y = max(0, 306 | -ul[1]), min(br[1], self.heatmap_size[1]) - ul[1] 307 | # Image range 308 | img_x = max(0, ul[0]), min(br[0], self.heatmap_size[0]) 309 | img_y = max(0, ul[1]), min(br[1], self.heatmap_size[1]) 310 | 311 | target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum( 312 | target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]], 313 | g[g_y[0]:g_y[1], g_x[0]:g_x[1]]) 314 | target = np.clip(target, 0, 1) 315 | 316 | if self.use_different_joints_weight: 317 | target_weight = np.multiply(target_weight, self.joints_weight) 318 | 319 | return target, target_weight 320 | 321 | def generate_3d_target(self, joints_3d): 322 | num_people = len(joints_3d) 323 | 324 | space_size = self.space_size 325 | space_center = self.space_center 326 | cube_size = self.initial_cube_size 327 | grid1Dx = np.linspace(-space_size[0] / 2, space_size[0] / 2, cube_size[0]) + space_center[0] 328 | grid1Dy = np.linspace(-space_size[1] / 2, space_size[1] / 2, cube_size[1]) + space_center[1] 329 | grid1Dz = np.linspace(-space_size[2] / 2, space_size[2] / 2, cube_size[2]) + space_center[2] 330 | 331 | target = np.zeros((cube_size[0], cube_size[1], cube_size[2]), dtype=np.float32) 332 | cur_sigma = 200.0 333 | 334 | for n in range(num_people): 335 | joint_id = [11, 12] # mid-hip 336 | mu_x = (joints_3d[n][joint_id[0]][0] + joints_3d[n][joint_id[1]][0]) / 2.0 337 | mu_y = (joints_3d[n][joint_id[0]][1] + joints_3d[n][joint_id[1]][1]) / 2.0 338 | mu_z = (joints_3d[n][joint_id[0]][2] + joints_3d[n][joint_id[1]][2]) / 2.0 339 | 340 | i_x = [np.searchsorted(grid1Dx, mu_x - 3 * cur_sigma), 341 | np.searchsorted(grid1Dx, mu_x + 3 * cur_sigma, 'right')] 342 | i_y = [np.searchsorted(grid1Dy, mu_y - 3 * cur_sigma), 343 | 
np.searchsorted(grid1Dy, mu_y + 3 * cur_sigma, 'right')] 344 | i_z = [np.searchsorted(grid1Dz, mu_z - 3 * cur_sigma), 345 | np.searchsorted(grid1Dz, mu_z + 3 * cur_sigma, 'right')] 346 | if i_x[0] >= i_x[1] or i_y[0] >= i_y[1] or i_z[0] >= i_z[1]: 347 | continue 348 | 349 | gridx, gridy, gridz = np.meshgrid(grid1Dx[i_x[0]:i_x[1]], grid1Dy[i_y[0]:i_y[1]], grid1Dz[i_z[0]:i_z[1]], 350 | indexing='ij') 351 | g = np.exp(-((gridx - mu_x) ** 2 + (gridy - mu_y) ** 2 + (gridz - mu_z) ** 2) / (2 * cur_sigma ** 2)) 352 | target[i_x[0]:i_x[1], i_y[0]:i_y[1], i_z[0]:i_z[1]] = np.maximum( 353 | target[i_x[0]:i_x[1], i_y[0]:i_y[1], i_z[0]:i_z[1]], g) 354 | 355 | target = np.clip(target, 0, 1) 356 | return target 357 | 358 | def evaluate(self): 359 | pass 360 | 361 | @staticmethod 362 | def get_new_center(center_list): 363 | if len(center_list) == 0 or random.random() < 0.7: 364 | new_center = np.array([np.random.uniform(-1000.0, 2000.0), np.random.uniform(-1600.0, 1600.0)]) 365 | else: 366 | xy = center_list[np.random.choice(range(len(center_list)))] 367 | new_center = xy + np.random.normal(500, 50, 2) * np.random.choice([1, -1], 2) 368 | 369 | return new_center 370 | 371 | @staticmethod 372 | def isvalid(bbox, bbox_list): 373 | if len(bbox_list) == 0: 374 | return True 375 | 376 | bbox_list = np.array(bbox_list) 377 | x0 = np.maximum(bbox[0], bbox_list[:, 0]) 378 | y0 = np.maximum(bbox[1], bbox_list[:, 1]) 379 | x1 = np.minimum(bbox[2], bbox_list[:, 2]) 380 | y1 = np.minimum(bbox[3], bbox_list[:, 3]) 381 | 382 | intersection = np.maximum(0, (x1 - x0) * (y1 - y0)) 383 | area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) 384 | area_list = (bbox_list[:, 2] - bbox_list[:, 0]) * (bbox_list[:, 3] - bbox_list[:, 1]) 385 | iou_list = intersection / (area + area_list - intersection) 386 | 387 | return np.max(iou_list) < 0.01 388 | 389 | @staticmethod 390 | def calc_bbox(pose, pose_vis): 391 | index = pose_vis[:, 0] > 0 392 | bbox = [np.min(pose[index, 0]), np.min(pose[index, 1]), 393 | np.max(pose[index, 0]), np.max(pose[index, 1])] 394 | 395 | return np.array(bbox) 396 | -------------------------------------------------------------------------------- /lib/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import models.pose_resnet 11 | import models.v2v_net 12 | import models.project_layer 13 | import models.cuboid_proposal_net 14 | import models.pose_regression_net 15 | import models.multi_person_posenet 16 | 17 | -------------------------------------------------------------------------------- /lib/models/cuboid_proposal_net.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from models.v2v_net import V2VNet 10 | from models.project_layer import ProjectLayer 11 | from core.proposal import nms 12 | 13 | 14 | class ProposalLayer(nn.Module): 15 | def __init__(self, cfg): 16 | super(ProposalLayer, self).__init__() 17 | self.grid_size = torch.tensor(cfg.MULTI_PERSON.SPACE_SIZE) 18 | self.cube_size = torch.tensor(cfg.MULTI_PERSON.INITIAL_CUBE_SIZE) 19 | self.grid_center = torch.tensor(cfg.MULTI_PERSON.SPACE_CENTER) 20 | self.num_cand = cfg.MULTI_PERSON.MAX_PEOPLE_NUM 21 | self.root_id = cfg.DATASET.ROOTIDX 22 | self.num_joints = cfg.NETWORK.NUM_JOINTS 23 | self.threshold = cfg.MULTI_PERSON.THRESHOLD 24 | 25 | def filter_proposal(self, topk_index, gt_3d, num_person): 26 | batch_size = topk_index.shape[0] 27 | cand_num = topk_index.shape[1] 28 | cand2gt = torch.zeros(batch_size, cand_num) 29 | 30 | for i in range(batch_size): 31 | cand = topk_index[i].reshape(cand_num, 1, -1) 32 | gt = gt_3d[i, :num_person[i]].reshape(1, num_person[i], -1) 33 | 34 | dist = torch.sqrt(torch.sum((cand - gt)**2, dim=-1)) 35 | min_dist, min_gt = torch.min(dist, dim=-1) 36 | 37 | cand2gt[i] = min_gt 38 | cand2gt[i][min_dist > 500.0] = -1.0 39 | 40 | return cand2gt 41 | 42 | def get_real_loc(self, index): 43 | device = index.device 44 | cube_size = self.cube_size.to(device=device, dtype=torch.float) 45 | grid_size = self.grid_size.to(device=device) 46 | grid_center = self.grid_center.to(device=device) 47 | loc = index.float() / (cube_size - 1) * grid_size + grid_center - grid_size / 2.0 48 | return loc 49 | 50 | def forward(self, root_cubes, meta): 51 | batch_size = root_cubes.shape[0] 52 | 53 | topk_values, topk_unravel_index = nms(root_cubes.detach(), self.num_cand) 54 | topk_unravel_index = self.get_real_loc(topk_unravel_index) 55 | 56 | grid_centers = torch.zeros(batch_size, self.num_cand, 5, device=root_cubes.device) 57 | grid_centers[:, :, 0:3] = topk_unravel_index 58 | grid_centers[:, :, 4] = topk_values 59 | 60 | # match gt to filter those invalid proposals for training/validate PRN 61 | if self.training and ('roots_3d' in meta[0] and 'num_person' in meta[0]): 62 | gt_3d = meta[0]['roots_3d'].float() 63 | num_person = meta[0]['num_person'] 64 | cand2gt = self.filter_proposal(topk_unravel_index, gt_3d, num_person) 65 | grid_centers[:, :, 3] = cand2gt 66 | else: 67 | grid_centers[:, :, 3] = (topk_values > self.threshold).float() - 1.0 # if ground-truths are not available. 
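        # Note (added): each row of grid_centers packs one root proposal as
        #   [0:3] -> 3D root location in world coordinates (mm),
        #   [3]   -> matched ground-truth person index during training, or a 0 / -1 validity
        #            flag at inference (thresholded 3D heatmap score, see above),
        #   [4]   -> proposal confidence (the NMS'ed 3D heatmap value).
        # Downstream, PoseRegressionNet only regresses poses for rows with
        # grid_centers[:, 3] >= 0; negative values mark discarded proposals.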
68 | 69 | # nms 70 | # for b in range(batch_size): 71 | # centers = copy.deepcopy(topk_unravel_index[b, :, :3]) 72 | # scores = copy.deepcopy(topk_values[b]) 73 | # keep = [] 74 | # keep_s = [] 75 | # while len(centers): 76 | # keep.append(centers[0]) 77 | # keep_s.append(scores[0]) 78 | # dist = torch.sqrt(torch.sum((centers[0] - centers)**2, dim=-1)) 79 | # index = (dist > 500.0) & (scores > 0.1) 80 | # centers = centers[index] 81 | # scores = scores[index] 82 | # grid_centers[b, :len(keep), :3] = torch.stack(keep, dim=0) 83 | # grid_centers[b, :len(keep), 3] = 0.0 84 | # grid_centers[b, :len(keep), 4] = torch.stack(keep_s, dim=0) 85 | 86 | return grid_centers 87 | 88 | 89 | class CuboidProposalNet(nn.Module): 90 | def __init__(self, cfg): 91 | super(CuboidProposalNet, self).__init__() 92 | self.grid_size = cfg.MULTI_PERSON.SPACE_SIZE 93 | self.cube_size = cfg.MULTI_PERSON.INITIAL_CUBE_SIZE 94 | self.grid_center = cfg.MULTI_PERSON.SPACE_CENTER 95 | 96 | self.project_layer = ProjectLayer(cfg) 97 | self.v2v_net = V2VNet(cfg.NETWORK.NUM_JOINTS, 1) 98 | self.proposal_layer = ProposalLayer(cfg) 99 | 100 | def forward(self, all_heatmaps, meta): 101 | 102 | initial_cubes, grids = self.project_layer(all_heatmaps, meta, 103 | self.grid_size, [self.grid_center], self.cube_size) 104 | root_cubes = self.v2v_net(initial_cubes) 105 | root_cubes = root_cubes.squeeze(1) 106 | grid_centers = self.proposal_layer(root_cubes, meta) 107 | 108 | return root_cubes, grid_centers -------------------------------------------------------------------------------- /lib/models/multi_person_posenet.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import torch 11 | import torch.nn as nn 12 | 13 | from models import pose_resnet 14 | from models.cuboid_proposal_net import CuboidProposalNet 15 | from models.pose_regression_net import PoseRegressionNet 16 | from core.loss import PerJointMSELoss 17 | from core.loss import PerJointL1Loss 18 | 19 | 20 | class MultiPersonPoseNet(nn.Module): 21 | def __init__(self, backbone, cfg): 22 | super(MultiPersonPoseNet, self).__init__() 23 | self.num_cand = cfg.MULTI_PERSON.MAX_PEOPLE_NUM 24 | self.num_joints = cfg.NETWORK.NUM_JOINTS 25 | 26 | self.backbone = backbone 27 | self.root_net = CuboidProposalNet(cfg) 28 | self.pose_net = PoseRegressionNet(cfg) 29 | 30 | self.USE_GT = cfg.NETWORK.USE_GT 31 | self.root_id = cfg.DATASET.ROOTIDX 32 | self.dataset_name = cfg.DATASET.TEST_DATASET 33 | 34 | def forward(self, views=None, meta=None, targets_2d=None, weights_2d=None, targets_3d=None, input_heatmaps=None): 35 | if views is not None: 36 | all_heatmaps = [] 37 | for view in views: 38 | heatmaps = self.backbone(view) 39 | all_heatmaps.append(heatmaps) 40 | else: 41 | all_heatmaps = input_heatmaps 42 | 43 | # all_heatmaps = targets_2d 44 | device = all_heatmaps[0].device 45 | batch_size = all_heatmaps[0].shape[0] 46 | 47 | # calculate 2D heatmap loss 48 | criterion = PerJointMSELoss().cuda() 49 | loss_2d = criterion(torch.zeros(1, device=device), torch.zeros(1, device=device)) 50 | if targets_2d is not None: 51 | for t, w, o in zip(targets_2d, weights_2d, all_heatmaps): 52 | loss_2d += criterion(o, t, True, w) 53 | loss_2d /= len(all_heatmaps) 54 | 55 | loss_3d = criterion(torch.zeros(1, device=device), torch.zeros(1, device=device)) 56 | if self.USE_GT: 57 | num_person = meta[0]['num_person'] 58 | grid_centers = torch.zeros(batch_size, self.num_cand, 5, device=device) 59 | grid_centers[:, :, 0:3] = meta[0]['roots_3d'].float() 60 | grid_centers[:, :, 3] = -1.0 61 | for i in range(batch_size): 62 | grid_centers[i, :num_person[i], 3] = torch.tensor(range(num_person[i]), device=device) 63 | grid_centers[i, :num_person[i], 4] = 1.0 64 | else: 65 | root_cubes, grid_centers = self.root_net(all_heatmaps, meta) 66 | 67 | # calculate 3D heatmap loss 68 | if targets_3d is not None: 69 | loss_3d = criterion(root_cubes, targets_3d) 70 | del root_cubes 71 | 72 | pred = torch.zeros(batch_size, self.num_cand, self.num_joints, 5, device=device) 73 | pred[:, :, :, 3:] = grid_centers[:, :, 3:].reshape(batch_size, -1, 1, 2) # matched gt 74 | 75 | loss_cord = criterion(torch.zeros(1, device=device), torch.zeros(1, device=device)) 76 | criterion_cord = PerJointL1Loss().cuda() 77 | count = 0 78 | 79 | for n in range(self.num_cand): 80 | index = (pred[:, n, 0, 3] >= 0) 81 | if torch.sum(index) > 0: 82 | single_pose = self.pose_net(all_heatmaps, meta, grid_centers[:, n]) 83 | pred[:, n, :, 0:3] = single_pose.detach() 84 | 85 | # calculate 3D pose loss 86 | if self.training and 'joints_3d' in meta[0] and 'joints_3d_vis' in meta[0]: 87 | gt_3d = meta[0]['joints_3d'].float() 88 | for i in range(batch_size): 89 | if pred[i, n, 0, 3] >= 0: 90 | targets = gt_3d[i:i + 1, pred[i, n, 0, 3].long()] 91 | weights_3d = meta[0]['joints_3d_vis'][i:i + 1, pred[i, n, 0, 3].long(), :, 0:1].float() 92 | count += 1 93 | loss_cord = (loss_cord * (count - 1) + 94 | criterion_cord(single_pose[i:i + 1], targets, True, weights_3d)) / 
count 95 | del single_pose 96 | 97 | return pred, all_heatmaps, grid_centers, loss_2d, loss_3d, loss_cord 98 | 99 | 100 | def get_multi_person_pose_net(cfg, is_train=True): 101 | if cfg.BACKBONE_MODEL: 102 | backbone = eval(cfg.BACKBONE_MODEL + '.get_pose_net')(cfg, is_train=is_train) 103 | else: 104 | backbone = None 105 | model = MultiPersonPoseNet(backbone, cfg) 106 | return model 107 | -------------------------------------------------------------------------------- /lib/models/pose_regression_net.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | from models.v2v_net import V2VNet 11 | from models.project_layer import ProjectLayer 12 | 13 | 14 | class SoftArgmaxLayer(nn.Module): 15 | def __init__(self, cfg): 16 | super(SoftArgmaxLayer, self).__init__() 17 | self.beta = cfg.NETWORK.BETA 18 | 19 | def forward(self, x, grids): 20 | batch_size = x.size(0) 21 | channel = x.size(1) 22 | x = x.reshape(batch_size, channel, -1, 1) 23 | # x = F.softmax(x, dim=2) 24 | x = F.softmax(self.beta * x, dim=2) 25 | grids = grids.unsqueeze(1) 26 | x = torch.mul(x, grids) 27 | x = torch.sum(x, dim=2) 28 | return x 29 | 30 | 31 | class PoseRegressionNet(nn.Module): 32 | def __init__(self, cfg): 33 | super(PoseRegressionNet, self).__init__() 34 | self.grid_size = cfg.PICT_STRUCT.GRID_SIZE 35 | self.cube_size = cfg.PICT_STRUCT.CUBE_SIZE 36 | 37 | self.project_layer = ProjectLayer(cfg) 38 | self.v2v_net = V2VNet(cfg.NETWORK.NUM_JOINTS, cfg.NETWORK.NUM_JOINTS) 39 | self.soft_argmax_layer = SoftArgmaxLayer(cfg) 40 | 41 | def forward(self, all_heatmaps, meta, grid_centers): 42 | batch_size = all_heatmaps[0].shape[0] 43 | num_joints = all_heatmaps[0].shape[1] 44 | device = all_heatmaps[0].device 45 | pred = torch.zeros(batch_size, num_joints, 3, device=device) 46 | cubes, grids = self.project_layer(all_heatmaps, meta, 47 | self.grid_size, grid_centers, self.cube_size) 48 | 49 | index = grid_centers[:, 3] >= 0 50 | valid_cubes = self.v2v_net(cubes[index]) 51 | pred[index] = self.soft_argmax_layer(valid_cubes, grids[index]) 52 | 53 | return pred 54 | -------------------------------------------------------------------------------- /lib/models/pose_resnet.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os 11 | import logging 12 | 13 | import torch 14 | import torch.nn as nn 15 | 16 | 17 | BN_MOMENTUM = 0.1 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def conv3x3(in_planes, out_planes, stride=1): 22 | """3x3 convolution with padding""" 23 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 24 | padding=1, bias=False) 25 | 26 | 27 | class BasicBlock(nn.Module): 28 | expansion = 1 29 | 30 | def __init__(self, inplanes, planes, stride=1, downsample=None): 31 | super(BasicBlock, self).__init__() 32 | self.conv1 = conv3x3(inplanes, planes, stride) 33 | self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 34 | self.relu = nn.ReLU(inplace=True) 35 | self.conv2 = conv3x3(planes, planes) 36 | self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 37 | self.downsample = downsample 38 | self.stride = stride 39 | 40 | def forward(self, x): 41 | residual = x 42 | 43 | out = self.conv1(x) 44 | out = self.bn1(out) 45 | out = self.relu(out) 46 | 47 | out = self.conv2(out) 48 | out = self.bn2(out) 49 | 50 | if self.downsample is not None: 51 | residual = self.downsample(x) 52 | 53 | out += residual 54 | out = self.relu(out) 55 | 56 | return out 57 | 58 | 59 | class Bottleneck(nn.Module): 60 | expansion = 4 61 | 62 | def __init__(self, inplanes, planes, stride=1, downsample=None): 63 | super(Bottleneck, self).__init__() 64 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 65 | self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 66 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 67 | padding=1, bias=False) 68 | self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 69 | self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, 70 | bias=False) 71 | self.bn3 = nn.BatchNorm2d(planes * self.expansion, 72 | momentum=BN_MOMENTUM) 73 | self.relu = nn.ReLU(inplace=True) 74 | self.downsample = downsample 75 | self.stride = stride 76 | 77 | def forward(self, x): 78 | residual = x 79 | 80 | out = self.conv1(x) 81 | out = self.bn1(out) 82 | out = self.relu(out) 83 | 84 | out = self.conv2(out) 85 | out = self.bn2(out) 86 | out = self.relu(out) 87 | 88 | out = self.conv3(out) 89 | out = self.bn3(out) 90 | 91 | if self.downsample is not None: 92 | residual = self.downsample(x) 93 | 94 | out += residual 95 | out = self.relu(out) 96 | 97 | return out 98 | 99 | 100 | class PoseResNet(nn.Module): 101 | 102 | def __init__(self, block, layers, cfg, **kwargs): 103 | self.inplanes = 64 104 | self.deconv_with_bias = cfg.POSE_RESNET.DECONV_WITH_BIAS 105 | 106 | super(PoseResNet, self).__init__() 107 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 108 | bias=False) 109 | self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) 110 | self.relu = nn.ReLU(inplace=True) 111 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 112 | self.layer1 = self._make_layer(block, 64, layers[0]) 113 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 114 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 115 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 116 | 117 | # used for deconv layers 118 | self.deconv_layers = self._make_deconv_layer( 119 | cfg.POSE_RESNET.NUM_DECONV_LAYERS, 120 | cfg.POSE_RESNET.NUM_DECONV_FILTERS, 121 | 
cfg.POSE_RESNET.NUM_DECONV_KERNELS, 122 | ) 123 | 124 | self.final_layer = nn.Conv2d( 125 | in_channels=cfg.POSE_RESNET.NUM_DECONV_FILTERS[-1], 126 | out_channels=cfg.NETWORK.NUM_JOINTS, 127 | kernel_size=cfg.POSE_RESNET.FINAL_CONV_KERNEL, 128 | stride=1, 129 | padding=1 if cfg.POSE_RESNET.FINAL_CONV_KERNEL == 3 else 0 130 | ) 131 | 132 | def _make_layer(self, block, planes, blocks, stride=1): 133 | downsample = None 134 | if stride != 1 or self.inplanes != planes * block.expansion: 135 | downsample = nn.Sequential( 136 | nn.Conv2d(self.inplanes, planes * block.expansion, 137 | kernel_size=1, stride=stride, bias=False), 138 | nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), 139 | ) 140 | 141 | layers = [] 142 | layers.append(block(self.inplanes, planes, stride, downsample)) 143 | self.inplanes = planes * block.expansion 144 | for i in range(1, blocks): 145 | layers.append(block(self.inplanes, planes)) 146 | 147 | return nn.Sequential(*layers) 148 | 149 | def _get_deconv_cfg(self, deconv_kernel, index): 150 | if deconv_kernel == 4: 151 | padding = 1 152 | output_padding = 0 153 | elif deconv_kernel == 3: 154 | padding = 1 155 | output_padding = 1 156 | elif deconv_kernel == 2: 157 | padding = 0 158 | output_padding = 0 159 | 160 | return deconv_kernel, padding, output_padding 161 | 162 | def _make_deconv_layer(self, num_layers, num_filters, num_kernels): 163 | assert num_layers == len(num_filters), \ 164 | 'ERROR: num_deconv_layers is different len(num_deconv_filters)' 165 | assert num_layers == len(num_kernels), \ 166 | 'ERROR: num_deconv_layers is different len(num_deconv_filters)' 167 | 168 | layers = [] 169 | for i in range(num_layers): 170 | kernel, padding, output_padding = \ 171 | self._get_deconv_cfg(num_kernels[i], i) 172 | 173 | planes = num_filters[i] 174 | layers.append( 175 | nn.ConvTranspose2d( 176 | in_channels=self.inplanes, 177 | out_channels=planes, 178 | kernel_size=kernel, 179 | stride=2, 180 | padding=padding, 181 | output_padding=output_padding, 182 | bias=self.deconv_with_bias)) 183 | layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) 184 | layers.append(nn.ReLU(inplace=True)) 185 | self.inplanes = planes 186 | 187 | return nn.Sequential(*layers) 188 | 189 | def forward(self, x): 190 | x = self.conv1(x) 191 | x = self.bn1(x) 192 | x = self.relu(x) 193 | x = self.maxpool(x) 194 | 195 | x = self.layer1(x) 196 | x = self.layer2(x) 197 | x = self.layer3(x) 198 | x = self.layer4(x) 199 | 200 | x = self.deconv_layers(x) 201 | x = self.final_layer(x) 202 | 203 | return x 204 | 205 | def init_weights(self, pretrained=''): 206 | this_dir = os.path.dirname(__file__) 207 | pretrained = os.path.join(this_dir, '../..', pretrained) 208 | if os.path.isfile(pretrained): 209 | pretrained_state_dict = torch.load(pretrained) 210 | logger.info('=> loading pretrained models {}'.format(pretrained)) 211 | 212 | model_state_dict = self.state_dict() 213 | for k, v in pretrained_state_dict.items(): 214 | if "final_layer" in k: 215 | pretrained_state_dict[k] = torch.zeros_like(model_state_dict[k]) 216 | self.load_state_dict(pretrained_state_dict, strict=False) 217 | 218 | logger.info('=> init deconv weights from normal distribution') 219 | for name, m in self.deconv_layers.named_modules(): 220 | if isinstance(m, nn.ConvTranspose2d): 221 | logger.info('=> init {}.weight as normal(0, 0.001)'.format(name)) 222 | logger.info('=> init {}.bias as 0'.format(name)) 223 | nn.init.normal_(m.weight, std=0.001) 224 | if self.deconv_with_bias: 225 | nn.init.constant_(m.bias, 0) 226 | 
elif isinstance(m, nn.BatchNorm2d): 227 | logger.info('=> init {}.weight as 1'.format(name)) 228 | logger.info('=> init {}.bias as 0'.format(name)) 229 | nn.init.constant_(m.weight, 1) 230 | nn.init.constant_(m.bias, 0) 231 | logger.info('=> init final conv weights from normal distribution') 232 | for m in self.final_layer.modules(): 233 | if isinstance(m, nn.Conv2d): 234 | # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 235 | logger.info('=> init {}.weight as normal(0, 0.001)'.format(name)) 236 | logger.info('=> init {}.bias as 0'.format(name)) 237 | nn.init.normal_(m.weight, std=0.001) 238 | nn.init.constant_(m.bias, 0) 239 | else: 240 | logger.info('=> init weights from normal distribution') 241 | for m in self.modules(): 242 | if isinstance(m, nn.Conv2d): 243 | # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 244 | nn.init.normal_(m.weight, std=0.001) 245 | # nn.init.constant_(m.bias, 0) 246 | elif isinstance(m, nn.BatchNorm2d): 247 | nn.init.constant_(m.weight, 1) 248 | nn.init.constant_(m.bias, 0) 249 | elif isinstance(m, nn.ConvTranspose2d): 250 | nn.init.normal_(m.weight, std=0.001) 251 | if self.deconv_with_bias: 252 | nn.init.constant_(m.bias, 0) 253 | 254 | 255 | resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), 256 | 34: (BasicBlock, [3, 4, 6, 3]), 257 | 50: (Bottleneck, [3, 4, 6, 3]), 258 | 101: (Bottleneck, [3, 4, 23, 3]), 259 | 152: (Bottleneck, [3, 8, 36, 3])} 260 | 261 | 262 | def get_pose_net(cfg, is_train, **kwargs): 263 | num_layers = cfg.POSE_RESNET.NUM_LAYERS 264 | 265 | block_class, layers = resnet_spec[num_layers] 266 | 267 | model = PoseResNet(block_class, layers, cfg, **kwargs) 268 | 269 | if is_train: 270 | model.init_weights(cfg.NETWORK.PRETRAINED) 271 | 272 | return model 273 | -------------------------------------------------------------------------------- /lib/models/project_layer.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | import utils.cameras as cameras 11 | from utils.transforms import get_affine_transform as get_transform 12 | from utils.transforms import affine_transform_pts_cuda as do_transform 13 | 14 | 15 | class ProjectLayer(nn.Module): 16 | def __init__(self, cfg): 17 | super(ProjectLayer, self).__init__() 18 | 19 | self.img_size = cfg.NETWORK.IMAGE_SIZE 20 | self.heatmap_size = cfg.NETWORK.HEATMAP_SIZE 21 | self.grid_size = cfg.MULTI_PERSON.SPACE_SIZE 22 | self.cube_size = cfg.MULTI_PERSON.INITIAL_CUBE_SIZE 23 | self.grid_center = cfg.MULTI_PERSON.SPACE_CENTER 24 | 25 | def compute_grid(self, boxSize, boxCenter, nBins, device=None): 26 | if isinstance(boxSize, int) or isinstance(boxSize, float): 27 | boxSize = [boxSize, boxSize, boxSize] 28 | if isinstance(nBins, int): 29 | nBins = [nBins, nBins, nBins] 30 | 31 | grid1Dx = torch.linspace(-boxSize[0] / 2, boxSize[0] / 2, nBins[0], device=device) 32 | grid1Dy = torch.linspace(-boxSize[1] / 2, boxSize[1] / 2, nBins[1], device=device) 33 | grid1Dz = torch.linspace(-boxSize[2] / 2, boxSize[2] / 2, nBins[2], device=device) 34 | gridx, gridy, gridz = torch.meshgrid( 35 | grid1Dx + boxCenter[0], 36 | grid1Dy + boxCenter[1], 37 | grid1Dz + boxCenter[2], 38 | ) 39 | gridx = gridx.contiguous().view(-1, 1) 40 | gridy = gridy.contiguous().view(-1, 1) 41 | gridz = gridz.contiguous().view(-1, 1) 42 | grid = torch.cat([gridx, gridy, gridz], dim=1) 43 | return grid 44 | 45 | def get_voxel(self, heatmaps, meta, grid_size, grid_center, cube_size): 46 | device = heatmaps[0].device 47 | batch_size = heatmaps[0].shape[0] 48 | num_joints = heatmaps[0].shape[1] 49 | nbins = cube_size[0] * cube_size[1] * cube_size[2] 50 | n = len(heatmaps) 51 | cubes = torch.zeros(batch_size, num_joints, 1, nbins, n, device=device) 52 | # h, w = heatmaps[0].shape[2], heatmaps[0].shape[3] 53 | w, h = self.heatmap_size 54 | grids = torch.zeros(batch_size, nbins, 3, device=device) 55 | bounding = torch.zeros(batch_size, 1, 1, nbins, n, device=device) 56 | for i in range(batch_size): 57 | if len(grid_center[0]) == 3 or grid_center[i][3] >= 0: 58 | # This part of the code can be optimized because the projection operation is time-consuming. 59 | # If the camera locations always keep the same, the grids and sample_grids are repeated across frames 60 | # and can be computed only one time. 
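                # A possible caching sketch (added; not part of the original code): the grid
                # depends only on grid_size, grid_center and cube_size, so for a fixed-camera,
                # fixed-space setup one could memoize it in a hypothetical attribute, e.g.
                #     key = (tuple(float(x) for x in grid_center[i][:3]), tuple(cube_size))
                #     cache = getattr(self, '_grid_cache', {})
                #     if key not in cache:
                #         cache[key] = self.compute_grid(grid_size, grid_center[i], cube_size, device=device)
                #         self._grid_cache = cache
                #     grid = cache[key]
                # The same idea would extend to sample_grid when the camera parameters never change.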
61 | if len(grid_center) == 1: 62 | grid = self.compute_grid(grid_size, grid_center[0], cube_size, device=device) 63 | else: 64 | grid = self.compute_grid(grid_size, grid_center[i], cube_size, device=device) 65 | grids[i:i + 1] = grid 66 | for c in range(n): 67 | center = meta[c]['center'][i] 68 | scale = meta[c]['scale'][i] 69 | 70 | width, height = center * 2 71 | trans = torch.as_tensor( 72 | get_transform(center, scale, 0, self.img_size), 73 | dtype=torch.float, 74 | device=device) 75 | cam = {} 76 | for k, v in meta[c]['camera'].items(): 77 | cam[k] = v[i] 78 | xy = cameras.project_pose(grid, cam) 79 | 80 | bounding[i, 0, 0, :, c] = (xy[:, 0] >= 0) & (xy[:, 1] >= 0) & (xy[:, 0] < width) & ( 81 | xy[:, 1] < height) 82 | xy = torch.clamp(xy, -1.0, max(width, height)) 83 | xy = do_transform(xy, trans) 84 | xy = xy * torch.tensor( 85 | [w, h], dtype=torch.float, device=device) / torch.tensor( 86 | self.img_size, dtype=torch.float, device=device) 87 | sample_grid = xy / torch.tensor( 88 | [w - 1, h - 1], dtype=torch.float, 89 | device=device) * 2.0 - 1.0 90 | sample_grid = torch.clamp(sample_grid.view(1, 1, nbins, 2), -1.1, 1.1) 91 | 92 | # if pytorch version < 1.3.0, align_corners=True should be omitted. 93 | cubes[i:i + 1, :, :, :, c] += F.grid_sample(heatmaps[c][i:i + 1, :, :, :], sample_grid, align_corners=True) 94 | 95 | # cubes = cubes.mean(dim=-1) 96 | cubes = torch.sum(torch.mul(cubes, bounding), dim=-1) / (torch.sum(bounding, dim=-1) + 1e-6) 97 | cubes[cubes != cubes] = 0.0 98 | cubes = cubes.clamp(0.0, 1.0) 99 | 100 | cubes = cubes.view(batch_size, num_joints, cube_size[0], cube_size[1], cube_size[2]) ## 101 | return cubes, grids 102 | 103 | def forward(self, heatmaps, meta, grid_size, grid_center, cube_size): 104 | cubes, grids = self.get_voxel(heatmaps, meta, grid_size, grid_center, cube_size) 105 | return cubes, grids -------------------------------------------------------------------------------- /lib/models/v2v_net.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class Basic3DBlock(nn.Module): 11 | def __init__(self, in_planes, out_planes, kernel_size): 12 | super(Basic3DBlock, self).__init__() 13 | self.block = nn.Sequential( 14 | nn.Conv3d(in_planes, out_planes, kernel_size=kernel_size, stride=1, padding=((kernel_size-1)//2)), 15 | nn.BatchNorm3d(out_planes), 16 | nn.ReLU(True) 17 | ) 18 | 19 | def forward(self, x): 20 | return self.block(x) 21 | 22 | 23 | class Res3DBlock(nn.Module): 24 | def __init__(self, in_planes, out_planes): 25 | super(Res3DBlock, self).__init__() 26 | self.res_branch = nn.Sequential( 27 | nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=1, padding=1), 28 | nn.BatchNorm3d(out_planes), 29 | nn.ReLU(True), 30 | nn.Conv3d(out_planes, out_planes, kernel_size=3, stride=1, padding=1), 31 | nn.BatchNorm3d(out_planes) 32 | ) 33 | 34 | if in_planes == out_planes: 35 | self.skip_con = nn.Sequential() 36 | else: 37 | self.skip_con = nn.Sequential( 38 | nn.Conv3d(in_planes, out_planes, kernel_size=1, stride=1, padding=0), 39 | nn.BatchNorm3d(out_planes) 40 | ) 41 | 42 | def forward(self, x): 43 | res = self.res_branch(x) 44 | skip = self.skip_con(x) 45 | return F.relu(res + skip, True) 46 | 47 | 48 | class Pool3DBlock(nn.Module): 49 | def __init__(self, pool_size): 50 | super(Pool3DBlock, self).__init__() 51 | self.pool_size = pool_size 52 | 53 | def forward(self, x): 54 | return F.max_pool3d(x, kernel_size=self.pool_size, stride=self.pool_size) 55 | 56 | 57 | class Upsample3DBlock(nn.Module): 58 | def __init__(self, in_planes, out_planes, kernel_size, stride): 59 | super(Upsample3DBlock, self).__init__() 60 | assert(kernel_size == 2) 61 | assert(stride == 2) 62 | self.block = nn.Sequential( 63 | nn.ConvTranspose3d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=0, output_padding=0), 64 | nn.BatchNorm3d(out_planes), 65 | nn.ReLU(True) 66 | ) 67 | 68 | def forward(self, x): 69 | return self.block(x) 70 | 71 | 72 | class EncoderDecorder(nn.Module): 73 | def __init__(self): 74 | super(EncoderDecorder, self).__init__() 75 | 76 | self.encoder_pool1 = Pool3DBlock(2) 77 | self.encoder_res1 = Res3DBlock(32, 64) 78 | self.encoder_pool2 = Pool3DBlock(2) 79 | self.encoder_res2 = Res3DBlock(64, 128) 80 | 81 | self.mid_res = Res3DBlock(128, 128) 82 | 83 | self.decoder_res2 = Res3DBlock(128, 128) 84 | self.decoder_upsample2 = Upsample3DBlock(128, 64, 2, 2) 85 | self.decoder_res1 = Res3DBlock(64, 64) 86 | self.decoder_upsample1 = Upsample3DBlock(64, 32, 2, 2) 87 | 88 | self.skip_res1 = Res3DBlock(32, 32) 89 | self.skip_res2 = Res3DBlock(64, 64) 90 | 91 | def forward(self, x): 92 | skip_x1 = self.skip_res1(x) 93 | x = self.encoder_pool1(x) 94 | x = self.encoder_res1(x) 95 | 96 | skip_x2 = self.skip_res2(x) 97 | x = self.encoder_pool2(x) 98 | x = self.encoder_res2(x) 99 | 100 | x = self.mid_res(x) 101 | 102 | x = self.decoder_res2(x) 103 | x = self.decoder_upsample2(x) 104 | x = x + skip_x2 105 | 106 | x = self.decoder_res1(x) 107 | x = self.decoder_upsample1(x) 108 | x = x + skip_x1 109 | 110 | return x 111 | 112 | 113 | class V2VNet(nn.Module): 114 | def __init__(self, input_channels, output_channels): 115 | super(V2VNet, self).__init__() 116 | 117 | self.front_layers = nn.Sequential( 118 | Basic3DBlock(input_channels, 16, 7), 119 | Res3DBlock(16, 32), 120 | ) 121 | 122 | self.encoder_decoder = EncoderDecorder() 123 | 124 | self.output_layer = nn.Conv3d(32, 
output_channels, kernel_size=1, stride=1, padding=0) 125 | 126 | self._initialize_weights() 127 | 128 | def forward(self, x): 129 | x = self.front_layers(x) 130 | x = self.encoder_decoder(x) 131 | x = self.output_layer(x) 132 | 133 | return x 134 | 135 | def _initialize_weights(self): 136 | for m in self.modules(): 137 | if isinstance(m, nn.Conv3d): 138 | # nn.init.xavier_normal_(m.weight) 139 | nn.init.normal_(m.weight, 0, 0.001) 140 | nn.init.constant_(m.bias, 0) 141 | elif isinstance(m, nn.ConvTranspose3d): 142 | # nn.init.xavier_normal_(m.weight) 143 | nn.init.normal_(m.weight, 0, 0.001) 144 | nn.init.constant_(m.bias, 0) 145 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/voxelpose-pytorch/9ef5d407a597c9647b2c8f6c0a246b725a87a054/lib/utils/__init__.py -------------------------------------------------------------------------------- /lib/utils/cameras.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import division 7 | import torch 8 | import numpy as np 9 | 10 | 11 | def unfold_camera_param(camera, device=None): 12 | R = torch.as_tensor(camera['R'], dtype=torch.float, device=device) 13 | T = torch.as_tensor(camera['T'], dtype=torch.float, device=device) 14 | fx = torch.as_tensor(camera['fx'], dtype=torch.float, device=device) 15 | fy = torch.as_tensor(camera['fy'], dtype=torch.float, device=device) 16 | f = torch.tensor([fx, fy], dtype=torch.float, device=device).reshape(2, 1) 17 | c = torch.as_tensor( 18 | [[camera['cx']], [camera['cy']]], 19 | dtype=torch.float, 20 | device=device) 21 | k = torch.as_tensor(camera['k'], dtype=torch.float, device=device) 22 | p = torch.as_tensor(camera['p'], dtype=torch.float, device=device) 23 | return R, T, f, c, k, p 24 | 25 | 26 | def project_point_radial(x, R, T, f, c, k, p): 27 | """ 28 | Args 29 | x: Nx3 points in world coordinates 30 | R: 3x3 Camera rotation matrix 31 | T: 3x1 Camera translation parameters 32 | f: (scalar) Camera focal length 33 | c: 2x1 Camera center 34 | k: 3x1 Camera radial distortion coefficients 35 | p: 2x1 Camera tangential distortion coefficients 36 | Returns 37 | ypixel.T: Nx2 points in pixel space 38 | """ 39 | n = x.shape[0] 40 | xcam = torch.mm(R, torch.t(x) - T) 41 | y = xcam[:2] / (xcam[2] + 1e-5) 42 | 43 | kexp = k.repeat((1, n)) 44 | r2 = torch.sum(y**2, 0, keepdim=True) 45 | r2exp = torch.cat([r2, r2**2, r2**3], 0) 46 | radial = 1 + torch.einsum('ij,ij->j', kexp, r2exp) 47 | 48 | tan = p[0] * y[1] + p[1] * y[0] 49 | corr = (radial + 2 * tan).repeat((2, 1)) 50 | 51 | y = y * corr + torch.ger(torch.cat([p[1], p[0]]).view(-1), r2.view(-1)) 52 | ypixel = (f * y) + c 53 | return torch.t(ypixel) 54 | 55 | 56 | def project_pose(x, camera): 57 | R, T, f, c, k, p = unfold_camera_param(camera, device=x.device) 58 | return project_point_radial(x, R, T, f, c, k, p) 59 | 60 | 61 | def world_to_camera_frame(x, R, T): 62 | """ 63 | Args 64 | x: Nx3 3d points in world coordinates 65 | R: 3x3 Camera rotation matrix 66 | T: 3x1 Camera translation parameters 67 | Returns 68 | xcam: Nx3 3d points in camera coordinates 69 | 
""" 70 | 71 | R = torch.as_tensor(R, device=x.device) 72 | T = torch.as_tensor(T, device=x.device) 73 | xcam = torch.mm(R, torch.t(x) - T) 74 | return torch.t(xcam) 75 | 76 | 77 | def camera_to_world_frame(x, R, T): 78 | """ 79 | Args 80 | x: Nx3 points in camera coordinates 81 | R: 3x3 Camera rotation matrix 82 | T: 3x1 Camera translation parameters 83 | Returns 84 | xcam: Nx3 points in world coordinates 85 | """ 86 | 87 | R = torch.as_tensor(R, device=x.device) 88 | T = torch.as_tensor(T, device=x.device) 89 | xcam = torch.mm(torch.t(R), torch.t(x)) 90 | xcam = xcam + T # rotate and translate 91 | return torch.t(xcam) 92 | -------------------------------------------------------------------------------- /lib/utils/cameras_cpu.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import division 7 | import numpy as np 8 | 9 | 10 | def unfold_camera_param(camera): 11 | R = camera['R'] 12 | T = camera['T'] 13 | fx = camera['fx'] 14 | fy = camera['fy'] 15 | # f = 0.5 * (camera['fx'] + camera['fy']) 16 | f = np.array([[fx], [fy]]).reshape(-1, 1) 17 | c = np.array([[camera['cx']], [camera['cy']]]).reshape(-1, 1) 18 | k = camera['k'] 19 | p = camera['p'] 20 | return R, T, f, c, k, p 21 | 22 | 23 | def project_point_radial(x, R, T, f, c, k, p): 24 | """ 25 | Args 26 | x: Nx3 points in world coordinates 27 | R: 3x3 Camera rotation matrix 28 | T: 3x1 Camera translation parameters 29 | f: (scalar) Camera focal length 30 | c: 2x1 Camera center 31 | k: 3x1 Camera radial distortion coefficients 32 | p: 2x1 Camera tangential distortion coefficients 33 | Returns 34 | ypixel.T: Nx2 points in pixel space 35 | """ 36 | n = x.shape[0] 37 | xcam = R.dot(x.T - T) 38 | y = xcam[:2] / (xcam[2]+1e-5) 39 | # print(xcam[2]) 40 | 41 | r2 = np.sum(y**2, axis=0) 42 | radial = 1 + np.einsum('ij,ij->j', np.tile(k, (1, n)), 43 | np.array([r2, r2**2, r2**3])) 44 | tan = p[0] * y[1] + p[1] * y[0] 45 | y = y * np.tile(radial + 2 * tan, 46 | (2, 1)) + np.outer(np.array([p[1], p[0]]).reshape(-1), r2) 47 | ypixel = np.multiply(f, y) + c 48 | return ypixel.T 49 | 50 | 51 | def project_pose(x, camera): 52 | R, T, f, c, k, p = unfold_camera_param(camera) 53 | return project_point_radial(x, R, T, f, c, k, p) 54 | 55 | 56 | def world_to_camera_frame(x, R, T): 57 | """ 58 | Args 59 | x: Nx3 3d points in world coordinates 60 | R: 3x3 Camera rotation matrix 61 | T: 3x1 Camera translation parameters 62 | Returns 63 | xcam: Nx3 3d points in camera coordinates 64 | """ 65 | 66 | xcam = R.dot(x.T - T) # rotate and translate 67 | return xcam.T 68 | 69 | 70 | def camera_to_world_frame(x, R, T): 71 | """ 72 | Args 73 | x: Nx3 points in camera coordinates 74 | R: 3x3 Camera rotation matrix 75 | T: 3x1 Camera translation parameters 76 | Returns 77 | xcam: Nx3 points in world coordinates 78 | """ 79 | 80 | xcam = R.T.dot(x.T) + T # rotate and translate 81 | return xcam.T 82 | -------------------------------------------------------------------------------- /lib/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | import cv2 12 | 13 | import torch 14 | 15 | 16 | def flip_back(output_flipped, matched_parts): 17 | ''' 18 | ouput_flipped: numpy.ndarray(batch_size, num_joints, height, width) 19 | ''' 20 | assert output_flipped.ndim == 4,\ 21 | 'output_flipped should be [batch_size, num_joints, height, width]' 22 | 23 | output_flipped = output_flipped[:, :, :, ::-1] 24 | 25 | for pair in matched_parts: 26 | tmp = output_flipped[:, pair[0], :, :].copy() 27 | output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] 28 | output_flipped[:, pair[1], :, :] = tmp 29 | 30 | return output_flipped 31 | 32 | 33 | def fliplr_joints(joints, joints_vis, width, matched_parts): 34 | """ 35 | flip coords 36 | """ 37 | # Flip horizontal 38 | joints[:, 0] = width - joints[:, 0] - 1 39 | 40 | # Change left-right parts 41 | for pair in matched_parts: 42 | joints[pair[0], :], joints[pair[1], :] = \ 43 | joints[pair[1], :], joints[pair[0], :].copy() 44 | joints_vis[pair[0], :], joints_vis[pair[1], :] = \ 45 | joints_vis[pair[1], :], joints_vis[pair[0], :].copy() 46 | 47 | return joints * joints_vis, joints_vis 48 | 49 | 50 | def transform_preds(coords, center, scale, output_size): 51 | target_coords = np.zeros(coords.shape) 52 | trans = get_affine_transform(center, scale, 0, output_size, inv=1) 53 | for p in range(coords.shape[0]): 54 | target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) 55 | return target_coords 56 | 57 | 58 | def get_affine_transform(center, 59 | scale, 60 | rot, 61 | output_size, 62 | shift=np.array([0, 0], dtype=np.float32), 63 | inv=0): 64 | if isinstance(scale, torch.Tensor): 65 | scale = np.array(scale.cpu()) 66 | if isinstance(center, torch.Tensor): 67 | center = np.array(center.cpu()) 68 | if not isinstance(scale, np.ndarray) and not isinstance(scale, list): 69 | scale = np.array([scale, scale]) 70 | 71 | scale_tmp = scale * 200.0 72 | src_w, src_h = scale_tmp[0], scale_tmp[1] 73 | dst_w, dst_h = output_size[0], output_size[1] 74 | 75 | rot_rad = np.pi * rot / 180 76 | if src_w >= src_h: 77 | src_dir = get_dir([0, src_w * -0.5], rot_rad) 78 | dst_dir = np.array([0, dst_w * -0.5], np.float32) 79 | else: 80 | src_dir = get_dir([src_h * -0.5, 0], rot_rad) 81 | dst_dir = np.array([dst_h * -0.5, 0], np.float32) 82 | 83 | src = np.zeros((3, 2), dtype=np.float32) 84 | dst = np.zeros((3, 2), dtype=np.float32) 85 | src[0, :] = center + scale_tmp * shift # x,y 86 | src[1, :] = center + src_dir + scale_tmp * shift 87 | dst[0, :] = [dst_w * 0.5, dst_h * 0.5] 88 | dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir 89 | 90 | src[2:, :] = get_3rd_point(src[0, :], src[1, :]) 91 | dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) 92 | 93 | if inv: 94 | trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) 95 | else: 96 | trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) 97 | 98 | return trans 99 | 100 | 101 | def affine_transform(pt, t): 102 | new_pt = np.array([pt[0], pt[1], 1.]).T 103 | new_pt = np.dot(t, new_pt) 104 | return new_pt[:2] 105 | 106 | 107 | def affine_transform_pts(pts, t): 108 | xyz = np.add( 109 | np.array([[1, 0], [0, 1], [0, 0]]).dot(pts.T), np.array([[0], [0], 110 | [1]])) 111 | return np.dot(t, xyz).T 112 | 113 | 114 | def affine_transform_pts_cuda(pts, t): 115 | 
npts = pts.shape[0] 116 | pts_homo = torch.cat([pts, torch.ones(npts, 1, device=pts.device)], dim=1) 117 | out = torch.mm(t, torch.t(pts_homo)) 118 | return torch.t(out[:2, :]) 119 | 120 | 121 | def get_3rd_point(a, b): 122 | direct = a - b 123 | return np.array(b) + np.array([-direct[1], direct[0]], dtype=np.float32) 124 | 125 | 126 | def get_dir(src_point, rot_rad): 127 | sn, cs = np.sin(rot_rad), np.cos(rot_rad) 128 | 129 | src_result = [0, 0] 130 | src_result[0] = src_point[0] * cs - src_point[1] * sn 131 | src_result[1] = src_point[0] * sn + src_point[1] * cs 132 | 133 | return src_result 134 | 135 | 136 | def crop(img, center, scale, output_size, rot=0): 137 | trans = get_affine_transform(center, scale, rot, output_size) 138 | 139 | dst_img = cv2.warpAffine( 140 | img, 141 | trans, (int(output_size[0]), int(output_size[1])), 142 | flags=cv2.INTER_LINEAR) 143 | 144 | return dst_img 145 | 146 | def get_scale(image_size, resized_size): 147 | w, h = image_size 148 | w_resized, h_resized = resized_size 149 | if w / w_resized < h / h_resized: 150 | w_pad = h / h_resized * w_resized 151 | h_pad = h 152 | else: 153 | w_pad = w 154 | h_pad = w / w_resized * h_resized 155 | scale = np.array([w_pad / 200.0, h_pad / 200.0], dtype=np.float32) 156 | 157 | return scale 158 | 159 | 160 | def projectPoints(X, K, R, t, Kd): 161 | """ 162 | Projects points X (3xN) using camera intrinsics K (3x3), 163 | extrinsics (R,t) and distortion parameters Kd=[k1,k2,p1,p2,k3]. 164 | Roughly, x = K*(R*X + t) + distortion 165 | See http://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html 166 | or cv2.projectPoints 167 | """ 168 | 169 | x = np.dot(R, X) + t 170 | 171 | x[0:2, :] = x[0:2, :] / (x[2, :] + 1e-5) 172 | 173 | r = x[0, :] * x[0, :] + x[1, :] * x[1, :] 174 | 175 | x[0, :] = x[0, :] * (1 + Kd[0] * r + Kd[1] * r * r + Kd[4] * r * r * r 176 | ) + 2 * Kd[2] * x[0, :] * x[1, :] + Kd[3] * ( 177 | r + 2 * x[0, :] * x[0, :]) 178 | x[1, :] = x[1, :] * (1 + Kd[0] * r + Kd[1] * r * r + Kd[4] * r * r * r 179 | ) + 2 * Kd[3] * x[0, :] * x[1, :] + Kd[2] * ( 180 | r + 2 * x[1, :] * x[1, :]) 181 | 182 | x[0, :] = K[0, 0] * x[0, :] + K[0, 1] * x[1, :] + K[0, 2] 183 | x[1, :] = K[1, 0] * x[0, :] + K[1, 1] * x[1, :] + K[1, 2] 184 | 185 | return x 186 | 187 | 188 | def rotate_points(points, center, rot_rad): 189 | """ 190 | :param points: N*2 191 | :param center: 2 192 | :param rot_rad: scalar 193 | :return: N*2 194 | """ 195 | rot_rad = rot_rad * np.pi / 180.0 196 | rotate_mat = np.array([[np.cos(rot_rad), -np.sin(rot_rad)], 197 | [np.sin(rot_rad), np.cos(rot_rad)]]) 198 | center = center.reshape(2, 1) 199 | points = points.T 200 | points = rotate_mat.dot(points - center) + center 201 | 202 | return points.T 203 | 204 | 205 | def compute_similarity_transform(X, Y, compute_optimal_scale=False): 206 | """ 207 | A port of MATLAB's `procrustes` function to Numpy. 
208 | Adapted from http://stackoverflow.com/a/18927641/1884420 209 | 210 | Args 211 | X: array NxM of targets, with N number of points and M point dimensionality 212 | Y: array NxM of inputs 213 | compute_optimal_scale: whether we compute optimal scale or force it to be 1 214 | 215 | Returns: 216 | d: squared error after transformation 217 | Z: transformed Y 218 | T: computed rotation 219 | b: scaling 220 | c: translation 221 | """ 222 | muX = X.mean(0) 223 | muY = Y.mean(0) 224 | 225 | X0 = X - muX 226 | Y0 = Y - muY 227 | 228 | ssX = (X0 ** 2.).sum() 229 | ssY = (Y0 ** 2.).sum() 230 | 231 | # centred Frobenius norm 232 | normX = np.sqrt(ssX) 233 | normY = np.sqrt(ssY) 234 | 235 | # scale to equal (unit) norm 236 | X0 = X0 / normX 237 | Y0 = Y0 / normY 238 | 239 | # optimum rotation matrix of Y 240 | A = np.dot(X0.T, Y0) 241 | U, s, Vt = np.linalg.svd(A, full_matrices=False) 242 | V = Vt.T 243 | T = np.dot(V, U.T) 244 | 245 | # Make sure we have a rotation 246 | detT = np.linalg.det(T) 247 | V[:, -1] *= np.sign(detT) 248 | s[-1] *= np.sign(detT) 249 | T = np.dot(V, U.T) 250 | 251 | traceTA = s.sum() 252 | 253 | if compute_optimal_scale: # Compute optimum scaling of Y. 254 | b = traceTA * normX / normY 255 | d = 1 - traceTA ** 2 256 | Z = normX * traceTA * np.dot(Y0, T) + muX 257 | else: # If no scaling allowed 258 | b = 1 259 | d = 1 + ssY / ssX - 2 * traceTA * normY / normX 260 | Z = normY * np.dot(Y0, T) + muX 261 | 262 | c = muX - b * np.dot(muY, T) 263 | 264 | return d, Z, T, b, c 265 | 266 | 267 | def procrustes_transform(target_pose, from_pose): 268 | _, Z, rot, s, t = compute_similarity_transform(target_pose, from_pose, compute_optimal_scale=True) 269 | align_pose = s * from_pose.dot(rot) + t 270 | 271 | return align_pose 272 | -------------------------------------------------------------------------------- /lib/utils/utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os 11 | import logging 12 | import time 13 | from pathlib import Path 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.optim as optim 18 | 19 | from core.config import get_model_name 20 | 21 | 22 | def create_logger(cfg, cfg_name, phase='train'): 23 | this_dir = Path(os.path.dirname(__file__)) ## 24 | root_output_dir = (this_dir / '..' / '..' / cfg.OUTPUT_DIR).resolve() ## 25 | tensorboard_log_dir = (this_dir / '..' / '..' 
/ cfg.LOG_DIR).resolve() 26 | # set up logger 27 | if not root_output_dir.exists(): 28 | print('=> creating {}'.format(root_output_dir)) 29 | root_output_dir.mkdir() 30 | 31 | dataset = cfg.DATASET.TRAIN_DATASET 32 | model, _ = get_model_name(cfg) 33 | cfg_name = os.path.basename(cfg_name).split('.')[0] 34 | 35 | final_output_dir = root_output_dir / dataset / model / cfg_name 36 | 37 | print('=> creating {}'.format(final_output_dir)) 38 | final_output_dir.mkdir(parents=True, exist_ok=True) 39 | 40 | time_str = time.strftime('%Y-%m-%d-%H-%M') 41 | log_file = '{}_{}_{}.log'.format(cfg_name, time_str, phase) 42 | final_log_file = final_output_dir / log_file 43 | head = '%(asctime)-15s %(message)s' 44 | logging.basicConfig(filename=str(final_log_file), 45 | format=head) 46 | logger = logging.getLogger() 47 | logger.setLevel(logging.INFO) 48 | console = logging.StreamHandler() 49 | logging.getLogger('').addHandler(console) 50 | 51 | tensorboard_log_dir = tensorboard_log_dir / dataset / model / \ 52 | (cfg_name + time_str) 53 | print('=> creating {}'.format(tensorboard_log_dir)) 54 | tensorboard_log_dir.mkdir(parents=True, exist_ok=True) 55 | 56 | return logger, str(final_output_dir), str(tensorboard_log_dir) 57 | 58 | def get_optimizer(cfg, model): 59 | optimizer = None 60 | if cfg.TRAIN.OPTIMIZER == 'sgd': 61 | optimizer = optim.SGD( 62 | model.parameters(), 63 | lr=cfg.TRAIN.LR, 64 | momentum=cfg.TRAIN.MOMENTUM, 65 | weight_decay=cfg.TRAIN.WD, 66 | nesterov=cfg.TRAIN.NESTEROV 67 | ) 68 | elif cfg.TRAIN.OPTIMIZER == 'adam': 69 | optimizer = optim.Adam( 70 | model.parameters(), 71 | lr=cfg.TRAIN.LR 72 | ) 73 | 74 | return optimizer 75 | 76 | 77 | def load_model_state(model, output_dir, epoch): 78 | file = os.path.join(output_dir, 'checkpoint_3d_epoch'+str(epoch)+'.pth.tar') 79 | if os.path.isfile(file): 80 | model.module.load_state_dict(torch.load(file)) 81 | print('=> load models state {} (epoch {})' 82 | .format(file, epoch)) 83 | return model 84 | else: 85 | print('=> no checkpoint found at {}'.format(file)) 86 | return model 87 | 88 | 89 | def load_checkpoint(model, optimizer, output_dir, filename='checkpoint.pth.tar'): 90 | file = os.path.join(output_dir, filename) 91 | if os.path.isfile(file): 92 | checkpoint = torch.load(file) 93 | start_epoch = checkpoint['epoch'] 94 | precision = checkpoint['precision'] if 'precision' in checkpoint else 0 95 | model.module.load_state_dict(checkpoint['state_dict']) 96 | optimizer.load_state_dict(checkpoint['optimizer']) 97 | print('=> load checkpoint {} (epoch {})' 98 | .format(file, start_epoch)) 99 | 100 | return start_epoch, model, optimizer, precision 101 | 102 | else: 103 | print('=> no checkpoint found at {}'.format(file)) 104 | return 0, model, optimizer, 0 105 | 106 | 107 | def save_checkpoint(states, is_best, output_dir, 108 | filename='checkpoint.pth.tar'): 109 | torch.save(states, os.path.join(output_dir, filename)) 110 | if is_best and 'state_dict' in states: 111 | torch.save(states['state_dict'], 112 | os.path.join(output_dir, 'model_best.pth.tar')) 113 | 114 | 115 | def load_backbone_panoptic(model, pretrained_file): 116 | this_dir = os.path.dirname(__file__) 117 | pretrained_file = os.path.abspath(os.path.join(this_dir, '../..', pretrained_file)) 118 | pretrained_state_dict = torch.load(pretrained_file) 119 | model_state_dict = model.module.backbone.state_dict() 120 | 121 | prefix = "module." 
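    # Note: checkpoints saved from an nn.DataParallel-wrapped model store every
    # parameter under a "module." prefix (e.g. "module.conv1.weight" rather than
    # "conv1.weight"). The loop below strips that prefix, copies only tensors whose
    # shapes match the current backbone, and re-initializes the final heatmap layer
    # when the pretrained head predicts a different number of joints, keeping as
    # many of the pretrained filters as fit.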
122 | new_pretrained_state_dict = {} 123 | for k, v in pretrained_state_dict.items(): 124 | if k.replace(prefix, "") in model_state_dict and v.shape == model_state_dict[k.replace(prefix, "")].shape: 125 | new_pretrained_state_dict[k.replace(prefix, "")] = v 126 | elif k.replace(prefix, "") == "final_layer.weight": # TODO 127 | print("Reiniting final layer filters:", k) 128 | 129 | o = torch.zeros_like(model_state_dict[k.replace(prefix, "")][:, :, :, :]) 130 | nn.init.xavier_uniform_(o) 131 | n_filters = min(o.shape[0], v.shape[0]) 132 | o[:n_filters, :, :, :] = v[:n_filters, :, :, :] 133 | 134 | new_pretrained_state_dict[k.replace(prefix, "")] = o 135 | elif k.replace(prefix, "") == "final_layer.bias": 136 | print("Reiniting final layer biases:", k) 137 | o = torch.zeros_like(model_state_dict[k.replace(prefix, "")][:]) 138 | nn.init.zeros_(o) 139 | n_filters = min(o.shape[0], v.shape[0]) 140 | o[:n_filters] = v[:n_filters] 141 | 142 | new_pretrained_state_dict[k.replace(prefix, "")] = o 143 | logging.info("load backbone statedict from {}".format(pretrained_file)) 144 | model.module.backbone.load_state_dict(new_pretrained_state_dict) 145 | 146 | return model 147 | -------------------------------------------------------------------------------- /lib/utils/vis.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | import math 7 | import numpy as np 8 | import torchvision 9 | import cv2 10 | import os 11 | import matplotlib 12 | matplotlib.use('Agg') 13 | from matplotlib import pyplot as plt 14 | from mpl_toolkits.mplot3d import Axes3D 15 | 16 | 17 | def save_batch_image_with_joints_multi(batch_image, 18 | batch_joints, 19 | batch_joints_vis, 20 | num_person, 21 | file_name, 22 | nrow=8, 23 | padding=2): 24 | ''' 25 | batch_image: [batch_size, channel, height, width] 26 | batch_joints: [batch_size, num_person, num_joints, 3], 27 | batch_joints_vis: [batch_size, num_person, num_joints, 1], 28 | num_person: [batch_size] 29 | } 30 | ''' 31 | batch_image = batch_image.flip(1) 32 | grid = torchvision.utils.make_grid(batch_image, nrow, padding, True) 33 | ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).cpu().numpy() 34 | ndarr = ndarr.copy() 35 | 36 | nmaps = batch_image.size(0) 37 | xmaps = min(nrow, nmaps) 38 | ymaps = int(math.ceil(float(nmaps) / xmaps)) 39 | height = int(batch_image.size(2) + padding) 40 | width = int(batch_image.size(3) + padding) 41 | k = 0 42 | for y in range(ymaps): 43 | for x in range(xmaps): 44 | if k >= nmaps: 45 | break 46 | for n in range(num_person[k]): 47 | joints = batch_joints[k, n] 48 | joints_vis = batch_joints_vis[k, n] 49 | 50 | for joint, joint_vis in zip(joints, joints_vis): 51 | joint[0] = x * width + padding + joint[0] 52 | joint[1] = y * height + padding + joint[1] 53 | if joint_vis[0]: 54 | cv2.circle(ndarr, (int(joint[0]), int(joint[1])), 2, 55 | [0, 255, 255], 2) 56 | k = k + 1 57 | cv2.imwrite(file_name, ndarr) 58 | 59 | 60 | def save_batch_heatmaps_multi(batch_image, batch_heatmaps, file_name, normalize=True): 61 | ''' 62 | batch_image: [batch_size, channel, height, width] 63 | batch_heatmaps: ['batch_size, num_joints, height, width] 64 | file_name: saved file name 65 | ''' 66 | if normalize: 67 | batch_image = batch_image.clone() 68 | 
min = float(batch_image.min()) 69 | max = float(batch_image.max()) 70 | 71 | batch_image.add_(-min).div_(max - min + 1e-5) 72 | batch_image = batch_image.flip(1) 73 | 74 | batch_size = batch_heatmaps.size(0) 75 | num_joints = batch_heatmaps.size(1) 76 | heatmap_height = batch_heatmaps.size(2) 77 | heatmap_width = batch_heatmaps.size(3) 78 | 79 | grid_image = np.zeros( 80 | (batch_size * heatmap_height, (num_joints + 1) * heatmap_width, 3), 81 | dtype=np.uint8) 82 | 83 | for i in range(batch_size): 84 | image = batch_image[i].mul(255)\ 85 | .clamp(0, 255)\ 86 | .byte()\ 87 | .permute(1, 2, 0)\ 88 | .cpu().numpy() 89 | heatmaps = batch_heatmaps[i].mul(255)\ 90 | .clamp(0, 255)\ 91 | .byte()\ 92 | .cpu().numpy() 93 | 94 | resized_image = cv2.resize(image, 95 | (int(heatmap_width), int(heatmap_height))) 96 | 97 | height_begin = heatmap_height * i 98 | height_end = heatmap_height * (i + 1) 99 | for j in range(num_joints): 100 | heatmap = heatmaps[j, :, :] 101 | colored_heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET) 102 | masked_image = colored_heatmap * 0.7 + resized_image * 0.3 103 | 104 | width_begin = heatmap_width * (j + 1) 105 | width_end = heatmap_width * (j + 2) 106 | grid_image[height_begin:height_end, width_begin:width_end, :] = \ 107 | masked_image 108 | # grid_image[height_begin:height_end, width_begin:width_end, :] = \ 109 | # colored_heatmap*0.7 + resized_image*0.3 110 | 111 | grid_image[height_begin:height_end, 0:heatmap_width, :] = resized_image 112 | 113 | cv2.imwrite(file_name, grid_image) 114 | 115 | 116 | def save_debug_images_multi(config, input, meta, target, output, prefix): 117 | if not config.DEBUG.DEBUG: 118 | return 119 | 120 | basename = os.path.basename(prefix) 121 | dirname = os.path.dirname(prefix) 122 | dirname1 = os.path.join(dirname, 'image_with_joints') 123 | dirname2 = os.path.join(dirname, 'batch_heatmaps') 124 | 125 | for dir in [dirname1, dirname2]: 126 | if not os.path.exists(dir): 127 | os.makedirs(dir) 128 | 129 | prefix1 = os.path.join(dirname1, basename) 130 | prefix2 = os.path.join(dirname2, basename) 131 | 132 | if config.DEBUG.SAVE_BATCH_IMAGES_GT: 133 | save_batch_image_with_joints_multi(input, meta['joints'], meta['joints_vis'], meta['num_person'], '{}_gt.jpg'.format(prefix1)) 134 | if config.DEBUG.SAVE_HEATMAPS_GT: 135 | save_batch_heatmaps_multi(input, target, '{}_hm_gt.jpg'.format(prefix2)) 136 | if config.DEBUG.SAVE_HEATMAPS_PRED: 137 | save_batch_heatmaps_multi(input, output, '{}_hm_pred.jpg'.format(prefix2)) 138 | 139 | # panoptic 140 | LIMBS15 = [[0, 1], [0, 2], [0, 3], [3, 4], [4, 5], [0, 9], [9, 10], 141 | [10, 11], [2, 6], [2, 12], [6, 7], [7, 8], [12, 13], [13, 14]] 142 | 143 | # # h36m 144 | # LIMBS17 = [[0, 1], [1, 2], [2, 3], [0, 4], [4, 5], [5, 6], [0, 7], [7, 8], 145 | # [8, 9], [9, 10], [8, 14], [14, 15], [15, 16], [8, 11], [11, 12], [12, 13]] 146 | # coco17 147 | LIMBS17 = [[0, 1], [0, 2], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], [11, 13], [13, 15], 148 | [6, 12], [12, 14], [14, 16], [5, 6], [11, 12]] 149 | 150 | # shelf / campus 151 | LIMBS14 = [[0, 1], [1, 2], [3, 4], [4, 5], [2, 3], [6, 7], [7, 8], [9, 10], 152 | [10, 11], [2, 8], [3, 9], [8, 12], [9, 12], [12, 13]] 153 | 154 | 155 | def save_debug_3d_images(config, meta, preds, prefix): 156 | if not config.DEBUG.DEBUG: 157 | return 158 | 159 | basename = os.path.basename(prefix) 160 | dirname = os.path.dirname(prefix) 161 | dirname1 = os.path.join(dirname, '3d_joints') 162 | 163 | if not os.path.exists(dirname1): 164 | 
os.makedirs(dirname1) 165 | 166 | prefix = os.path.join(dirname1, basename) 167 | file_name = prefix + "_3d.png" 168 | 169 | # preds = preds.cpu().numpy() 170 | batch_size = meta['num_person'].shape[0] 171 | xplot = min(4, batch_size) 172 | yplot = int(math.ceil(float(batch_size) / xplot)) 173 | 174 | width = 4.0 * xplot 175 | height = 4.0 * yplot 176 | fig = plt.figure(0, figsize=(width, height)) 177 | plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, 178 | top=0.95, wspace=0.05, hspace=0.15) 179 | for i in range(batch_size): 180 | num_person = meta['num_person'][i] 181 | joints_3d = meta['joints_3d'][i] 182 | joints_3d_vis = meta['joints_3d_vis'][i] 183 | ax = plt.subplot(yplot, xplot, i + 1, projection='3d') 184 | for n in range(num_person): 185 | joint = joints_3d[n] 186 | joint_vis = joints_3d_vis[n] 187 | for k in eval("LIMBS{}".format(len(joint))): 188 | if joint_vis[k[0], 0] and joint_vis[k[1], 0]: 189 | x = [float(joint[k[0], 0]), float(joint[k[1], 0])] 190 | y = [float(joint[k[0], 1]), float(joint[k[1], 1])] 191 | z = [float(joint[k[0], 2]), float(joint[k[1], 2])] 192 | ax.plot(x, y, z, c='r', lw=1.5, marker='o', markerfacecolor='w', markersize=2, 193 | markeredgewidth=1) 194 | else: 195 | x = [float(joint[k[0], 0]), float(joint[k[1], 0])] 196 | y = [float(joint[k[0], 1]), float(joint[k[1], 1])] 197 | z = [float(joint[k[0], 2]), float(joint[k[1], 2])] 198 | ax.plot(x, y, z, c='r', ls='--', lw=1.5, marker='o', markerfacecolor='w', markersize=2, 199 | markeredgewidth=1) 200 | 201 | colors = ['b', 'g', 'c', 'y', 'm', 'orange', 'pink', 'royalblue', 'lightgreen', 'gold'] 202 | if preds is not None: 203 | pred = preds[i] 204 | for n in range(len(pred)): 205 | joint = pred[n] 206 | if joint[0, 3] >= 0: 207 | for k in eval("LIMBS{}".format(len(joint))): 208 | x = [float(joint[k[0], 0]), float(joint[k[1], 0])] 209 | y = [float(joint[k[0], 1]), float(joint[k[1], 1])] 210 | z = [float(joint[k[0], 2]), float(joint[k[1], 2])] 211 | ax.plot(x, y, z, c=colors[int(n % 10)], lw=1.5, marker='o', markerfacecolor='w', markersize=2, 212 | markeredgewidth=1) 213 | plt.savefig(file_name) 214 | plt.close(0) 215 | 216 | 217 | def save_debug_3d_cubes(config, meta, root, prefix): 218 | if not config.DEBUG.DEBUG: 219 | return 220 | 221 | basename = os.path.basename(prefix) 222 | dirname = os.path.dirname(prefix) 223 | dirname1 = os.path.join(dirname, 'root_cubes') 224 | 225 | if not os.path.exists(dirname1): 226 | os.makedirs(dirname1) 227 | 228 | prefix = os.path.join(dirname1, basename) 229 | file_name = prefix + "_root.png" 230 | 231 | batch_size = root.shape[0] 232 | root_id = config.DATASET.ROOTIDX 233 | 234 | xplot = min(4, batch_size) 235 | yplot = int(math.ceil(float(batch_size) / xplot)) 236 | 237 | width = 6.0 * xplot 238 | height = 4.0 * yplot 239 | fig = plt.figure(0, figsize=(width, height)) 240 | plt.subplots_adjust(left=0.05, right=0.95, bottom=0.05, 241 | top=0.95, wspace=0.05, hspace=0.15) 242 | for i in range(batch_size): 243 | roots_gt = meta['roots_3d'][i] 244 | num_person = meta['num_person'][i] 245 | roots_pred = root[i] 246 | ax = plt.subplot(yplot, xplot, i + 1, projection='3d') 247 | 248 | x = roots_gt[:num_person, 0].cpu() 249 | y = roots_gt[:num_person, 1].cpu() 250 | z = roots_gt[:num_person, 2].cpu() 251 | ax.scatter(x, y, z, c='r') 252 | 253 | index = roots_pred[:, 3] >= 0 254 | x = roots_pred[index, 0].cpu() 255 | y = roots_pred[index, 1].cpu() 256 | z = roots_pred[index, 2].cpu() 257 | ax.scatter(x, y, z, c='b') 258 | 259 | space_size = config.MULTI_PERSON.SPACE_SIZE 
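        # Descriptive note: the axis limits set below frame the full multi-person
        # voxel space, i.e. each axis spans space_center ± space_size / 2 (in mm).
        # For example, a hypothetical SPACE_SIZE of [8000, 8000, 2000] centred at
        # the origin would give x/y-limits of ±4000 mm and z-limits of ±1000 mm.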
260 |         space_center = config.MULTI_PERSON.SPACE_CENTER
261 |         ax.set_xlim(space_center[0] - space_size[0] / 2, space_center[0] + space_size[0] / 2)
262 |         ax.set_ylim(space_center[1] - space_size[1] / 2, space_center[1] + space_size[1] / 2)
263 |         ax.set_zlim(space_center[2] - space_size[2] / 2, space_center[2] + space_size[2] / 2)
264 | 
265 |     plt.savefig(file_name)
266 |     plt.close(0)
267 | 
--------------------------------------------------------------------------------
/lib/utils/zipreader.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License.
4 | # ------------------------------------------------------------------------------
5 | 
6 | import os
7 | import zipfile
8 | import xml.etree.ElementTree as ET
9 | 
10 | import cv2
11 | import numpy as np
12 | 
13 | _im_zfile = []
14 | _xml_path_zip = []
15 | _xml_zfile = []
16 | 
17 | 
18 | def imread(filename, flags=cv2.IMREAD_COLOR):
19 |     global _im_zfile
20 |     path = filename
21 |     pos_at = path.find('@')  # find() returns -1 when missing; index() would raise
22 |     if pos_at == -1:
23 |         print("character '@' is not found from the given path '%s'"%(path))
24 |         assert 0
25 |     path_zip = path[0: pos_at]
26 |     path_img = path[pos_at + 2:]
27 |     if not os.path.isfile(path_zip):
28 |         print("zip file '%s' is not found"%(path_zip))
29 |         assert 0
30 |     for i in range(len(_im_zfile)):
31 |         if _im_zfile[i]['path'] == path_zip:
32 |             data = _im_zfile[i]['zipfile'].read(path_img)
33 |             return cv2.imdecode(np.frombuffer(data, np.uint8), flags)
34 | 
35 |     _im_zfile.append({
36 |         'path': path_zip,
37 |         'zipfile': zipfile.ZipFile(path_zip, 'r')
38 |     })
39 |     data = _im_zfile[-1]['zipfile'].read(path_img)
40 | 
41 |     return cv2.imdecode(np.frombuffer(data, np.uint8), flags)
42 | 
43 | 
44 | def xmlread(filename):
45 |     global _xml_path_zip
46 |     global _xml_zfile
47 |     path = filename
48 |     pos_at = path.find('@')  # find() returns -1 when missing; index() would raise
49 |     if pos_at == -1:
50 |         print("character '@' is not found from the given path '%s'"%(path))
51 |         assert 0
52 |     path_zip = path[0: pos_at]
53 |     path_xml = path[pos_at + 2:]
54 |     if not os.path.isfile(path_zip):
55 |         print("zip file '%s' is not found"%(path_zip))
56 |         assert 0
57 |     for i in range(len(_xml_path_zip)):  # was xrange, which is Python 2 only
58 |         if _xml_path_zip[i] == path_zip:
59 |             data = _xml_zfile[i].open(path_xml)
60 |             return ET.fromstring(data.read())
61 |     _xml_path_zip.append(path_zip)
62 |     print("read new xml file '%s'"%(path_zip))
63 |     _xml_zfile.append(zipfile.ZipFile(path_zip, 'r'))
64 |     data = _xml_zfile[-1].open(path_xml)
65 |     return ET.fromstring(data.read())
66 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm==4.29.1
2 | json_tricks==3.13.2
3 | torch==1.4.0
4 | opencv_python==4.0.0.21
5 | prettytable==0.7.2
6 | scipy==1.4.1
7 | torchvision==0.5.0
8 | numpy==1.16.2
9 | matplotlib==2.0.2
10 | easydict==1.9
11 | PyYAML==5.4
12 | tensorboardX==2.1
13 | 
--------------------------------------------------------------------------------
/run/_init_paths.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License.
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os.path as osp 11 | import sys 12 | 13 | 14 | def add_path(path): 15 | if path not in sys.path: 16 | sys.path.insert(0, path) 17 | 18 | 19 | this_dir = osp.dirname(__file__) 20 | 21 | lib_path = osp.join(this_dir, '..', 'lib') 22 | add_path(lib_path) 23 | -------------------------------------------------------------------------------- /run/train_3d.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.backends.cudnn as cudnn 14 | import torch.utils.data 15 | import torch.utils.data.distributed 16 | import torchvision.transforms as transforms 17 | from tensorboardX import SummaryWriter 18 | import argparse 19 | import os 20 | import pprint 21 | import logging 22 | import json 23 | 24 | import _init_paths 25 | from core.config import config 26 | from core.config import update_config 27 | from core.function import train_3d, validate_3d 28 | from utils.utils import create_logger 29 | from utils.utils import save_checkpoint, load_checkpoint, load_model_state 30 | from utils.utils import load_backbone_panoptic 31 | import dataset 32 | import models 33 | 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser(description='Train keypoints network') 37 | parser.add_argument( 38 | '--cfg', help='experiment configure file name', required=True, type=str) 39 | 40 | args, rest = parser.parse_known_args() 41 | update_config(args.cfg) 42 | 43 | return args 44 | 45 | 46 | def get_optimizer(model): 47 | lr = config.TRAIN.LR 48 | if model.module.backbone is not None: 49 | for params in model.module.backbone.parameters(): 50 | params.requires_grad = False # If you want to train the whole model jointly, set it to be True. 51 | for params in model.module.root_net.parameters(): 52 | params.requires_grad = True 53 | for params in model.module.pose_net.parameters(): 54 | params.requires_grad = True 55 | optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.module.parameters()), lr=lr) 56 | # optimizer = optim.Adam(model.module.parameters(), lr=lr) 57 | 58 | return model, optimizer 59 | 60 | 61 | def main(): 62 | args = parse_args() 63 | logger, final_output_dir, tb_log_dir = create_logger( 64 | config, args.cfg, 'train') 65 | 66 | logger.info(pprint.pformat(args)) 67 | logger.info(pprint.pformat(config)) 68 | 69 | gpus = [int(i) for i in config.GPUS.split(',')] 70 | print('=> Loading data ..') 71 | normalize = transforms.Normalize( 72 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 73 | train_dataset = eval('dataset.' 
+ config.DATASET.TRAIN_DATASET)( 74 | config, config.DATASET.TRAIN_SUBSET, True, 75 | transforms.Compose([ 76 | transforms.ToTensor(), 77 | normalize, 78 | ])) 79 | 80 | train_loader = torch.utils.data.DataLoader( 81 | train_dataset, 82 | batch_size=config.TRAIN.BATCH_SIZE * len(gpus), 83 | shuffle=config.TRAIN.SHUFFLE, 84 | num_workers=config.WORKERS, 85 | pin_memory=True) 86 | 87 | test_dataset = eval('dataset.' + config.DATASET.TEST_DATASET)( 88 | config, config.DATASET.TEST_SUBSET, False, 89 | transforms.Compose([ 90 | transforms.ToTensor(), 91 | normalize, 92 | ])) 93 | 94 | test_loader = torch.utils.data.DataLoader( 95 | test_dataset, 96 | batch_size=config.TEST.BATCH_SIZE * len(gpus), 97 | shuffle=False, 98 | num_workers=config.WORKERS, 99 | pin_memory=True) 100 | 101 | cudnn.benchmark = config.CUDNN.BENCHMARK 102 | torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC 103 | torch.backends.cudnn.enabled = config.CUDNN.ENABLED 104 | 105 | print('=> Constructing models ..') 106 | model = eval('models.' + config.MODEL + '.get_multi_person_pose_net')( 107 | config, is_train=True) 108 | with torch.no_grad(): 109 | model = torch.nn.DataParallel(model, device_ids=gpus).cuda() 110 | 111 | model, optimizer = get_optimizer(model) 112 | 113 | start_epoch = config.TRAIN.BEGIN_EPOCH 114 | end_epoch = config.TRAIN.END_EPOCH 115 | 116 | best_precision = 0 117 | if config.NETWORK.PRETRAINED_BACKBONE: 118 | model = load_backbone_panoptic(model, config.NETWORK.PRETRAINED_BACKBONE) 119 | if config.TRAIN.RESUME: 120 | start_epoch, model, optimizer, best_precision = load_checkpoint(model, optimizer, final_output_dir) 121 | 122 | writer_dict = { 123 | 'writer': SummaryWriter(log_dir=tb_log_dir), 124 | 'train_global_steps': 0, 125 | 'valid_global_steps': 0, 126 | } 127 | 128 | print('=> Training...') 129 | for epoch in range(start_epoch, end_epoch): 130 | print('Epoch: {}'.format(epoch)) 131 | 132 | # lr_scheduler.step() 133 | train_3d(config, model, optimizer, train_loader, epoch, final_output_dir, writer_dict) 134 | precision = validate_3d(config, model, test_loader, final_output_dir) 135 | 136 | if precision > best_precision: 137 | best_precision = precision 138 | best_model = True 139 | else: 140 | best_model = False 141 | 142 | logger.info('=> saving checkpoint to {} (Best: {})'.format(final_output_dir, best_model)) 143 | save_checkpoint({ 144 | 'epoch': epoch + 1, 145 | 'state_dict': model.module.state_dict(), 146 | 'precision': best_precision, 147 | 'optimizer': optimizer.state_dict(), 148 | }, best_model, final_output_dir) 149 | 150 | final_model_state_file = os.path.join(final_output_dir, 151 | 'final_state.pth.tar') 152 | logger.info('saving final model state to {}'.format( 153 | final_model_state_file)) 154 | torch.save(model.module.state_dict(), final_model_state_file) 155 | 156 | writer_dict['writer'].close() 157 | 158 | 159 | if __name__ == '__main__': 160 | main() 161 | -------------------------------------------------------------------------------- /run/validate_3d.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.optim as optim 13 | import torch.backends.cudnn as cudnn 14 | import torch.utils.data 15 | import torch.utils.data.distributed 16 | import torchvision.transforms as transforms 17 | from tensorboardX import SummaryWriter 18 | import argparse 19 | import os 20 | import pprint 21 | import logging 22 | import json 23 | 24 | import _init_paths 25 | from core.config import config 26 | from core.config import update_config 27 | from core.function import train_3d, validate_3d 28 | from utils.utils import create_logger 29 | from utils.utils import save_checkpoint, load_checkpoint, load_model_state 30 | from utils.utils import load_backbone_panoptic 31 | import dataset 32 | import models 33 | 34 | 35 | def parse_args(): 36 | parser = argparse.ArgumentParser(description='Train keypoints network') 37 | parser.add_argument( 38 | '--cfg', help='experiment configure file name', required=True, type=str) 39 | 40 | args, rest = parser.parse_known_args() 41 | update_config(args.cfg) 42 | 43 | return args 44 | 45 | 46 | def main(): 47 | args = parse_args() 48 | logger, final_output_dir, tb_log_dir = create_logger( 49 | config, args.cfg, 'validate') 50 | 51 | logger.info(pprint.pformat(args)) 52 | logger.info(pprint.pformat(config)) 53 | 54 | gpus = [int(i) for i in config.GPUS.split(',')] 55 | print('=> Loading data ..') 56 | normalize = transforms.Normalize( 57 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 58 | 59 | test_dataset = eval('dataset.' + config.DATASET.TEST_DATASET)( 60 | config, config.DATASET.TEST_SUBSET, False, 61 | transforms.Compose([ 62 | transforms.ToTensor(), 63 | normalize, 64 | ])) 65 | 66 | test_loader = torch.utils.data.DataLoader( 67 | test_dataset, 68 | batch_size=config.TEST.BATCH_SIZE * len(gpus), 69 | shuffle=False, 70 | num_workers=config.WORKERS, 71 | pin_memory=True) 72 | 73 | cudnn.benchmark = config.CUDNN.BENCHMARK 74 | torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC 75 | torch.backends.cudnn.enabled = config.CUDNN.ENABLED 76 | 77 | print('=> Constructing models ..') 78 | model = eval('models.' + config.MODEL + '.get_multi_person_pose_net')( 79 | config, is_train=True) 80 | with torch.no_grad(): 81 | model = torch.nn.DataParallel(model, device_ids=gpus).cuda() 82 | 83 | test_model_file = os.path.join(final_output_dir, config.TEST.MODEL_FILE) 84 | if config.TEST.MODEL_FILE and os.path.isfile(test_model_file): 85 | logger.info('=> load models state {}'.format(test_model_file)) 86 | model.module.load_state_dict(torch.load(test_model_file)) 87 | else: 88 | raise ValueError('Check the model file for testing!') 89 | 90 | validate_3d(config, model, test_loader, final_output_dir) 91 | 92 | 93 | if __name__ == '__main__': 94 | main() 95 | -------------------------------------------------------------------------------- /test/_init_paths.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import os.path as osp 11 | import sys 12 | 13 | 14 | def add_path(path): 15 | if path not in sys.path: 16 | sys.path.insert(0, path) 17 | 18 | 19 | this_dir = osp.dirname(__file__) 20 | 21 | lib_path = osp.join(this_dir, '..', 'lib') 22 | add_path(lib_path) 23 | -------------------------------------------------------------------------------- /test/evaluate.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. 4 | # ------------------------------------------------------------------------------ 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | import torch 12 | import torch.backends.cudnn as cudnn 13 | import torch.utils.data 14 | import torch.utils.data.distributed 15 | import torchvision.transforms as transforms 16 | import argparse 17 | import os 18 | from tqdm import tqdm 19 | from prettytable import PrettyTable 20 | import copy 21 | 22 | import _init_paths 23 | from core.config import config 24 | from core.config import update_config 25 | from utils.utils import create_logger, load_backbone_panoptic 26 | import dataset 27 | import models 28 | 29 | 30 | def parse_args(): 31 | parser = argparse.ArgumentParser(description='Train keypoints network') 32 | parser.add_argument( 33 | '--cfg', help='experiment configure file name', required=True, type=str) 34 | 35 | args, rest = parser.parse_known_args() 36 | update_config(args.cfg) 37 | 38 | return args 39 | 40 | 41 | def main(): 42 | args = parse_args() 43 | logger, final_output_dir, tb_log_dir = create_logger( 44 | config, args.cfg, 'eval_map') 45 | cfg_name = os.path.basename(args.cfg).split('.')[0] 46 | 47 | gpus = [int(i) for i in config.GPUS.split(',')] 48 | print('=> Loading data ..') 49 | normalize = transforms.Normalize( 50 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 51 | 52 | test_dataset = eval('dataset.' + config.DATASET.TEST_DATASET)( 53 | config, config.DATASET.TEST_SUBSET, False, 54 | transforms.Compose([ 55 | transforms.ToTensor(), 56 | normalize, 57 | ])) 58 | 59 | test_loader = torch.utils.data.DataLoader( 60 | test_dataset, 61 | batch_size=config.TEST.BATCH_SIZE * len(gpus), 62 | shuffle=False, 63 | num_workers=config.WORKERS, 64 | pin_memory=True) 65 | 66 | cudnn.benchmark = config.CUDNN.BENCHMARK 67 | torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC 68 | torch.backends.cudnn.enabled = config.CUDNN.ENABLED 69 | 70 | print('=> Constructing models ..') 71 | model = eval('models.' 
+ config.MODEL + '.get_multi_person_pose_net')( 72 | config, is_train=True) 73 | with torch.no_grad(): 74 | model = torch.nn.DataParallel(model, device_ids=gpus).cuda() 75 | 76 | test_model_file = os.path.join(final_output_dir, config.TEST.MODEL_FILE) 77 | if config.TEST.MODEL_FILE and os.path.isfile(test_model_file): 78 | logger.info('=> load models state {}'.format(test_model_file)) 79 | model.module.load_state_dict(torch.load(test_model_file)) 80 | else: 81 | raise ValueError('Check the model file for testing!') 82 | 83 | model.eval() 84 | preds = [] 85 | with torch.no_grad(): 86 | for i, (inputs, targets_2d, weights_2d, targets_3d, meta, input_heatmap) in enumerate(tqdm(test_loader)): 87 | if 'panoptic' in config.DATASET.TEST_DATASET: 88 | pred, _, _, _, _, _ = model(views=inputs, meta=meta) 89 | elif 'campus' in config.DATASET.TEST_DATASET or 'shelf' in config.DATASET.TEST_DATASET: 90 | pred, _, _, _, _, _ = model(meta=meta, input_heatmaps=input_heatmap) 91 | 92 | pred = pred.detach().cpu().numpy() 93 | for b in range(pred.shape[0]): 94 | preds.append(pred[b]) 95 | 96 | tb = PrettyTable() 97 | if 'panoptic' in config.DATASET.TEST_DATASET: 98 | mpjpe_threshold = np.arange(25, 155, 25) 99 | aps, recs, mpjpe, _ = test_dataset.evaluate(preds) 100 | tb.field_names = ['Threshold/mm'] + [f'{i}' for i in mpjpe_threshold] 101 | tb.add_row(['AP'] + [f'{ap * 100:.2f}' for ap in aps]) 102 | tb.add_row(['Recall'] + [f'{re * 100:.2f}' for re in recs]) 103 | print(tb) 104 | print(f'MPJPE: {mpjpe:.2f}mm') 105 | else: 106 | actor_pcp, avg_pcp, bone_person_pcp, _ = test_dataset.evaluate(preds) 107 | tb.field_names = ['Bone Group'] + [f'Actor {i+1}' for i in range(len(actor_pcp))] + ['Average'] 108 | for k, v in bone_person_pcp.items(): 109 | tb.add_row([k] + [f'{i*100:.1f}' for i in v] + [f'{np.mean(v)*100:.1f}']) 110 | tb.add_row(['Total'] + [f'{i*100:.1f}' for i in actor_pcp] + [f'{avg_pcp*100:.1f}']) 111 | print(tb) 112 | 113 | 114 | if __name__ == "__main__": 115 | main() 116 | --------------------------------------------------------------------------------
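A minimal usage sketch (not a file in the repository) showing how the affine-transform
helpers in lib/utils/transforms.py map a point from the original image into heatmap space
and back. It assumes lib/ is on sys.path (run/_init_paths.py arranges this) and uses
illustrative image/heatmap sizes only.

import numpy as np

from utils.transforms import (affine_transform, get_affine_transform,
                              get_scale, transform_preds)

image_size = np.array([1920, 1080])   # original camera resolution (illustrative)
heatmap_size = np.array([240, 128])   # 2D backbone heatmap resolution (illustrative)

center = image_size / 2.0
# get_scale returns the padded source size in units of 200 px, padded so that the
# aspect ratio of heatmap_size is preserved by the warp.
scale = get_scale(image_size, heatmap_size)

# Forward warp: image coordinates -> heatmap coordinates.
trans = get_affine_transform(center, scale, 0, heatmap_size)
pt_image = np.array([960.0, 540.0])
pt_heatmap = affine_transform(pt_image, trans)

# transform_preds builds the inverse warp (inv=1) and maps heatmap-space
# predictions back to the original image; the round trip is (near-)exact.
pt_back = transform_preds(pt_heatmap[None, :], center, scale, heatmap_size)
assert np.allclose(pt_back[0], pt_image, atol=1e-3)

The factor of 200 inside get_affine_transform (scale_tmp = scale * 200.0) is the reason
get_scale divides the padded pixel size by 200 when constructing the scale parameter.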