├── .gitignore ├── LICENSE ├── README.md ├── Sim3DR ├── .gitignore ├── Sim3DR.py ├── __init__.py ├── _init_paths.py ├── build_sim3dr.sh ├── lib │ ├── rasterize.h │ ├── rasterize.pyx │ └── rasterize_kernel.cpp ├── lighting.py ├── readme.md ├── setup.py └── tests │ ├── .gitignore │ ├── CMakeLists.txt │ ├── io.cpp │ ├── io.h │ └── test.cpp ├── backbone_nets └── mobilenetv2_backbone.py ├── cal_size.sh ├── cal_size_ARE.py ├── cal_size_kpts.py ├── config.py ├── data ├── preprocessed_MFCC │ ├── rand_id00001 │ │ └── 1TmvLk8sB-g │ │ │ ├── 00001.npy │ │ │ ├── 00002.npy │ │ │ └── 00003.npy │ ├── rand_id00002 │ │ └── 0XmNeUnOnlg │ │ │ ├── 00001.npy │ │ │ ├── 00002.npy │ │ │ └── 00003.npy │ ├── rand_id00003 │ │ └── 1M4q6CQM5pA │ │ │ ├── 00001.npy │ │ │ ├── 00002.npy │ │ │ └── 00003.npy │ ├── rand_id00004 │ │ └── _2wZVvsQYFg │ │ │ ├── 00001.npy │ │ │ ├── 00002.npy │ │ │ └── 00003.npy │ └── rand_id00005 │ │ └── 0nH78dDh0N0 │ │ ├── 00001.npy │ │ ├── 00002.npy │ │ └── 00003.npy ├── results │ ├── rand_id00001 │ │ ├── 1TmvLk8sB-g_00001_image.png │ │ ├── 1TmvLk8sB-g_00001_overlap.png │ │ ├── 1TmvLk8sB-g_00002_image.png │ │ ├── 1TmvLk8sB-g_00002_overlap.png │ │ ├── 1TmvLk8sB-g_00003_image.png │ │ └── 1TmvLk8sB-g_00003_overlap.png │ ├── rand_id00002 │ │ ├── 0XmNeUnOnlg_00001_image.png │ │ ├── 0XmNeUnOnlg_00001_overlap.png │ │ ├── 0XmNeUnOnlg_00002_image.png │ │ ├── 0XmNeUnOnlg_00002_overlap.png │ │ ├── 0XmNeUnOnlg_00003_image.png │ │ └── 0XmNeUnOnlg_00003_overlap.png │ ├── rand_id00003 │ │ ├── 1M4q6CQM5pA_00001_image.png │ │ ├── 1M4q6CQM5pA_00001_overlap.png │ │ ├── 1M4q6CQM5pA_00002_image.png │ │ ├── 1M4q6CQM5pA_00002_overlap.png │ │ ├── 1M4q6CQM5pA_00003_image.png │ │ └── 1M4q6CQM5pA_00003_overlap.png │ ├── rand_id00004 │ │ ├── _2wZVvsQYFg_00001_image.png │ │ ├── _2wZVvsQYFg_00001_overlap.png │ │ ├── _2wZVvsQYFg_00002_image.png │ │ ├── _2wZVvsQYFg_00002_overlap.png │ │ ├── _2wZVvsQYFg_00003_image.png │ │ └── _2wZVvsQYFg_00003_overlap.png │ └── rand_id00005 │ │ ├── 0nH78dDh0N0_00001_image.png │ │ ├── 0nH78dDh0N0_00001_overlap.png │ │ ├── 0nH78dDh0N0_00002_image.png │ │ ├── 0nH78dDh0N0_00002_overlap.png │ │ ├── 0nH78dDh0N0_00003_image.png │ │ └── 0nH78dDh0N0_00003_overlap.png ├── results_reference │ ├── Asa_Butterfield │ │ ├── 1TmvLk8sB-g_00001_img.png │ │ ├── 1TmvLk8sB-g_00001_overlap.png │ │ ├── 1TmvLk8sB-g_00002_img.png │ │ ├── 1TmvLk8sB-g_00002_overlap.png │ │ ├── 1TmvLk8sB-g_00003_img.png │ │ └── 1TmvLk8sB-g_00003_overlap.png │ ├── Ashley_Greene │ │ ├── 0XmNeUnOnlg_00001_img.png │ │ ├── 0XmNeUnOnlg_00001_overlap.png │ │ ├── 0XmNeUnOnlg_00002_img.png │ │ ├── 0XmNeUnOnlg_00002_overlap.png │ │ ├── 0XmNeUnOnlg_00003_img.png │ │ └── 0XmNeUnOnlg_00003_overlap.png │ ├── Bellamy_Young │ │ ├── 1M4q6CQM5pA_00001_img.png │ │ ├── 1M4q6CQM5pA_00001_overlap.png │ │ ├── 1M4q6CQM5pA_00002_img.png │ │ ├── 1M4q6CQM5pA_00002_overlap.png │ │ ├── 1M4q6CQM5pA_00003_img.png │ │ └── 1M4q6CQM5pA_00003_overlap.png │ ├── Bethany_Mota │ │ ├── _2wZVvsQYFg_00001_img.png │ │ ├── _2wZVvsQYFg_00001_overlap.png │ │ ├── _2wZVvsQYFg_00002_img.png │ │ ├── _2wZVvsQYFg_00002_overlap.png │ │ ├── _2wZVvsQYFg_00003_img.png │ │ └── _2wZVvsQYFg_00003_overlap.png │ └── Eva_Longoria │ │ ├── 0nH78dDh0N0_00001_img.png │ │ ├── 0nH78dDh0N0_00001_overlap.png │ │ ├── 0nH78dDh0N0_00002_img.png │ │ ├── 0nH78dDh0N0_00002_overlap.png │ │ ├── 0nH78dDh0N0_00003_img.png │ │ └── 0nH78dDh0N0_00003_overlap.png └── vox1_meta.csv ├── dataset.py ├── demo.py ├── demo ├── coherence.png ├── overall_purpose.png └── supervised_comp.png ├── demo_mic.py ├── distiller_zoo.py 
├── environment.yml ├── eval_sup.py ├── face_types └── .placeholder ├── gan_train_cascade.py ├── mfcc.py ├── network.py ├── parse_dataset.py ├── pyaudio_recording.py ├── utilf ├── __init__.py └── render.py ├── utils.py └── vad.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | #lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | pretrained_models/ 132 | train.configs/ 133 | data/results/ 134 | *.obj -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Cho Ying Wu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #
Cross-Modal Perceptionist
Code Repository for CVPR 2022 "Cross-Modal Perceptionist: Can Face Geometry be Gleaned from Voices?"

Cho-Ying Wu, Chin-Cheng Hsu, Ulrich Neumann, University of Southern California

[Paper] [Project page] [Voxceleb-3D Data]

Check the project page for the introduction of this cool work!

Update: 2022/12/01 Added evaluation code, the pretrained model, and the execution script for the supervised framework. Organized the data structure of Voxceleb-3D.

Voxceleb-3D:

(1) [Here] contains data with names starting from F-Z as the training set. 100G zipped file, ~250G after unzipping. This set contains pointclouds (.xyz), reconstructed meshes overlapped on images from VGGFace (_b.jpg), and 199-dim 3DMM parameters using the BFM Face 2009 basis. This is in contrast to the simplified 3DMM basis with the first 40 dims for shape + the next 10 dims for expression. (SynergyNet follows 3DDFA-v2 and uses the first 40 of the 199 dims as the shape code, so the first 40 of the 199 dims correspond to the 40-dim shape code in SynergyNet.) You can download the full basis from the official BFM-2009 website. There are multiple 3D faces for each identity.

(2) [Here] contains data with names starting from A-E as the validation set. 300M. The format is the same, except there is only one 3D face per identity as groundtruth.

(3) [Here] contains the images from VGGFace that we used to reconstruct the 3D faces for (1) and (2).

(4) [Here] contains preprocessed voice data (MFCC features) from Voxceleb for all the identities. 38G zipped file. Refer to this [meta file] to map IDs to names.

(5) [Here] contains preprocessed voice data (MFCC features) from Voxceleb for the testing subset (A-E). You can download it for inference purposes. See the later section.
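To get oriented with the released files, the sketch below loads one preprocessed MFCC clip (the path follows the `data/preprocessed_MFCC` layout in this repo) and takes the SynergyNet-compatible slice of a 199-dim shape parameter vector; the parameter file name is a hypothetical placeholder, not the archives' exact layout.

```python
import numpy as np

# Path follows the preprocessed_MFCC layout shipped with this repo.
mfcc = np.load('data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00001.npy')

# Hypothetical file name: a 199-dim BFM-2009 shape parameter vector for one identity.
shape_params = np.load('identity_shape_params.npy')

# The first 40 of the 199 dims align with SynergyNet's simplified shape code.
shape_code_40 = shape_params[:40]

print(mfcc.shape, shape_code_40.shape)
```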


We study cross-modal learning and analyze the correlation between voices and 3D face geometry. Unlike previous methods that study the voice-face correlation only in the 2D domain, we choose a 3D representation, which can better validate the supportive physiological evidence that voices correlate with skeletal and articulator structures, which in turn potentially affect facial geometry.

Comparison of recovered 3D face meshes with the baseline.


Consistency for the same identity using different utterances.


## Demo: Preprocessed fbank
We test on Ubuntu 16.04 LTS with an NVIDIA 2080 Ti (only GPU execution is supported) and use Anaconda to install packages.

Install packages

1. `conda create --name CMP python=3.8`
2. Install a PyTorch build compatible with your machine; we test on PyTorch v1.9 (it should be compatible with other 1.0+ versions)
3. Install the other dependencies: opencv-python, scipy, PIL, Cython, pyaudio

Or use the environment.yml we provide instead:
- `conda env create -f environment.yml`
- `conda activate CMP`

4. Build the rendering toolkit (written in C++ and Cython) for overlapping 3D meshes on images (a minimal usage sketch of the resulting renderer is shown at the end of this section)

```
cd Sim3DR
bash build_sim3dr.sh
cd ..
```

Download pretrained models and 3DMM configuration data

5. Download from [here] (~160M) and unzip under the root folder. This will create 'pretrained_models' (trained by unsupervised CMP) and 'train.configs' (3DMM config data) under the root folder.

Read the preprocessed fbank for inference

6. `python demo.py` (This will fetch the preprocessed MFCC features and use them as network inputs)
7. Results will be generated under `data/results/` (pre-generated references are under `data/results_reference`)

More preprocessed MFCC and 3D mesh (3DMM params) pairs can be downloaded: [Voxceleb-3D Data] (about 100G).
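The rendering toolkit built in step 4 exposes a `RenderPipeline` class (see `Sim3DR/lighting.py`) for overlapping a mesh on an image. A minimal usage sketch with hypothetical input file names:

```python
import cv2
import numpy as np
from Sim3DR import RenderPipeline

# Hypothetical inputs: an HxWx3 uint8 background image and a mesh in image coordinates.
bg = cv2.imread('face.jpg')
vertices = np.load('vertices.npy').astype(np.float32)   # (N, 3) vertex positions
triangles = np.load('triangles.npy').astype(np.int32)   # (M, 3) vertex indices

renderer = RenderPipeline()  # default ambient/directional/specular settings
overlap = renderer(vertices, triangles, bg.copy())
cv2.imwrite('overlap.png', overlap)
```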
## Demo: :laughing: Try it! Use device mic input
1. Do steps 1-5 above. In addition, download the face type meshes and extract them under ./face_types

2. `python demo_mic.py` The demo will record 5 seconds of audio from your device and predict the face mesh.

We perform unsupervised gender classification based on the mean male and female shapes and compute statistics between the predicted face and each mean shape. We also compute the distance to the four face types (Regular, Slim, Skinny, Wide) and indicate which type the voice is closest to (see the sketch at the end of this section).

3. Results will be generated under `data/results`
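One plausible way to implement such a nearest-mean comparison is sketched below; the actual statistics computed by `demo_mic.py` may differ, and the face-type file names here are hypothetical (the released face_types archive defines the real ones).

```python
import numpy as np

def mean_vertex_distance(pred, ref):
    # Average per-vertex Euclidean distance between two aligned meshes of shape (N, 3).
    return np.linalg.norm(pred - ref, axis=1).mean()

# Hypothetical file names for the four reference face types.
face_types = {name: np.load(f'face_types/{name}.npy')
              for name in ('regular', 'slim', 'skinny', 'wide')}
pred = np.load('predicted_vertices.npy')

closest = min(face_types, key=lambda k: mean_vertex_distance(pred, face_types[k]))
print(f'The voice is closest to the {closest} face type.')
```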
## Inference from supervised framework
1. Do steps 1-5 in the Demo section. Download the pretrained supervised model [here]. Download the voice data (A-E) for inference [here], the [meta file], and the [groundtruth]. Put the pretrained model under './pretrained_models/supervised_64'. Put the voice data and meta file under './data'. Put the groundtruth under './data' and extract it.

2. Edit config.py Line 6: change it to 'pretrained_models/supervised_64'

3. Run:
```
python eval_sup.py
```

This will match identities between voice IDs and the available 3D faces reconstructed from VGGFace via the meta file, predict 3D faces only for the matched IDs, and then save all the mesh .obj files under './data/supervised_output/'.
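For reference, a minimal Wavefront .obj writer of the kind needed to work with those outputs is sketched below; this is generic .obj handling, not the exact export code used by eval_sup.py.

```python
def write_obj(path, vertices, triangles):
    # vertices: iterable of (x, y, z); triangles: iterable of 0-based vertex index triplets.
    # Wavefront .obj faces use 1-based indices, hence the +1 below.
    with open(path, 'w') as f:
        for x, y, z in vertices:
            f.write(f'v {x} {y} {z}\n')
        for a, b, c in triangles:
            f.write(f'f {a + 1} {b + 1} {c + 1}\n')
```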
## Evaluation
1. Do steps 1-5 in the Demo section. Download the generated and saved meshes for the validation set (names starting with A-E in Voxceleb-3D). From supervised CMP: https://drive.google.com/file/d/1_xobyRM-abjfrvzjbF7uwMVPFPfeKZC9/view?usp=share_link;

(The same as the groundtruth in the supervised inference) Voxceleb-3D validation set: https://drive.google.com/file/d/1NdkqlCPhl-mvPU9TYlPgHE_FaNJjAysf/view?usp=share_link. Put them under './data' and extract.

The validation set for each identity contains an image (.jpg), a mesh (.obj), a pointcloud (.xyz), the image overlapped with the mesh (_b.jpg), and 3DMM parameters (.npy) (199-dim for shape and 29-dim for expression; this is in contrast to the simplified 3DMM basis with 40-dim shape and 10-dim expression. You can download the full basis from the official BFM-2009 website. Otherwise, we already provide the reconstructed meshes as .obj files.)

2. Run:
```
bash cal_size.sh
```

This will run the evaluation and report the ARE and keypoint error metrics.
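For reference, a generic per-keypoint Euclidean error (not necessarily the exact normalization used by cal_size_kpts.py) can be computed as:

```python
import numpy as np

def keypoint_error(pred_kpts, gt_kpts):
    # Mean Euclidean distance over corresponding 3D keypoints, both of shape (K, 3).
    return np.linalg.norm(pred_kpts - gt_kpts, axis=1).mean()
```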
## Training
1. Train the unsupervised framework

-- Download 'Voxceleb-3D' data (2), (3), and (4). They are the validation set, the training images, and the training voice banks. Extract and put them under './data'

-- Download a much smaller set [here] for fast online validation

-- `python gan_train_cascade.py`
## Citation
127 | If you find our work useful, please consider to cite us. 128 | 129 | @inproceedings{wu2022cross, 130 | title={Cross-Modal Perceptionist: Can Face Geometry be Gleaned from Voices?}, 131 | author={Wu, Cho-Ying and Hsu, Chin-Cheng and Neumann, Ulrich}, 132 | booktitle={CVPR}, 133 | year={2022} 134 | } 135 | 136 | 137 | This project is developed on [SynergyNet], [3DDFA-V2] and [reconstruction-faces-from-voice] 138 | -------------------------------------------------------------------------------- /Sim3DR/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | cmake-build-debug/ 3 | .idea/ 4 | build/ 5 | *.so 6 | data/ 7 | 8 | lib/rasterize.cpp -------------------------------------------------------------------------------- /Sim3DR/Sim3DR.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from . import _init_paths 4 | import numpy as np 5 | import Sim3DR_Cython 6 | 7 | 8 | def get_normal(vertices, triangles): 9 | normal = np.zeros_like(vertices, dtype=np.float32) 10 | Sim3DR_Cython.get_normal(normal, vertices, triangles, vertices.shape[0], triangles.shape[0]) 11 | return normal 12 | 13 | 14 | def rasterize(vertices, triangles, colors, bg=None, 15 | height=None, width=None, channel=None, 16 | reverse=False): 17 | if bg is not None: 18 | height, width, channel = bg.shape 19 | else: 20 | assert height is not None and width is not None and channel is not None 21 | bg = np.zeros((height, width, channel), dtype=np.float32) 22 | 23 | buffer = np.zeros((height, width), dtype=np.float32) - 1e8 24 | 25 | if colors.dtype != np.float32: 26 | colors = colors.astype(np.float32) 27 | Sim3DR_Cython.rasterize(bg, vertices, triangles, colors, buffer, triangles.shape[0], height, width, channel, 28 | reverse=reverse) 29 | return bg 30 | -------------------------------------------------------------------------------- /Sim3DR/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from .Sim3DR import get_normal, rasterize 4 | from .lighting import RenderPipeline 5 | -------------------------------------------------------------------------------- /Sim3DR/_init_paths.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os.path as osp 4 | import sys 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | this_dir = osp.dirname(__file__) 13 | lib_path = osp.join(this_dir, '.') 14 | add_path(lib_path) 15 | -------------------------------------------------------------------------------- /Sim3DR/build_sim3dr.sh: -------------------------------------------------------------------------------- 1 | python3 setup.py build_ext --inplace -------------------------------------------------------------------------------- /Sim3DR/lib/rasterize.h: -------------------------------------------------------------------------------- 1 | #ifndef MESH_CORE_HPP_ 2 | #define MESH_CORE_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | class Point3D { 14 | public: 15 | float x; 16 | float y; 17 | float z; 18 | 19 | public: 20 | Point3D() : x(0.f), y(0.f), z(0.f) {} 21 | Point3D(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {} 22 | 23 | void initialize(float x_, float y_, float z_){ 24 | this->x = x_; this->y = y_; this->z = z_; 25 | } 26 | 27 | Point3D cross(Point3D 
&p){ 28 | Point3D c; 29 | c.x = this->y * p.z - this->z * p.y; 30 | c.y = this->z * p.x - this->x * p.z; 31 | c.z = this->x * p.y - this->y * p.x; 32 | return c; 33 | } 34 | 35 | float dot(Point3D &p) { 36 | return this->x * p.x + this->y * p.y + this->z * p.z; 37 | } 38 | 39 | Point3D operator-(const Point3D &p) { 40 | Point3D np; 41 | np.x = this->x - p.x; 42 | np.y = this->y - p.y; 43 | np.z = this->z - p.z; 44 | return np; 45 | } 46 | 47 | }; 48 | 49 | class Point { 50 | public: 51 | float x; 52 | float y; 53 | 54 | public: 55 | Point() : x(0.f), y(0.f) {} 56 | Point(float x_, float y_) : x(x_), y(y_) {} 57 | float dot(Point p) { 58 | return this->x * p.x + this->y * p.y; 59 | } 60 | 61 | Point operator-(const Point &p) { 62 | Point np; 63 | np.x = this->x - p.x; 64 | np.y = this->y - p.y; 65 | return np; 66 | } 67 | 68 | Point operator+(const Point &p) { 69 | Point np; 70 | np.x = this->x + p.x; 71 | np.y = this->y + p.y; 72 | return np; 73 | } 74 | 75 | Point operator*(float s) { 76 | Point np; 77 | np.x = s * this->x; 78 | np.y = s * this->y; 79 | return np; 80 | } 81 | }; 82 | 83 | 84 | bool is_point_in_tri(Point p, Point p0, Point p1, Point p2); 85 | 86 | void get_point_weight(float *weight, Point p, Point p0, Point p1, Point p2); 87 | 88 | void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int ntri, bool norm_flg); 89 | 90 | void _get_ver_normal(float *ver_normal, float *tri_normal, int *triangles, int nver, int ntri); 91 | 92 | void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri); 93 | 94 | void _rasterize_triangles( 95 | float *vertices, int *triangles, float *depth_buffer, int *triangle_buffer, float *barycentric_weight, 96 | int ntri, int h, int w); 97 | 98 | void _rasterize( 99 | unsigned char *image, float *vertices, int *triangles, float *colors, 100 | float *depth_buffer, int ntri, int h, int w, int c, float alpha, bool reverse); 101 | 102 | void _render_texture_core( 103 | float *image, float *vertices, int *triangles, 104 | float *texture, float *tex_coords, int *tex_triangles, 105 | float *depth_buffer, 106 | int nver, int tex_nver, int ntri, 107 | int h, int w, int c, 108 | int tex_h, int tex_w, int tex_c, 109 | int mapping_type); 110 | 111 | void _write_obj_with_colors_texture(string filename, string mtl_name, 112 | float *vertices, int *triangles, float *colors, float *uv_coords, 113 | int nver, int ntri, int ntexver); 114 | 115 | #endif 116 | -------------------------------------------------------------------------------- /Sim3DR/lib/rasterize.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | # from libcpp.string cimport string 4 | cimport cython 5 | from libcpp cimport bool 6 | 7 | # from cpython import bool 8 | 9 | # use the Numpy-C-API from Cython 10 | np.import_array() 11 | 12 | # cdefine the signature of our c function 13 | cdef extern from "rasterize.h": 14 | void _rasterize_triangles( 15 | float*vertices, int*triangles, float*depth_buffer, int*triangle_buffer, float*barycentric_weight, 16 | int ntri, int h, int w 17 | ) 18 | 19 | void _rasterize( 20 | unsigned char*image, float*vertices, int*triangles, float*colors, float*depth_buffer, 21 | int ntri, int h, int w, int c, float alpha, bool reverse 22 | ) 23 | 24 | # void _render_texture_core( 25 | # float* image, float* vertices, int* triangles, 26 | # float* texture, float* tex_coords, int* tex_triangles, 27 | # float* depth_buffer, 28 | # int nver, int tex_nver, 
int ntri, 29 | # int h, int w, int c, 30 | # int tex_h, int tex_w, int tex_c, 31 | # int mapping_type) 32 | 33 | void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int nver, bool norm_flg) 34 | void _get_ver_normal(float *ver_normal, float*tri_normal, int*triangles, int nver, int ntri) 35 | void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri) 36 | 37 | 38 | # void _write_obj_with_colors_texture(string filename, string mtl_name, 39 | # float* vertices, int* triangles, float* colors, float* uv_coords, 40 | # int nver, int ntri, int ntexver) 41 | 42 | @cython.boundscheck(False) 43 | @cython.wraparound(False) 44 | def get_tri_normal(np.ndarray[float, ndim=2, mode="c"] tri_normal not None, 45 | np.ndarray[float, ndim=2, mode = "c"] vertices not None, 46 | np.ndarray[int, ndim=2, mode="c"] triangles not None, 47 | int ntri, bool norm_flg = False): 48 | _get_tri_normal( np.PyArray_DATA(tri_normal), np.PyArray_DATA(vertices), 49 | np.PyArray_DATA(triangles), ntri, norm_flg) 50 | 51 | @cython.boundscheck(False) # turn off bounds-checking for entire function 52 | @cython.wraparound(False) # turn off negative index wrapping for entire function 53 | def get_ver_normal(np.ndarray[float, ndim=2, mode = "c"] ver_normal not None, 54 | np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, 55 | np.ndarray[int, ndim=2, mode="c"] triangles not None, 56 | int nver, int ntri): 57 | _get_ver_normal( 58 | np.PyArray_DATA(ver_normal), np.PyArray_DATA(tri_normal), np.PyArray_DATA(triangles), 59 | nver, ntri) 60 | 61 | @cython.boundscheck(False) # turn off bounds-checking for entire function 62 | @cython.wraparound(False) # turn off negative index wrapping for entire function 63 | def get_normal(np.ndarray[float, ndim=2, mode = "c"] ver_normal not None, 64 | np.ndarray[float, ndim=2, mode = "c"] vertices not None, 65 | np.ndarray[int, ndim=2, mode="c"] triangles not None, 66 | int nver, int ntri): 67 | _get_normal( 68 | np.PyArray_DATA(ver_normal), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), 69 | nver, ntri) 70 | 71 | 72 | @cython.boundscheck(False) # turn off bounds-checking for entire function 73 | @cython.wraparound(False) # turn off negative index wrapping for entire function 74 | def rasterize_triangles( 75 | np.ndarray[float, ndim=2, mode = "c"] vertices not None, 76 | np.ndarray[int, ndim=2, mode="c"] triangles not None, 77 | np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None, 78 | np.ndarray[int, ndim=2, mode = "c"] triangle_buffer not None, 79 | np.ndarray[float, ndim=2, mode = "c"] barycentric_weight not None, 80 | int ntri, int h, int w 81 | ): 82 | _rasterize_triangles( 83 | np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), 84 | np.PyArray_DATA(depth_buffer), np.PyArray_DATA(triangle_buffer), 85 | np.PyArray_DATA(barycentric_weight), 86 | ntri, h, w) 87 | 88 | @cython.boundscheck(False) # turn off bounds-checking for entire function 89 | @cython.wraparound(False) # turn off negative index wrapping for entire function 90 | def rasterize(np.ndarray[unsigned char, ndim=3, mode = "c"] image not None, 91 | np.ndarray[float, ndim=2, mode = "c"] vertices not None, 92 | np.ndarray[int, ndim=2, mode="c"] triangles not None, 93 | np.ndarray[float, ndim=2, mode = "c"] colors not None, 94 | np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None, 95 | int ntri, int h, int w, int c, float alpha = 1, bool reverse = False 96 | ): 97 | _rasterize( 98 | np.PyArray_DATA(image), np.PyArray_DATA(vertices), 99 | 
np.PyArray_DATA(triangles), 100 | np.PyArray_DATA(colors), 101 | np.PyArray_DATA(depth_buffer), 102 | ntri, h, w, c, alpha, reverse) 103 | 104 | # def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, 105 | # np.ndarray[float, ndim=2, mode = "c"] vertices not None, 106 | # np.ndarray[int, ndim=2, mode="c"] triangles not None, 107 | # np.ndarray[float, ndim=3, mode = "c"] texture not None, 108 | # np.ndarray[float, ndim=2, mode = "c"] tex_coords not None, 109 | # np.ndarray[int, ndim=2, mode="c"] tex_triangles not None, 110 | # np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None, 111 | # int nver, int tex_nver, int ntri, 112 | # int h, int w, int c, 113 | # int tex_h, int tex_w, int tex_c, 114 | # int mapping_type 115 | # ): 116 | # _render_texture_core( 117 | # np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), 118 | # np.PyArray_DATA(texture), np.PyArray_DATA(tex_coords), np.PyArray_DATA(tex_triangles), 119 | # np.PyArray_DATA(depth_buffer), 120 | # nver, tex_nver, ntri, 121 | # h, w, c, 122 | # tex_h, tex_w, tex_c, 123 | # mapping_type) 124 | # 125 | # def write_obj_with_colors_texture_core(string filename, string mtl_name, 126 | # np.ndarray[float, ndim=2, mode = "c"] vertices not None, 127 | # np.ndarray[int, ndim=2, mode="c"] triangles not None, 128 | # np.ndarray[float, ndim=2, mode = "c"] colors not None, 129 | # np.ndarray[float, ndim=2, mode = "c"] uv_coords not None, 130 | # int nver, int ntri, int ntexver 131 | # ): 132 | # _write_obj_with_colors_texture(filename, mtl_name, 133 | # np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), np.PyArray_DATA(colors), np.PyArray_DATA(uv_coords), 134 | # nver, ntri, ntexver) 135 | -------------------------------------------------------------------------------- /Sim3DR/lib/rasterize_kernel.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* 4 | Author: Yao Feng 5 | Modified by Jianzhu Guo 6 | functions that can not be optimazed by vertorization in python. 7 | 1. rasterization.(need process each triangle) 8 | 2. normal of each vertex.(use one-ring, need process each vertex) 9 | 3. write obj(seems that it can be verctorized? anyway, writing it in c++ is simple, so also add function here. --> however, why writting in c++ is still slow?) 10 | */ 11 | 12 | #include "rasterize.h" 13 | 14 | 15 | /* Judge whether the Point is in the triangle 16 | Method: 17 | http://blackpawn.com/texts/pointinpoly/ 18 | Args: 19 | Point: [x, y] 20 | tri_points: three vertices(2d points) of a triangle. 
2 coords x 3 vertices 21 | Returns: 22 | bool: true for in triangle 23 | */ 24 | bool is_point_in_tri(Point p, Point p0, Point p1, Point p2) { 25 | // vectors 26 | Point v0, v1, v2; 27 | v0 = p2 - p0; 28 | v1 = p1 - p0; 29 | v2 = p - p0; 30 | 31 | // dot products 32 | float dot00 = v0.dot(v0); //v0.x * v0.x + v0.y * v0.y //np.dot(v0.T, v0) 33 | float dot01 = v0.dot(v1); //v0.x * v1.x + v0.y * v1.y //np.dot(v0.T, v1) 34 | float dot02 = v0.dot(v2); //v0.x * v2.x + v0.y * v2.y //np.dot(v0.T, v2) 35 | float dot11 = v1.dot(v1); //v1.x * v1.x + v1.y * v1.y //np.dot(v1.T, v1) 36 | float dot12 = v1.dot(v2); //v1.x * v2.x + v1.y * v2.y//np.dot(v1.T, v2) 37 | 38 | // barycentric coordinates 39 | float inverDeno; 40 | if (dot00 * dot11 - dot01 * dot01 == 0) 41 | inverDeno = 0; 42 | else 43 | inverDeno = 1 / (dot00 * dot11 - dot01 * dot01); 44 | 45 | float u = (dot11 * dot02 - dot01 * dot12) * inverDeno; 46 | float v = (dot00 * dot12 - dot01 * dot02) * inverDeno; 47 | 48 | // check if Point in triangle 49 | return (u >= 0) && (v >= 0) && (u + v < 1); 50 | } 51 | 52 | void get_point_weight(float *weight, Point p, Point p0, Point p1, Point p2) { 53 | // vectors 54 | Point v0, v1, v2; 55 | v0 = p2 - p0; 56 | v1 = p1 - p0; 57 | v2 = p - p0; 58 | 59 | // dot products 60 | float dot00 = v0.dot(v0); //v0.x * v0.x + v0.y * v0.y //np.dot(v0.T, v0) 61 | float dot01 = v0.dot(v1); //v0.x * v1.x + v0.y * v1.y //np.dot(v0.T, v1) 62 | float dot02 = v0.dot(v2); //v0.x * v2.x + v0.y * v2.y //np.dot(v0.T, v2) 63 | float dot11 = v1.dot(v1); //v1.x * v1.x + v1.y * v1.y //np.dot(v1.T, v1) 64 | float dot12 = v1.dot(v2); //v1.x * v2.x + v1.y * v2.y//np.dot(v1.T, v2) 65 | 66 | // barycentric coordinates 67 | float inverDeno; 68 | if (dot00 * dot11 - dot01 * dot01 == 0) 69 | inverDeno = 0; 70 | else 71 | inverDeno = 1 / (dot00 * dot11 - dot01 * dot01); 72 | 73 | float u = (dot11 * dot02 - dot01 * dot12) * inverDeno; 74 | float v = (dot00 * dot12 - dot01 * dot02) * inverDeno; 75 | 76 | // weight 77 | weight[0] = 1 - u - v; 78 | weight[1] = v; 79 | weight[2] = u; 80 | } 81 | 82 | /* 83 | * Get normals of triangles. 
84 | */ 85 | void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int ntri, bool norm_flg) { 86 | int tri_p0_ind, tri_p1_ind, tri_p2_ind; 87 | float v1x, v1y, v1z, v2x, v2y, v2z; 88 | 89 | for (int i = 0; i < ntri; i++) { 90 | tri_p0_ind = triangles[3 * i]; 91 | tri_p1_ind = triangles[3 * i + 1]; 92 | tri_p2_ind = triangles[3 * i + 2]; 93 | 94 | // counter clockwise order 95 | v1x = vertices[3 * tri_p1_ind] - vertices[3 * tri_p0_ind]; 96 | v1y = vertices[3 * tri_p1_ind + 1] - vertices[3 * tri_p0_ind + 1]; 97 | v1z = vertices[3 * tri_p1_ind + 2] - vertices[3 * tri_p0_ind + 2]; 98 | 99 | v2x = vertices[3 * tri_p2_ind] - vertices[3 * tri_p0_ind]; 100 | v2y = vertices[3 * tri_p2_ind + 1] - vertices[3 * tri_p0_ind + 1]; 101 | v2z = vertices[3 * tri_p2_ind + 2] - vertices[3 * tri_p0_ind + 2]; 102 | 103 | if (norm_flg) { 104 | float c1 = v1y * v2z - v1z * v2y; 105 | float c2 = v1z * v2x - v1x * v2z; 106 | float c3 = v1x * v2y - v1y * v2x; 107 | float det = sqrt(c1 * c1 + c2 * c2 + c3 * c3); 108 | if (det <= 0) det = 1e-6; 109 | tri_normal[3 * i] = c1 / det; 110 | tri_normal[3 * i + 1] = c2 / det; 111 | tri_normal[3 * i + 2] = c3 / det; 112 | } else { 113 | tri_normal[3 * i] = v1y * v2z - v1z * v2y; 114 | tri_normal[3 * i + 1] = v1z * v2x - v1x * v2z; 115 | tri_normal[3 * i + 2] = v1x * v2y - v1y * v2x; 116 | } 117 | } 118 | } 119 | 120 | /* 121 | * Get normal vector of vertices using triangle normals 122 | */ 123 | void _get_ver_normal(float *ver_normal, float *tri_normal, int *triangles, int nver, int ntri) { 124 | int tri_p0_ind, tri_p1_ind, tri_p2_ind; 125 | 126 | for (int i = 0; i < ntri; i++) { 127 | tri_p0_ind = triangles[3 * i]; 128 | tri_p1_ind = triangles[3 * i + 1]; 129 | tri_p2_ind = triangles[3 * i + 2]; 130 | 131 | for (int j = 0; j < 3; j++) { 132 | ver_normal[3 * tri_p0_ind + j] += tri_normal[3 * i + j]; 133 | ver_normal[3 * tri_p1_ind + j] += tri_normal[3 * i + j]; 134 | ver_normal[3 * tri_p2_ind + j] += tri_normal[3 * i + j]; 135 | } 136 | } 137 | 138 | // normalizing 139 | float nx, ny, nz, det; 140 | for (int i = 0; i < nver; ++i) { 141 | nx = ver_normal[3 * i]; 142 | ny = ver_normal[3 * i + 1]; 143 | nz = ver_normal[3 * i + 2]; 144 | 145 | det = sqrt(nx * nx + ny * ny + nz * nz); 146 | if (det <= 0) det = 1e-6; 147 | ver_normal[3 * i] = nx / det; 148 | ver_normal[3 * i + 1] = ny / det; 149 | ver_normal[3 * i + 2] = nz / det; 150 | } 151 | } 152 | 153 | /* 154 | * Directly get normal of vertices, which can be regraded as a combination of _get_tri_normal and _get_ver_normal 155 | */ 156 | void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri) { 157 | int tri_p0_ind, tri_p1_ind, tri_p2_ind; 158 | float v1x, v1y, v1z, v2x, v2y, v2z; 159 | 160 | // get tri_normal 161 | // float tri_normal[3 * ntri]; 162 | float* tri_normal; 163 | tri_normal = new float [3 * ntri]; 164 | for (int i = 0; i < ntri; i++) { 165 | tri_p0_ind = triangles[3 * i]; 166 | tri_p1_ind = triangles[3 * i + 1]; 167 | tri_p2_ind = triangles[3 * i + 2]; 168 | 169 | // counter clockwise order 170 | v1x = vertices[3 * tri_p1_ind] - vertices[3 * tri_p0_ind]; 171 | v1y = vertices[3 * tri_p1_ind + 1] - vertices[3 * tri_p0_ind + 1]; 172 | v1z = vertices[3 * tri_p1_ind + 2] - vertices[3 * tri_p0_ind + 2]; 173 | 174 | v2x = vertices[3 * tri_p2_ind] - vertices[3 * tri_p0_ind]; 175 | v2y = vertices[3 * tri_p2_ind + 1] - vertices[3 * tri_p0_ind + 1]; 176 | v2z = vertices[3 * tri_p2_ind + 2] - vertices[3 * tri_p0_ind + 2]; 177 | 178 | 179 | tri_normal[3 * i] = v1y * v2z - 
v1z * v2y; 180 | tri_normal[3 * i + 1] = v1z * v2x - v1x * v2z; 181 | tri_normal[3 * i + 2] = v1x * v2y - v1y * v2x; 182 | 183 | } 184 | 185 | // get ver_normal 186 | for (int i = 0; i < ntri; i++) { 187 | tri_p0_ind = triangles[3 * i]; 188 | tri_p1_ind = triangles[3 * i + 1]; 189 | tri_p2_ind = triangles[3 * i + 2]; 190 | 191 | for (int j = 0; j < 3; j++) { 192 | ver_normal[3 * tri_p0_ind + j] += tri_normal[3 * i + j]; 193 | ver_normal[3 * tri_p1_ind + j] += tri_normal[3 * i + j]; 194 | ver_normal[3 * tri_p2_ind + j] += tri_normal[3 * i + j]; 195 | } 196 | } 197 | 198 | // normalizing 199 | float nx, ny, nz, det; 200 | for (int i = 0; i < nver; ++i) { 201 | nx = ver_normal[3 * i]; 202 | ny = ver_normal[3 * i + 1]; 203 | nz = ver_normal[3 * i + 2]; 204 | 205 | det = sqrt(nx * nx + ny * ny + nz * nz); 206 | // if (det <= 0) det = 1e-6; 207 | ver_normal[3 * i] = nx / det; 208 | ver_normal[3 * i + 1] = ny / det; 209 | ver_normal[3 * i + 2] = nz / det; 210 | } 211 | 212 | delete[] tri_normal; 213 | } 214 | 215 | // rasterization by Z-Buffer with optimization 216 | // Complexity: < ntri * h * w * c 217 | void _rasterize( 218 | unsigned char *image, float *vertices, int *triangles, float *colors, float *depth_buffer, 219 | int ntri, int h, int w, int c, float alpha, bool reverse) { 220 | int x, y, k; 221 | int tri_p0_ind, tri_p1_ind, tri_p2_ind; 222 | Point p0, p1, p2, p; 223 | int x_min, x_max, y_min, y_max; 224 | float p_depth, p0_depth, p1_depth, p2_depth; 225 | float p_color, p0_color, p1_color, p2_color; 226 | float weight[3]; 227 | 228 | for (int i = 0; i < ntri; i++) { 229 | tri_p0_ind = triangles[3 * i]; 230 | tri_p1_ind = triangles[3 * i + 1]; 231 | tri_p2_ind = triangles[3 * i + 2]; 232 | 233 | p0.x = vertices[3 * tri_p0_ind]; 234 | p0.y = vertices[3 * tri_p0_ind + 1]; 235 | p0_depth = vertices[3 * tri_p0_ind + 2]; 236 | p1.x = vertices[3 * tri_p1_ind]; 237 | p1.y = vertices[3 * tri_p1_ind + 1]; 238 | p1_depth = vertices[3 * tri_p1_ind + 2]; 239 | p2.x = vertices[3 * tri_p2_ind]; 240 | p2.y = vertices[3 * tri_p2_ind + 1]; 241 | p2_depth = vertices[3 * tri_p2_ind + 2]; 242 | 243 | x_min = max((int) floor(min(p0.x, min(p1.x, p2.x))), 0); 244 | x_max = min((int) ceil(max(p0.x, max(p1.x, p2.x))), w - 1); 245 | 246 | y_min = max((int) floor(min(p0.y, min(p1.y, p2.y))), 0); 247 | y_max = min((int) ceil(max(p0.y, max(p1.y, p2.y))), h - 1); 248 | 249 | if (x_max < x_min || y_max < y_min) { 250 | continue; 251 | } 252 | 253 | for (y = y_min; y <= y_max; y++) { 254 | for (x = x_min; x <= x_max; x++) { 255 | p.x = x; 256 | p.y = y; 257 | if (is_point_in_tri(p, p0, p1, p2)) { 258 | get_point_weight(weight, p, p0, p1, p2); 259 | p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth; 260 | 261 | if ((p_depth > depth_buffer[y * w + x])) { 262 | for (k = 0; k < c; k++) { 263 | p0_color = colors[c * tri_p0_ind + k]; 264 | p1_color = colors[c * tri_p1_ind + k]; 265 | p2_color = colors[c * tri_p2_ind + k]; 266 | 267 | p_color = weight[0] * p0_color + weight[1] * p1_color + weight[2] * p2_color; 268 | if (reverse) { 269 | image[(h - 1 - y) * w * c + x * c + k] = (unsigned char) ( 270 | (1 - alpha) * image[(h - 1 - y) * w * c + x * c + k] + alpha * 255 * p_color); 271 | // image[(h - 1 - y) * w * c + x * c + k] = (unsigned char) (255 * p_color); 272 | } else { 273 | image[y * w * c + x * c + k] = (unsigned char) ( 274 | (1 - alpha) * image[y * w * c + x * c + k] + alpha * 255 * p_color); 275 | // image[y * w * c + x * c + k] = (unsigned char) (255 * p_color); 276 | } 277 | } 278 | 
279 | depth_buffer[y * w + x] = p_depth; 280 | } 281 | } 282 | } 283 | } 284 | } 285 | } 286 | 287 | 288 | void _rasterize_triangles( 289 | float *vertices, int *triangles, float *depth_buffer, int *triangle_buffer, float *barycentric_weight, 290 | int ntri, int h, int w) { 291 | int i; 292 | int x, y, k; 293 | int tri_p0_ind, tri_p1_ind, tri_p2_ind; 294 | Point p0, p1, p2, p; 295 | int x_min, x_max, y_min, y_max; 296 | float p_depth, p0_depth, p1_depth, p2_depth; 297 | float weight[3]; 298 | 299 | for (i = 0; i < ntri; i++) { 300 | tri_p0_ind = triangles[3 * i]; 301 | tri_p1_ind = triangles[3 * i + 1]; 302 | tri_p2_ind = triangles[3 * i + 2]; 303 | 304 | p0.x = vertices[3 * tri_p0_ind]; 305 | p0.y = vertices[3 * tri_p0_ind + 1]; 306 | p0_depth = vertices[3 * tri_p0_ind + 2]; 307 | p1.x = vertices[3 * tri_p1_ind]; 308 | p1.y = vertices[3 * tri_p1_ind + 1]; 309 | p1_depth = vertices[3 * tri_p1_ind + 2]; 310 | p2.x = vertices[3 * tri_p2_ind]; 311 | p2.y = vertices[3 * tri_p2_ind + 1]; 312 | p2_depth = vertices[3 * tri_p2_ind + 2]; 313 | 314 | x_min = max((int) ceil(min(p0.x, min(p1.x, p2.x))), 0); 315 | x_max = min((int) floor(max(p0.x, max(p1.x, p2.x))), w - 1); 316 | 317 | y_min = max((int) ceil(min(p0.y, min(p1.y, p2.y))), 0); 318 | y_max = min((int) floor(max(p0.y, max(p1.y, p2.y))), h - 1); 319 | 320 | if (x_max < x_min || y_max < y_min) { 321 | continue; 322 | } 323 | 324 | for (y = y_min; y <= y_max; y++) //h 325 | { 326 | for (x = x_min; x <= x_max; x++) //w 327 | { 328 | p.x = x; 329 | p.y = y; 330 | // if (p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || is_point_in_tri(p, p0, p1, p2)) { 331 | if (is_point_in_tri(p, p0, p1, p2)) { 332 | get_point_weight(weight, p, p0, p1, p2); 333 | p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth; 334 | 335 | if ((p_depth > depth_buffer[y * w + x])) { 336 | depth_buffer[y * w + x] = p_depth; 337 | triangle_buffer[y * w + x] = i; 338 | for (k = 0; k < 3; k++) { 339 | barycentric_weight[y * w * 3 + x * 3 + k] = weight[k]; 340 | } 341 | } 342 | } 343 | } 344 | } 345 | } 346 | } 347 | 348 | 349 | // Depth-Buffer 算法 350 | // https://blog.csdn.net/Jurbo/article/details/75007260 351 | void _render_texture_core( 352 | float *image, float *vertices, int *triangles, 353 | float *texture, float *tex_coords, int *tex_triangles, 354 | float *depth_buffer, 355 | int nver, int tex_nver, int ntri, 356 | int h, int w, int c, 357 | int tex_h, int tex_w, int tex_c, 358 | int mapping_type) { 359 | int i; 360 | int x, y, k; 361 | int tri_p0_ind, tri_p1_ind, tri_p2_ind; 362 | int tex_tri_p0_ind, tex_tri_p1_ind, tex_tri_p2_ind; 363 | Point p0, p1, p2, p; 364 | Point tex_p0, tex_p1, tex_p2, tex_p; 365 | int x_min, x_max, y_min, y_max; 366 | float weight[3]; 367 | float p_depth, p0_depth, p1_depth, p2_depth; 368 | float xd, yd; 369 | float ul, ur, dl, dr; 370 | for (i = 0; i < ntri; i++) { 371 | // mesh 372 | tri_p0_ind = triangles[3 * i]; 373 | tri_p1_ind = triangles[3 * i + 1]; 374 | tri_p2_ind = triangles[3 * i + 2]; 375 | 376 | p0.x = vertices[3 * tri_p0_ind]; 377 | p0.y = vertices[3 * tri_p0_ind + 1]; 378 | p0_depth = vertices[3 * tri_p0_ind + 2]; 379 | p1.x = vertices[3 * tri_p1_ind]; 380 | p1.y = vertices[3 * tri_p1_ind + 1]; 381 | p1_depth = vertices[3 * tri_p1_ind + 2]; 382 | p2.x = vertices[3 * tri_p2_ind]; 383 | p2.y = vertices[3 * tri_p2_ind + 1]; 384 | p2_depth = vertices[3 * tri_p2_ind + 2]; 385 | 386 | // texture 387 | tex_tri_p0_ind = tex_triangles[3 * i]; 388 | tex_tri_p1_ind = tex_triangles[3 * i + 1]; 389 | tex_tri_p2_ind 
= tex_triangles[3 * i + 2]; 390 | 391 | tex_p0.x = tex_coords[3 * tex_tri_p0_ind]; 392 | tex_p0.y = tex_coords[3 * tri_p0_ind + 1]; 393 | tex_p1.x = tex_coords[3 * tex_tri_p1_ind]; 394 | tex_p1.y = tex_coords[3 * tri_p1_ind + 1]; 395 | tex_p2.x = tex_coords[3 * tex_tri_p2_ind]; 396 | tex_p2.y = tex_coords[3 * tri_p2_ind + 1]; 397 | 398 | 399 | x_min = max((int) ceil(min(p0.x, min(p1.x, p2.x))), 0); 400 | x_max = min((int) floor(max(p0.x, max(p1.x, p2.x))), w - 1); 401 | 402 | y_min = max((int) ceil(min(p0.y, min(p1.y, p2.y))), 0); 403 | y_max = min((int) floor(max(p0.y, max(p1.y, p2.y))), h - 1); 404 | 405 | 406 | if (x_max < x_min || y_max < y_min) { 407 | continue; 408 | } 409 | 410 | for (y = y_min; y <= y_max; y++) //h 411 | { 412 | for (x = x_min; x <= x_max; x++) //w 413 | { 414 | p.x = x; 415 | p.y = y; 416 | if (p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || is_point_in_tri(p, p0, p1, p2)) { 417 | get_point_weight(weight, p, p0, p1, p2); 418 | p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth; 419 | 420 | if ((p_depth > depth_buffer[y * w + x])) { 421 | // -- color from texture 422 | // cal weight in mesh tri 423 | get_point_weight(weight, p, p0, p1, p2); 424 | // cal coord in texture 425 | tex_p = tex_p0 * weight[0] + tex_p1 * weight[1] + tex_p2 * weight[2]; 426 | tex_p.x = max(min(tex_p.x, float(tex_w - 1)), float(0)); 427 | tex_p.y = max(min(tex_p.y, float(tex_h - 1)), float(0)); 428 | 429 | yd = tex_p.y - floor(tex_p.y); 430 | xd = tex_p.x - floor(tex_p.x); 431 | for (k = 0; k < c; k++) { 432 | if (mapping_type == 0)// nearest 433 | { 434 | image[y * w * c + x * c + k] = texture[int(round(tex_p.y)) * tex_w * tex_c + 435 | int(round(tex_p.x)) * tex_c + k]; 436 | } else//bilinear interp 437 | { 438 | ul = texture[(int) floor(tex_p.y) * tex_w * tex_c + (int) floor(tex_p.x) * tex_c + k]; 439 | ur = texture[(int) floor(tex_p.y) * tex_w * tex_c + (int) ceil(tex_p.x) * tex_c + k]; 440 | dl = texture[(int) ceil(tex_p.y) * tex_w * tex_c + (int) floor(tex_p.x) * tex_c + k]; 441 | dr = texture[(int) ceil(tex_p.y) * tex_w * tex_c + (int) ceil(tex_p.x) * tex_c + k]; 442 | 443 | image[y * w * c + x * c + k] = 444 | ul * (1 - xd) * (1 - yd) + ur * xd * (1 - yd) + dl * (1 - xd) * yd + 445 | dr * xd * yd; 446 | } 447 | 448 | } 449 | 450 | depth_buffer[y * w + x] = p_depth; 451 | } 452 | } 453 | } 454 | } 455 | } 456 | } 457 | 458 | 459 | // ------------------------------------------------- write 460 | // obj write 461 | // Ref: https://github.com/patrikhuber/eos/blob/master/include/eos/core/Mesh.hpp 462 | void _write_obj_with_colors_texture(string filename, string mtl_name, 463 | float *vertices, int *triangles, float *colors, float *uv_coords, 464 | int nver, int ntri, int ntexver) { 465 | int i; 466 | 467 | ofstream obj_file(filename); 468 | 469 | // first line of the obj file: the mtl name 470 | obj_file << "mtllib " << mtl_name << endl; 471 | 472 | // write vertices 473 | for (i = 0; i < nver; ++i) { 474 | obj_file << "v " << vertices[3 * i] << " " << vertices[3 * i + 1] << " " << vertices[3 * i + 2] << colors[3 * i] 475 | << " " << colors[3 * i + 1] << " " << colors[3 * i + 2] << endl; 476 | } 477 | 478 | // write uv coordinates 479 | for (i = 0; i < ntexver; ++i) { 480 | //obj_file << "vt " << uv_coords[2*i] << " " << (1 - uv_coords[2*i + 1]) << endl; 481 | obj_file << "vt " << uv_coords[2 * i] << " " << uv_coords[2 * i + 1] << endl; 482 | } 483 | 484 | obj_file << "usemtl FaceTexture" << endl; 485 | // write triangles 486 | for (i = 0; i < ntri; ++i) { 487 
| // obj_file << "f " << triangles[3*i] << "/" << triangles[3*i] << " " << triangles[3*i + 1] << "/" << triangles[3*i + 1] << " " << triangles[3*i + 2] << "/" << triangles[3*i + 2] << endl; 488 | obj_file << "f " << triangles[3 * i + 2] << "/" << triangles[3 * i + 2] << " " << triangles[3 * i + 1] << "/" 489 | << triangles[3 * i + 1] << " " << triangles[3 * i] << "/" << triangles[3 * i] << endl; 490 | } 491 | 492 | } 493 | -------------------------------------------------------------------------------- /Sim3DR/lighting.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from .Sim3DR import get_normal, rasterize 5 | 6 | _norm = lambda arr: arr / np.sqrt(np.sum(arr ** 2, axis=1))[:, None] 7 | 8 | 9 | def norm_vertices(vertices): 10 | vertices -= vertices.min(0)[None, :] 11 | vertices /= vertices.max() 12 | vertices *= 2 13 | vertices -= vertices.max(0)[None, :] / 2 14 | return vertices 15 | 16 | 17 | def convert_type(obj): 18 | if isinstance(obj, tuple) or isinstance(obj, list): 19 | return np.array(obj, dtype=np.float32)[None, :] 20 | return obj 21 | 22 | 23 | class RenderPipeline(object): 24 | def __init__(self, **kwargs): 25 | self.intensity_ambient = convert_type(kwargs.get('intensity_ambient', 0.3)) 26 | self.intensity_directional = convert_type(kwargs.get('intensity_directional', 0.6)) 27 | self.intensity_specular = convert_type(kwargs.get('intensity_specular', 0.1)) 28 | self.specular_exp = kwargs.get('specular_exp', 5) 29 | self.color_ambient = convert_type(kwargs.get('color_ambient', (1, 1, 1))) 30 | self.color_directional = convert_type(kwargs.get('color_directional', (1, 1, 1))) 31 | self.light_pos = convert_type(kwargs.get('light_pos', (0, 0, 5))) 32 | self.view_pos = convert_type(kwargs.get('view_pos', (0, 0, 5))) 33 | 34 | def update_light_pos(self, light_pos): 35 | self.light_pos = convert_type(light_pos) 36 | 37 | def __call__(self, vertices, triangles, bg, texture=None): 38 | normal = get_normal(vertices, triangles) 39 | 40 | # 2. lighting 41 | light = np.zeros_like(vertices, dtype=np.float32) 42 | # ambient component 43 | if self.intensity_ambient > 0: 44 | light += self.intensity_ambient * self.color_ambient 45 | 46 | vertices_n = norm_vertices(vertices.copy()) 47 | if self.intensity_directional > 0: 48 | # diffuse component 49 | direction = _norm(self.light_pos - vertices_n) 50 | cos = np.sum(normal * direction, axis=1)[:, None] 51 | # cos = np.clip(cos, 0, 1) 52 | # todo: check below 53 | light += self.intensity_directional * (self.color_directional * np.clip(cos, 0, 1)) 54 | 55 | # specular component 56 | if self.intensity_specular > 0: 57 | v2v = _norm(self.view_pos - vertices_n) 58 | reflection = 2 * cos * normal - direction 59 | spe = np.sum((v2v * reflection) ** self.specular_exp, axis=1)[:, None] 60 | spe = np.where(cos != 0, np.clip(spe, 0, 1), np.zeros_like(spe)) 61 | light += self.intensity_specular * self.color_directional * np.clip(spe, 0, 1) 62 | light = np.clip(light, 0, 1) 63 | 64 | # 2. 
rasterization, [0, 1] 65 | if texture is None: 66 | render_img = rasterize(vertices, triangles, light, bg=bg) 67 | return render_img 68 | else: 69 | texture *= light 70 | render_img = rasterize(vertices, triangles, texture, bg=bg) 71 | return render_img 72 | 73 | 74 | def main(): 75 | pass 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /Sim3DR/readme.md: -------------------------------------------------------------------------------- 1 | ## Sim3DR 2 | This is a simple 3D render, written by c++ and cython. 3 | 4 | ### Build Sim3DR 5 | 6 | ```shell script 7 | python3 setup.py build_ext --inplace 8 | ``` -------------------------------------------------------------------------------- /Sim3DR/setup.py: -------------------------------------------------------------------------------- 1 | ''' 2 | python setup.py build_ext -i 3 | to compile 4 | ''' 5 | 6 | from distutils.core import setup, Extension 7 | from Cython.Build import cythonize 8 | from Cython.Distutils import build_ext 9 | import numpy 10 | 11 | setup( 12 | name='Sim3DR_Cython', # not the package name 13 | cmdclass={'build_ext': build_ext}, 14 | ext_modules=[Extension("Sim3DR_Cython", 15 | sources=["lib/rasterize.pyx", "lib/rasterize_kernel.cpp"], 16 | language='c++', 17 | include_dirs=[numpy.get_include()], 18 | extra_compile_args=["-std=c++11"])], 19 | ) 20 | -------------------------------------------------------------------------------- /Sim3DR/tests/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /Sim3DR/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(TARGET test) 4 | project(${TARGET}) 5 | 6 | #find_package( OpenCV REQUIRED ) 7 | #include_directories( ${OpenCV_INCLUDE_DIRS} ) 8 | 9 | #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -O3") 10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -std=c++11") 11 | add_executable(${TARGET} test.cpp rasterize_kernel.cpp io.cpp) 12 | target_include_directories(${TARGET} PRIVATE ${PROJECT_SOURCE_DIR}) 13 | -------------------------------------------------------------------------------- /Sim3DR/tests/io.cpp: -------------------------------------------------------------------------------- 1 | #include "io.h" 2 | 3 | //void load_obj(const string obj_fp, float* vertices, float* colors, float* triangles){ 4 | // string line; 5 | // ifstream in(obj_fp); 6 | // 7 | // if(in.is_open()){ 8 | // while (getline(in, line)){ 9 | // stringstream ss(line); 10 | // 11 | // char t; // type: v, f 12 | // ss >> t; 13 | // if (t == 'v'){ 14 | // 15 | // } 16 | // } 17 | // } 18 | //} 19 | 20 | void load_obj(const char *obj_fp, float *vertices, float *colors, int *triangles, int nver, int ntri) { 21 | FILE *fp; 22 | fp = fopen(obj_fp, "r"); 23 | 24 | char t; // type: v or f 25 | if (fp != nullptr) { 26 | for (int i = 0; i < nver; ++i) { 27 | fscanf(fp, "%c", &t); 28 | for (int j = 0; j < 3; ++j) 29 | fscanf(fp, " %f", &vertices[3 * i + j]); 30 | for (int j = 0; j < 3; ++j) 31 | fscanf(fp, " %f", &colors[3 * i + j]); 32 | fscanf(fp, "\n"); 33 | } 34 | // fscanf(fp, "%c", &t); 35 | for (int i = 0; i < ntri; ++i) { 36 | fscanf(fp, "%c", &t); 37 | for (int j = 0; j < 3; ++j) { 38 | fscanf(fp, " %d", &triangles[3 * i + j]); 39 | triangles[3 * i + j] -= 1; 40 | } 41 | fscanf(fp, "\n"); 42 | 
} 43 | 44 | fclose(fp); 45 | } 46 | } 47 | 48 | void load_ply(const char *ply_fp, float *vertices, int *triangles, int nver, int ntri) { 49 | FILE *fp; 50 | fp = fopen(ply_fp, "r"); 51 | 52 | // char s[256]; 53 | char t; 54 | if (fp != nullptr) { 55 | // for (int i = 0; i < 9; ++i) 56 | // fscanf(fp, "%s", s); 57 | for (int i = 0; i < nver; ++i) 58 | fscanf(fp, "%f %f %f\n", &vertices[3 * i], &vertices[3 * i + 1], &vertices[3 * i + 2]); 59 | 60 | for (int i = 0; i < ntri; ++i) 61 | fscanf(fp, "%c %d %d %d\n", &t, &triangles[3 * i], &triangles[3 * i + 1], &triangles[3 * i + 2]); 62 | 63 | fclose(fp); 64 | } 65 | } 66 | 67 | void write_ppm(const char *filename, unsigned char *img, int h, int w, int c) { 68 | FILE *fp; 69 | //open file for output 70 | fp = fopen(filename, "wb"); 71 | if (!fp) { 72 | fprintf(stderr, "Unable to open file '%s'\n", filename); 73 | exit(1); 74 | } 75 | 76 | //write the header file 77 | //image format 78 | fprintf(fp, "P6\n"); 79 | 80 | //image size 81 | fprintf(fp, "%d %d\n", w, h); 82 | 83 | // rgb component depth 84 | fprintf(fp, "%d\n", MAX_PXL_VALUE); 85 | 86 | // pixel data 87 | fwrite(img, sizeof(unsigned char), size_t(h * w * c), fp); 88 | fclose(fp); 89 | } -------------------------------------------------------------------------------- /Sim3DR/tests/io.h: -------------------------------------------------------------------------------- 1 | #ifndef IO_H_ 2 | #define IO_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | #define MAX_PXL_VALUE 255 13 | 14 | void load_obj(const char* obj_fp, float* vertices, float* colors, int* triangles, int nver, int ntri); 15 | void load_ply(const char* ply_fp, float* vertices, int* triangles, int nver, int ntri); 16 | 17 | 18 | void write_ppm(const char *filename, unsigned char *img, int h, int w, int c); 19 | 20 | #endif -------------------------------------------------------------------------------- /Sim3DR/tests/test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Tesing cases 3 | */ 4 | 5 | #include 6 | #include 7 | #include "rasterize.h" 8 | #include "io.h" 9 | 10 | void test_isPointInTri() { 11 | Point p0(0, 0); 12 | Point p1(1, 0); 13 | Point p2(1, 1); 14 | 15 | Point p(0.2, 0.2); 16 | 17 | if (is_point_in_tri(p, p0, p1, p2)) 18 | std::cout << "In"; 19 | else 20 | std::cout << "Out"; 21 | std::cout << std::endl; 22 | } 23 | 24 | void test_getPointWeight() { 25 | Point p0(0, 0); 26 | Point p1(1, 0); 27 | Point p2(1, 1); 28 | 29 | Point p(0.2, 0.2); 30 | 31 | float weight[3]; 32 | get_point_weight(weight, p, p0, p1, p2); 33 | std::cout << weight[0] << " " << weight[1] << " " << weight[2] << std::endl; 34 | } 35 | 36 | void test_get_tri_normal() { 37 | float tri_normal[3]; 38 | // float vertices[9] = {1, 0, 0, 0, 0, 0, 0, 1, 0}; 39 | float vertices[9] = {1, 1.1, 0, 0, 0, 0, 0, 0.6, 0.7}; 40 | int triangles[3] = {0, 1, 2}; 41 | int ntri = 1; 42 | 43 | _get_tri_normal(tri_normal, vertices, triangles, ntri); 44 | 45 | for (int i = 0; i < 3; ++i) 46 | std::cout << tri_normal[i] << ", "; 47 | std::cout << std::endl; 48 | } 49 | 50 | void test_load_obj() { 51 | const char *fp = "../data/vd005_mesh.obj"; 52 | int nver = 35709; 53 | int ntri = 70789; 54 | 55 | auto *vertices = new float[nver]; 56 | auto *colors = new float[nver]; 57 | auto *triangles = new int[ntri]; 58 | load_obj(fp, vertices, colors, triangles, nver, ntri); 59 | 60 | delete[] vertices; 61 | delete[] colors; 62 | delete[] triangles; 63 | } 64 | 65 | 
void test_render() { 66 | // 1. loading obj 67 | // const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/vd005_mesh.obj"; 68 | const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/face1.obj"; 69 | int nver = 35709; //53215; //35709; 70 | int ntri = 70789; //105840;//70789; 71 | 72 | auto *vertices = new float[3 * nver]; 73 | auto *colors = new float[3 * nver]; 74 | auto *triangles = new int[3 * ntri]; 75 | load_obj(fp, vertices, colors, triangles, nver, ntri); 76 | 77 | // 2. rendering 78 | int h = 224, w = 224, c = 3; 79 | 80 | // enlarging 81 | int scale = 4; 82 | h *= scale; 83 | w *= scale; 84 | for (int i = 0; i < nver * 3; ++i) vertices[i] *= scale; 85 | 86 | auto *image = new unsigned char[h * w * c](); 87 | auto *depth_buffer = new float[h * w](); 88 | 89 | for (int i = 0; i < h * w; ++i) depth_buffer[i] = -999999; 90 | 91 | clock_t t; 92 | t = clock(); 93 | 94 | _rasterize(image, vertices, triangles, colors, depth_buffer, ntri, h, w, c, true); 95 | t = clock() - t; 96 | double time_taken = ((double) t) / CLOCKS_PER_SEC; // in seconds 97 | printf("Render took %f seconds to execute \n", time_taken); 98 | 99 | 100 | // auto *image_char = new u_char[h * w * c](); 101 | // for (int i = 0; i < h * w * c; ++i) 102 | // image_char[i] = u_char(255 * image[i]); 103 | write_ppm("res.ppm", image, h, w, c); 104 | 105 | // delete[] image_char; 106 | delete[] vertices; 107 | delete[] colors; 108 | delete[] triangles; 109 | delete[] image; 110 | delete[] depth_buffer; 111 | } 112 | 113 | void test_light() { 114 | // 1. loading obj 115 | const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/emma_input_0_noheader.ply"; 116 | int nver = 53215; //35709; 117 | int ntri = 105840; //70789; 118 | 119 | auto *vertices = new float[3 * nver]; 120 | auto *colors = new float[3 * nver]; 121 | auto *triangles = new int[3 * ntri]; 122 | load_ply(fp, vertices, triangles, nver, ntri); 123 | 124 | // 2. rendering 125 | // int h = 1901, w = 3913, c = 3; 126 | int h = 2000, w = 4000, c = 3; 127 | 128 | // enlarging 129 | // int scale = 1; 130 | // h *= scale; 131 | // w *= scale; 132 | // for (int i = 0; i < nver * 3; ++i) vertices[i] *= scale; 133 | 134 | auto *image = new unsigned char[h * w * c](); 135 | auto *depth_buffer = new float[h * w](); 136 | 137 | for (int i = 0; i < h * w; ++i) depth_buffer[i] = -999999; 138 | for (int i = 0; i < 3 * nver; ++i) colors[i] = 0.8; 139 | 140 | clock_t t; 141 | t = clock(); 142 | 143 | _rasterize(image, vertices, triangles, colors, depth_buffer, ntri, h, w, c, true); 144 | t = clock() - t; 145 | double time_taken = ((double) t) / CLOCKS_PER_SEC; // in seconds 146 | printf("Render took %f seconds to execute \n", time_taken); 147 | 148 | 149 | // auto *image_char = new u_char[h * w * c](); 150 | // for (int i = 0; i < h * w * c; ++i) 151 | // image_char[i] = u_char(255 * image[i]); 152 | write_ppm("emma.ppm", image, h, w, c); 153 | 154 | // delete[] image_char; 155 | delete[] vertices; 156 | delete[] colors; 157 | delete[] triangles; 158 | delete[] image; 159 | delete[] depth_buffer; 160 | } 161 | 162 | int main(int argc, char *argv[]) { 163 | // std::cout << "Hello CMake!" 
<< std::endl; 164 | 165 | // test_isPointInTri(); 166 | // test_getPointWeight(); 167 | // test_get_tri_normal(); 168 | // test_load_obj(); 169 | // test_render(); 170 | test_light(); 171 | return 0; 172 | } -------------------------------------------------------------------------------- /backbone_nets/mobilenetv2_backbone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | __all__ = ['MobileNetV2', 'mobilenet_v2'] 6 | 7 | 8 | model_urls = { 9 | 'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth', 10 | } 11 | 12 | 13 | def _make_divisible(v, divisor, min_value=None): 14 | """ 15 | This function is taken from the original tf repo. 16 | It ensures that all layers have a channel number that is divisible by 8 17 | It can be seen here: 18 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 19 | :param v: 20 | :param divisor: 21 | :param min_value: 22 | :return: 23 | """ 24 | if min_value is None: 25 | min_value = divisor 26 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 27 | # Make sure that round down does not go down by more than 10%. 28 | if new_v < 0.9 * v: 29 | new_v += divisor 30 | return new_v 31 | 32 | 33 | class ConvBNReLU(nn.Sequential): 34 | def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None): 35 | padding = (kernel_size - 1) // 2 36 | if norm_layer is None: 37 | norm_layer = nn.BatchNorm2d 38 | super(ConvBNReLU, self).__init__( 39 | nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), 40 | norm_layer(out_planes), 41 | nn.ReLU6(inplace=True) 42 | ) 43 | 44 | 45 | class InvertedResidual(nn.Module): 46 | def __init__(self, inp, oup, stride, expand_ratio, norm_layer=None): 47 | super(InvertedResidual, self).__init__() 48 | self.stride = stride 49 | assert stride in [1, 2] 50 | 51 | if norm_layer is None: 52 | norm_layer = nn.BatchNorm2d 53 | 54 | hidden_dim = int(round(inp * expand_ratio)) 55 | self.use_res_connect = self.stride == 1 and inp == oup 56 | 57 | layers = [] 58 | if expand_ratio != 1: 59 | # pw 60 | layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer)) 61 | layers.extend([ 62 | # dw 63 | ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim, norm_layer=norm_layer), 64 | # pw-linear 65 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 66 | norm_layer(oup), 67 | ]) 68 | self.conv = nn.Sequential(*layers) 69 | 70 | def forward(self, x): 71 | if self.use_res_connect: 72 | return x + self.conv(x) 73 | else: 74 | return self.conv(x) 75 | 76 | 77 | class MobileNetV2(nn.Module): 78 | def __init__(self, 79 | num_classes=1000, 80 | width_mult=1.0, 81 | inverted_residual_setting=None, 82 | round_nearest=8, 83 | block=None, 84 | norm_layer=None, 85 | last_CN=None): 86 | """ 87 | MobileNet V2 main class 88 | Args: 89 | num_classes (int): Number of classes 90 | width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount 91 | inverted_residual_setting: Network structure 92 | round_nearest (int): Round the number of channels in each layer to be a multiple of this number 93 | Set to 1 to turn off rounding 94 | block: Module specifying inverted residual building block for mobilenet 95 | norm_layer: Module specifying the normalization layer to use 96 | """ 97 | super(MobileNetV2, self).__init__() 98 | 99 | if block is None: 100 | block = InvertedResidual 101 | 102 | 
if norm_layer is None: 103 | norm_layer = nn.BatchNorm2d 104 | 105 | input_channel = 32 106 | last_channel = 1280 107 | 108 | if inverted_residual_setting is None: 109 | inverted_residual_setting = [ 110 | # t, c, n, s 111 | [1, 16, 1, 1], 112 | [6, 24, 2, 2], 113 | [6, 32, 3, 2], 114 | [6, 64, 4, 2], 115 | [6, 96, 3, 1], 116 | [6, 160, 3, 2], 117 | [6, 320, 1, 1], 118 | ] 119 | 120 | # only check the first element, assuming user knows t,c,n,s are required 121 | if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: 122 | raise ValueError("inverted_residual_setting should be non-empty " 123 | "or a 4-element list, got {}".format(inverted_residual_setting)) 124 | 125 | # building first layer 126 | input_channel = _make_divisible(input_channel * width_mult, round_nearest) 127 | self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) 128 | features = [ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)] 129 | # building inverted residual blocks 130 | total = 0 131 | for t, c, n, s in inverted_residual_setting: 132 | output_channel = _make_divisible(c * width_mult, round_nearest) 133 | for i in range(n): 134 | total += 1 135 | stride = s if i == 0 else 1 136 | features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer)) 137 | input_channel = output_channel 138 | #building last several layers 139 | features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer)) 140 | # make it nn.Sequential 141 | self.features = nn.Sequential(*features) 142 | self.features_first = self.features[:9] 143 | self.features_second = self.features[9:] 144 | 145 | if not last_CN: 146 | self.last_CN = self.last_channel 147 | else: 148 | self.last_CN = last_CN 149 | 150 | # building classifier 151 | 152 | self.num_ori = 12 153 | self.num_shape = 40 154 | self.num_exp = 10 155 | self.num_texture = 40 156 | self.num_bin = 121 157 | self.num_scale = 1 158 | self.num_trans = 3 159 | 160 | if last_CN is not None: 161 | self.connector = nn.Sequential( 162 | nn.Linear(self.last_CN, self.last_CN//16), 163 | nn.ReLU6(inplace=True), 164 | nn.Linear(self.last_CN//16, self.last_CN), 165 | nn.ReLU6(inplace=True), 166 | nn.Sigmoid() 167 | ) 168 | self.adjuster = nn.Sequential( 169 | nn.Linear(self.last_CN, self.last_CN), 170 | nn.BatchNorm1d(self.last_CN)) 171 | 172 | self.classifier_ori = nn.Sequential( 173 | nn.Dropout(0.2), 174 | nn.Linear(self.last_CN, self.num_ori), 175 | ) 176 | self.classifier_shape = nn.Sequential( 177 | nn.Dropout(0.2), 178 | nn.Linear(self.last_CN, self.num_shape), 179 | ) 180 | self.classifier_exp = nn.Sequential( 181 | nn.Dropout(0.2), 182 | nn.Linear(self.last_CN, self.num_exp), 183 | ) 184 | self.classifier_texture = nn.Sequential( 185 | nn.Dropout(0.2), 186 | nn.Linear(self.last_CN, self.num_texture), 187 | ) 188 | 189 | # weight initialization 190 | for m in self.modules(): 191 | if isinstance(m, nn.Conv2d): 192 | nn.init.kaiming_normal_(m.weight, mode='fan_out') 193 | if m.bias is not None: 194 | nn.init.zeros_(m.bias) 195 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 196 | nn.init.ones_(m.weight) 197 | nn.init.zeros_(m.bias) 198 | elif isinstance(m, nn.Linear): 199 | nn.init.normal_(m.weight, 0, 0.01) 200 | nn.init.zeros_(m.bias) 201 | 202 | def _forward_impl(self, x): 203 | # This exists since TorchScript doesn't support inheritance, so the superclass method 204 | # (this one) needs to have a name other than `forward` that can be accessed in a 
subclass 205 | inter = self.features_first(x) 206 | x = self.features_second(inter) 207 | 208 | x = nn.functional.adaptive_avg_pool2d(x, 1) 209 | x = x.reshape(x.shape[0], -1) 210 | 211 | pool_x = x.clone() 212 | x_ori = self.classifier_ori(x) 213 | x_shape = self.classifier_shape(x) 214 | x_exp = self.classifier_exp(x) 215 | x_tex = self.classifier_texture(x) 216 | x = torch.cat((x_ori, x_shape, x_exp, x_tex), dim=1) 217 | 218 | return x, pool_x, inter 219 | 220 | def forward(self, x): 221 | return self._forward_impl(x) 222 | 223 | 224 | def mobilenet_v2(pretrained=False, progress=True, **kwargs): 225 | """ 226 | Constructs a MobileNetV2 architecture from 227 | `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. 228 | Args: 229 | pretrained (bool): If True, returns a model pre-trained on ImageNet 230 | progress (bool): If True, displays a progress bar of the download to stderr 231 | """ 232 | model = MobileNetV2(**kwargs) 233 | if pretrained: 234 | state_dict = torch.hub.load_state_dict_from_url(model_urls['mobilenet_v2'], 235 | progress=progress) 236 | model.load_state_dict(state_dict) 237 | return model -------------------------------------------------------------------------------- /cal_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make sure you have downloaded saved mesh from supervised learning and validation set for calculating the error 4 | 5 | python cal_size_ARE.py 6 | python cal_size_kpts.py -------------------------------------------------------------------------------- /cal_size_ARE.py: -------------------------------------------------------------------------------- 1 | # This script calculates the point-to-point face size (ARE) 2 | 3 | import numpy as np 4 | import glob 5 | from statistics import mean 6 | 7 | def read_obj(filename): 8 | f = open(filename) 9 | lines = f.readlines() 10 | coll = [] 11 | for l in lines: 12 | if l[0] != 'v': 13 | break 14 | comp = l.split()[1:] 15 | comp = list(map(float, comp)) 16 | coll.append(comp) 17 | 18 | a = np.asarray(coll) 19 | return a 20 | 21 | def read_xyz(filename): 22 | f = open(filename) 23 | lines = f.readlines() 24 | coll = [] 25 | for l in lines: 26 | comp = l.split() 27 | comp = list(map(float, comp)) 28 | coll.append(comp) 29 | a=np.asarray(coll) 30 | return a 31 | 32 | folders = glob.glob('data/all_test_result_3PerP_supervised_64/*') 33 | fore_name, cheek_name, ear_name, mid_name = [], [], [], [] 34 | 35 | for folder in folders: 36 | folder_name = folder.rsplit('/',1)[-1] 37 | print("Evaluating: ", folder_name) 38 | all_predictions = glob.glob(folder+'/*.obj') 39 | target_pts = read_xyz(glob.glob('data/A2E_val/'+folder_name+'/*.xyz')[0]) 40 | target_OICD = np.linalg.norm(target_pts[2217]-target_pts[14607]) 41 | target_foreD = np.linalg.norm(target_pts[1678]-target_pts[42117]) 42 | target_cheekD = np.linalg.norm(target_pts[2294]-target_pts[13635]) 43 | target_earD = np.linalg.norm(target_pts[20636]-target_pts[34153]) 44 | target_midD = np.linalg.norm(target_pts[2130]-target_pts[15003]) 45 | 46 | target_foreOICD = target_foreD/target_OICD 47 | target_cheekOICD = target_cheekD/target_OICD 48 | target_earOICD = target_earD/target_OICD 49 | target_midOICD = target_midD/target_OICD 50 | 51 | fore_err, cheek_err, ear_err, mid_err = [],[],[],[] 52 | 53 | for pred in all_predictions: 54 | pred_pts = read_obj(pred) 55 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607]) 56 | pred_pts *= (target_OICD/pred_OICD) 57 | pred_OICD = 
np.linalg.norm(pred_pts[2217]-pred_pts[14607]) 58 | pred_midD = np.linalg.norm(pred_pts[2130]-pred_pts[15003]) 59 | pred_foreD = np.linalg.norm(pred_pts[1678]-pred_pts[42117]) 60 | pred_cheekD = np.linalg.norm(pred_pts[2294]-pred_pts[13635]) 61 | pred_earD = np.linalg.norm(pred_pts[20636]-pred_pts[34153]) 62 | 63 | pred_midOICD = pred_midD/pred_OICD 64 | pred_foreOICD = pred_foreD/pred_OICD 65 | pred_cheekOICD = pred_cheekD/pred_OICD 66 | pred_earOICD = pred_earD/pred_OICD 67 | 68 | fore_err.append(abs(pred_foreOICD-target_foreOICD)) 69 | cheek_err.append(abs(pred_cheekOICD-target_cheekOICD)) 70 | ear_err.append(abs(pred_earOICD-target_earOICD)) 71 | mid_err.append(abs(pred_midOICD-target_midOICD)) 72 | 73 | 74 | fore_err_mean, cheek_err_mean, ear_err_mean, mid_err_mean = mean(fore_err), mean(cheek_err), mean(ear_err), mean(mid_err) 75 | fore_name.append(fore_err_mean) 76 | cheek_name.append(cheek_err_mean) 77 | mid_name.append(mid_err_mean) 78 | ear_name.append(ear_err_mean) 79 | 80 | print("Summary of the ARE:") 81 | print("-----------------------") 82 | print("Fore ratio error", mean(fore_name)) 83 | print("Cheek ratio error", mean(cheek_name)) 84 | print("Ear ratio error", mean(ear_name)) 85 | print("Mid ratio error", mean(mid_name)) 86 | -------------------------------------------------------------------------------- /cal_size_kpts.py: -------------------------------------------------------------------------------- 1 | # This script calculates the point-to-point face size (Keypoint) 2 | 3 | import numpy as np 4 | import glob 5 | from statistics import mean 6 | 7 | def read_obj(filename): 8 | f = open(filename) 9 | lines = f.readlines() 10 | coll = [] 11 | for l in lines: 12 | if l[0] != 'v': 13 | break 14 | comp = l.split()[1:] 15 | comp = list(map(float, comp)) 16 | coll.append(comp) 17 | 18 | a = np.asarray(coll) 19 | return a 20 | 21 | def read_xyz(filename): 22 | f = open(filename) 23 | lines = f.readlines() 24 | coll = [] 25 | for l in lines: 26 | comp = l.split() 27 | comp = list(map(float, comp)) 28 | coll.append(comp) 29 | a=np.asarray(coll) 30 | return a 31 | 32 | 33 | kpts = np.load('train.configs/keypoints_sim.npy') 34 | folders = glob.glob('data/all_test_result_3PerP_supervised_64/*') 35 | kpts_name = [] 36 | 37 | for folder in folders: 38 | folder_name = folder.rsplit('/',1)[-1] 39 | print("Evaluating: ", folder_name) 40 | all_predictions = glob.glob(folder+'/*.obj') 41 | target_pts = read_xyz(glob.glob('data/A2E_val/'+folder_name+'/*.xyz')[0]) 42 | target_OICD = np.linalg.norm(target_pts[2217]-target_pts[14607]) 43 | 44 | RMSE_col = [] 45 | 46 | for pred in all_predictions: 47 | pred_pts = read_obj(pred) 48 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607]) 49 | pred_pts *= (target_OICD/pred_OICD) 50 | pred_pts_flat = pred_pts.flatten(order='C') 51 | target_pts_flat = target_pts.flatten(order='C') 52 | 53 | size_R, size_C = target_pts[:,1].max()-target_pts[:,1].min(), target_pts[:,0].max()-target_pts[:,0].min() 54 | pred_kpts, target_kpts = pred_pts_flat[kpts], target_pts_flat[kpts] 55 | RMSE = np.linalg.norm(pred_kpts-target_kpts)/np.sqrt(size_R*size_C) 56 | RMSE_col.append(RMSE) 57 | 58 | kpts_name_mean = mean(RMSE_col) 59 | kpts_name.append(kpts_name_mean) 60 | 61 | print("Keypoints error: ", mean(kpts_name)) -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import string 2 | from dataset import VoiceDataset, FaceDataset 3 | from 
network import VoiceEmbedNet, Generator, FaceEmbedNet, Classifier 4 | from utils import get_collate_fn, get_collate_fn_4 5 | import os 6 | 7 | DATASET_PARAMETERS = { 8 | # meta data provided by voxceleb1 dataset 9 | 'meta_file': 'data/vox1_meta.csv', 10 | 11 | # voice dataset 12 | 'voice_dir': 'data/fbank', 13 | 'voice_ext': 'npy', 14 | 15 | # face dataset 16 | 'face_dir': 'data/VGG_ALL_FRONTAL', 17 | 'face_ext': '.jpg', 18 | 19 | # train data includes the identities 20 | # whose names start with the characters of 'FGH...XYZ' 21 | 'split': string.ascii_uppercase[5:], 22 | 23 | # dataloader 24 | 'voice_dataset': VoiceDataset, 25 | 'face_dataset': FaceDataset, 26 | 'batch_size': 64, 27 | 'nframe_range': [300, 800], 28 | 'workers_num': 1, 29 | 'collate_fn': get_collate_fn, 30 | 'collate_fn_4': get_collate_fn_4, 31 | 32 | # test data 33 | 'test_data': 'data/test_data/' 34 | } 35 | 36 | SAVE_DIR = 'pretrained_models/' 37 | NUM_EPOCH = 48000 #49999 38 | 39 | if not os.path.exists(SAVE_DIR): 40 | os.makedirs(SAVE_DIR) 41 | 42 | NETWORKS_PARAMETERS = { 43 | 44 | 'SAVE_DIR': SAVE_DIR, 45 | 46 | # VOICE EMBEDDING NETWORK (e) 47 | 'e': { 48 | 'network': VoiceEmbedNet, 49 | 'input_channel': 64, 50 | 'channels': [256, 384, 576, 864], 51 | 'output_channel': 64, # the embedding dimension 52 | 'model_path': 'pretrained_models/voice_embedding.pth', 53 | }, 54 | # GENERATOR (g) 55 | 'g': { 56 | 'network': Generator, 57 | 'input_channel': 64, 58 | 'channels': [1024, 512, 256, 128, 64], # channels for deconvolutional layers 59 | 'output_channel': 3, # images with RGB channels 60 | 'model_path': f'{SAVE_DIR}/generator_{NUM_EPOCH}.pth' 61 | }, 62 | # FACE EMBEDDING NETWORK (f) 63 | 'f': { 64 | 'network': FaceEmbedNet, 65 | 'input_channel': 3, 66 | 'channels': [32, 64, 128, 256, 512], 67 | 'output_channel': 64, 68 | 'model_path': 'models/face_embedding.pth', 69 | }, 70 | # DISCRIMINATOR (d) 71 | 'd': { 72 | 'network': Classifier, # Discrminator is a special Classifier with 1 subject 73 | 'input_channel': 64, 74 | 'channels': [], 75 | 'output_channel': 1, 76 | 'model_path': 'models/discriminator.pth', 77 | }, 78 | # CLASSIFIER (c) 79 | 'c': { 80 | 'network': Classifier, 81 | 'input_channel': 64, 82 | 'channels': [], 83 | 'output_channel': -1, # This parameter is depended on the dataset we used 84 | 'model_path': 'models/classifier.pth', 85 | }, 86 | # OPTIMIZER PARAMETERS 87 | 'lr': 0.0002, 88 | 'beta1': 0.5, 89 | 'beta2': 0.999, 90 | 91 | # MODE, use GPU or not 92 | 'GPU': True, 93 | 94 | 'image3D':{ 95 | 'model_path': f'{SAVE_DIR}/image3D_{NUM_EPOCH}.pth' 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00001.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00002.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00002.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00003.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00003.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00001.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00002.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00002.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00003.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00003.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00001.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00002.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00002.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00003.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00003.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00001.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00002.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00002.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00003.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00003.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00001.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00002.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00002.npy -------------------------------------------------------------------------------- /data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00003.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00003.npy -------------------------------------------------------------------------------- /data/results/rand_id00001/1TmvLk8sB-g_00001_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00001_image.png -------------------------------------------------------------------------------- /data/results/rand_id00001/1TmvLk8sB-g_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00001_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00001/1TmvLk8sB-g_00002_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00002_image.png -------------------------------------------------------------------------------- /data/results/rand_id00001/1TmvLk8sB-g_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00002_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00001/1TmvLk8sB-g_00003_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00003_image.png -------------------------------------------------------------------------------- /data/results/rand_id00001/1TmvLk8sB-g_00003_overlap.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00003_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00002/0XmNeUnOnlg_00001_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00001_image.png -------------------------------------------------------------------------------- /data/results/rand_id00002/0XmNeUnOnlg_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00001_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00002/0XmNeUnOnlg_00002_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00002_image.png -------------------------------------------------------------------------------- /data/results/rand_id00002/0XmNeUnOnlg_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00002_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00002/0XmNeUnOnlg_00003_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00003_image.png -------------------------------------------------------------------------------- /data/results/rand_id00002/0XmNeUnOnlg_00003_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00003_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00003/1M4q6CQM5pA_00001_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00001_image.png -------------------------------------------------------------------------------- /data/results/rand_id00003/1M4q6CQM5pA_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00001_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00003/1M4q6CQM5pA_00002_image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00002_image.png -------------------------------------------------------------------------------- /data/results/rand_id00003/1M4q6CQM5pA_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00002_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00003/1M4q6CQM5pA_00003_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00003_image.png -------------------------------------------------------------------------------- /data/results/rand_id00003/1M4q6CQM5pA_00003_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00003_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00004/_2wZVvsQYFg_00001_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00001_image.png -------------------------------------------------------------------------------- /data/results/rand_id00004/_2wZVvsQYFg_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00001_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00004/_2wZVvsQYFg_00002_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00002_image.png -------------------------------------------------------------------------------- /data/results/rand_id00004/_2wZVvsQYFg_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00002_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00004/_2wZVvsQYFg_00003_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00003_image.png -------------------------------------------------------------------------------- /data/results/rand_id00004/_2wZVvsQYFg_00003_overlap.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00003_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00005/0nH78dDh0N0_00001_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00001_image.png -------------------------------------------------------------------------------- /data/results/rand_id00005/0nH78dDh0N0_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00001_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00005/0nH78dDh0N0_00002_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00002_image.png -------------------------------------------------------------------------------- /data/results/rand_id00005/0nH78dDh0N0_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00002_overlap.png -------------------------------------------------------------------------------- /data/results/rand_id00005/0nH78dDh0N0_00003_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00003_image.png -------------------------------------------------------------------------------- /data/results/rand_id00005/0nH78dDh0N0_00003_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00003_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00001_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00001_img.png -------------------------------------------------------------------------------- /data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00001_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00002_img.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00002_img.png -------------------------------------------------------------------------------- /data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00002_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00003_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00003_img.png -------------------------------------------------------------------------------- /data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00003_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00003_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Ashley_Greene/0XmNeUnOnlg_00001_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00001_img.png -------------------------------------------------------------------------------- /data/results_reference/Ashley_Greene/0XmNeUnOnlg_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00001_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Ashley_Greene/0XmNeUnOnlg_00002_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00002_img.png -------------------------------------------------------------------------------- /data/results_reference/Ashley_Greene/0XmNeUnOnlg_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00002_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Ashley_Greene/0XmNeUnOnlg_00003_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00003_img.png -------------------------------------------------------------------------------- 
/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00003_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00003_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Bellamy_Young/1M4q6CQM5pA_00001_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00001_img.png -------------------------------------------------------------------------------- /data/results_reference/Bellamy_Young/1M4q6CQM5pA_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00001_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Bellamy_Young/1M4q6CQM5pA_00002_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00002_img.png -------------------------------------------------------------------------------- /data/results_reference/Bellamy_Young/1M4q6CQM5pA_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00002_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Bellamy_Young/1M4q6CQM5pA_00003_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00003_img.png -------------------------------------------------------------------------------- /data/results_reference/Bellamy_Young/1M4q6CQM5pA_00003_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00003_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Bethany_Mota/_2wZVvsQYFg_00001_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00001_img.png -------------------------------------------------------------------------------- /data/results_reference/Bethany_Mota/_2wZVvsQYFg_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00001_overlap.png 
-------------------------------------------------------------------------------- /data/results_reference/Bethany_Mota/_2wZVvsQYFg_00002_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00002_img.png -------------------------------------------------------------------------------- /data/results_reference/Bethany_Mota/_2wZVvsQYFg_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00002_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Bethany_Mota/_2wZVvsQYFg_00003_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00003_img.png -------------------------------------------------------------------------------- /data/results_reference/Bethany_Mota/_2wZVvsQYFg_00003_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00003_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Eva_Longoria/0nH78dDh0N0_00001_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00001_img.png -------------------------------------------------------------------------------- /data/results_reference/Eva_Longoria/0nH78dDh0N0_00001_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00001_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Eva_Longoria/0nH78dDh0N0_00002_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00002_img.png -------------------------------------------------------------------------------- /data/results_reference/Eva_Longoria/0nH78dDh0N0_00002_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00002_overlap.png -------------------------------------------------------------------------------- /data/results_reference/Eva_Longoria/0nH78dDh0N0_00003_img.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00003_img.png -------------------------------------------------------------------------------- /data/results_reference/Eva_Longoria/0nH78dDh0N0_00003_overlap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00003_overlap.png -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from PIL import Image 4 | from torch.utils.data import Dataset 5 | import random 6 | 7 | def load_voice(voice_item): 8 | voice_data = np.load(voice_item['filepath']) 9 | voice_data = voice_data.T.astype('float32') 10 | voice_label = voice_item['label_id'] 11 | return voice_data, voice_label 12 | 13 | def load_face(face_item): 14 | face_data = Image.open(face_item['filepath']).convert('RGB').resize([64, 64]) 15 | face_data = np.transpose(np.array(face_data), (2, 0, 1)) 16 | face_data = ((face_data - 127.5) / 127.5).astype('float32') 17 | face_label = face_item['label_id'] 18 | return face_data, face_label 19 | 20 | class VoiceDataset(Dataset): 21 | def __init__(self, voice_list, nframe_range): 22 | self.voice_list = voice_list 23 | self.crop_nframe = nframe_range[1] 24 | self.length = len(self.voice_list) 25 | 26 | def __getitem__(self, index): 27 | ranidx = random.randint(0, self.length-1) 28 | voice_data, voice_label = load_voice(self.voice_list[index]) 29 | if index == self.length-1: 30 | p_ind = index-1 31 | else: 32 | p_ind = index+1 33 | voice_data_p, _ = load_voice(self.voice_list[p_ind]) 34 | voice_data_n, _ = load_voice(self.voice_list[ranidx]) 35 | assert self.crop_nframe <= voice_data.shape[1] 36 | pt = np.random.randint(voice_data.shape[1] - self.crop_nframe + 1) 37 | voice_data = voice_data[:, pt:pt+self.crop_nframe] 38 | pt_p = np.random.randint(voice_data_p.shape[1] - self.crop_nframe + 1) 39 | voice_data_p = voice_data_p[:, pt_p:pt_p+self.crop_nframe] 40 | pt_n = np.random.randint(voice_data_n.shape[1] - self.crop_nframe + 1) 41 | voice_data_n = voice_data_n[:, pt_n:pt_n+self.crop_nframe] 42 | return voice_data, voice_label, voice_data_p, voice_data_n 43 | 44 | def __len__(self): 45 | return len(self.voice_list) 46 | 47 | class FaceDataset(Dataset): 48 | def __init__(self, face_list): 49 | self.face_list = face_list 50 | 51 | def __getitem__(self, index): 52 | face_data, face_label = load_face(self.face_list[index]) 53 | if np.random.random() > 0.5: 54 | face_data = np.flip(face_data, axis=2).copy() 55 | return face_data, face_label 56 | 57 | def __len__(self): 58 | return len(self.face_list) 59 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # This script demos with pre-processed MFCC. 
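# Overview of the steps implemented below:
#   1. Load a preprocessed MFCC filterbank (.npy) from data/preprocessed_MFCC/.
#   2. The voice embedding network (e_net) maps the filterbank to a 64-dim embedding,
#      and the generator (g_net) decodes that embedding into a synthesized face image.
#   3. The face image is upsampled to 120x120 and passed through SynergyNet twice:
#      the pretrained model supplies the head pose (R, offset), the trained model the 3D mesh.
#   4. The mesh is aligned with the pose, flipped into image coordinates, and saved both as
#      the synthesized face (*_image.png) and as a rendered mesh overlay (*_overlap.png)
#      under data/results/.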
2 | import os 3 | import glob 4 | import torch 5 | import scipy.io as sio 6 | import numpy as np 7 | import cv2 8 | 9 | from config import NETWORKS_PARAMETERS 10 | from network import get_network, SynergyNet 11 | from utils import voice2face_processed 12 | from utilf.render import render_vert 13 | 14 | # initialization 15 | e_net, _ = get_network('e', NETWORKS_PARAMETERS, train=False) 16 | g_net, _ = get_network('g', NETWORKS_PARAMETERS, train=False) 17 | 18 | # building models: unsupervised 19 | image3D = SynergyNet(pretrained=False, last_CN=None).cuda().eval() 20 | backbone_ckpt = torch.load(NETWORKS_PARAMETERS['image3D']['model_path']) 21 | image3D.load_state_dict(backbone_ckpt) 22 | 23 | # SynergyNet pretrained network for getting pose 24 | image3D_pretrained = SynergyNet(pretrained=True).cuda().eval() 25 | 26 | # data and config 27 | voice_list = sorted(glob.glob('data/preprocessed_MFCC/*')) 28 | up_layer = torch.nn.Upsample((120,120), mode='bilinear', align_corners=True) 29 | tri = sio.loadmat('./train.configs/tri.mat')['tri'] 30 | 31 | # [TODO] Change this variable to yout result output folder 32 | FOLDER_ROOT = 'data/results/' 33 | 34 | if not os.path.exists(FOLDER_ROOT): 35 | os.mkdir(FOLDER_ROOT) 36 | 37 | for folder in voice_list: 38 | index = folder.rsplit('/',1)[-1] 39 | print(index) 40 | 41 | if not os.path.exists(FOLDER_ROOT+index): 42 | os.mkdir(FOLDER_ROOT + index) 43 | 44 | all_sequences = sorted(glob.glob(folder+'/*')) 45 | for sequence in all_sequences: 46 | all_fbanks = sorted(glob.glob(sequence+'/*.npy')) 47 | sequence_name = sequence.rsplit('/',1)[-1] 48 | 49 | for fbank in all_fbanks: 50 | fbank_name = fbank.rsplit('/',1)[-1][:-4] 51 | 52 | with torch.no_grad(): 53 | # voice2face 54 | face_image = voice2face_processed(e_net, g_net, fbank, NETWORKS_PARAMETERS['GPU']) 55 | face_image = up_layer(face_image) 56 | 57 | # Pose from 3DDFA-V2 58 | pose = image3D_pretrained(face_image, return_onlypose=True) 59 | R, off = image3D_pretrained.parse_param_102_pose(pose) 60 | 61 | #Alignment with synthesized image 62 | prediction = image3D(face_image) 63 | prediction = R @ prediction + off 64 | 65 | # transform to image coordinate space 66 | prediction[:, 1, :] = 127 - prediction[:, 1, :] 67 | save_name = FOLDER_ROOT+ index + '/' + sequence_name + '_' + fbank_name 68 | img = (((face_image[0].clamp(-1,1))*127.5)+128).detach().cpu().numpy().astype(np.uint8) 69 | img = np.transpose(img, (1,2,0)) 70 | img = img[:,:,[2,1,0]] 71 | pred = prediction[0].detach().cpu().numpy() 72 | # save 73 | cv2.imwrite(save_name+'_image.png', img) 74 | render_vert(img, pred, alpha=1.0, wfp=save_name+'_overlap.png') 75 | 76 | -------------------------------------------------------------------------------- /demo/coherence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/demo/coherence.png -------------------------------------------------------------------------------- /demo/overall_purpose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/demo/overall_purpose.png -------------------------------------------------------------------------------- /demo/supervised_comp.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/demo/supervised_comp.png -------------------------------------------------------------------------------- /demo_mic.py: -------------------------------------------------------------------------------- 1 | 2 | import cv2 3 | from dataclasses import dataclass, asdict 4 | import glob 5 | import numpy as np 6 | import os 7 | import pyaudio 8 | import scipy.io as sio 9 | from scipy.io import wavfile 10 | import shutil 11 | import torch 12 | import torch.nn.functional as F 13 | import torchvision.utils as vutils 14 | import webrtcvad 15 | 16 | from mfcc import MFCC 17 | from config import NETWORKS_PARAMETERS 18 | from network import get_network, SynergyNet 19 | from utils import voice2face, read_obj 20 | from vad import read_wave, write_wave, frame_generator, vad_collector 21 | from pyaudio_recording import Recorder 22 | from utilf.render import render_vert 23 | 24 | @dataclass 25 | class StreamParams: 26 | format: int = pyaudio.paInt16 27 | channels: int = 1 28 | rate: int = 16000 29 | frames_per_buffer: int = 1024 30 | input: bool = True 31 | output: bool = False 32 | 33 | def to_dict(self) -> dict: 34 | return asdict(self) 35 | 36 | 37 | def rm_sil(voice_file, vad_obj): 38 | """ 39 | remove silence 40 | """ 41 | audio, sample_rate = read_wave(voice_file) 42 | frames = frame_generator(20, audio, sample_rate) 43 | frames = list(frames) 44 | segments = vad_collector(sample_rate, 20, 50, vad_obj, frames) 45 | 46 | if os.path.exists('tmp/'): 47 | shutil.rmtree('tmp/') 48 | os.makedirs('tmp/') 49 | 50 | wave_data = [] 51 | for i, segment in enumerate(segments): 52 | segment_file = 'tmp/' + str(i) + '.wav' 53 | write_wave(segment_file, segment, sample_rate) 54 | wave_data.append(wavfile.read(segment_file)[1]) 55 | shutil.rmtree('tmp/') 56 | 57 | if wave_data: 58 | vad_voice = np.concatenate(wave_data).astype('int16') 59 | return vad_voice 60 | 61 | def get_fbank(voice, mfc_obj): 62 | """ 63 | process audio and create mel-spectrogram 64 | """ 65 | # Extract log mel-spectrogra 66 | fbank = mfc_obj.sig2logspec(voice).astype('float32') 67 | 68 | # Mean and variance normalization of each mel-frequency 69 | fbank = fbank - fbank.mean(axis=0) 70 | fbank = fbank / (fbank.std(axis=0)+np.finfo(np.float32).eps) 71 | 72 | # If the duration of a voice recording is less than 10 seconds (1000 frames), 73 | # repeat the recording until it is longer than 10 seconds and crop. 74 | full_frame_number = 1000 75 | init_frame_number = fbank.shape[0] 76 | while fbank.shape[0] < full_frame_number: 77 | fbank = np.append(fbank, fbank[0:init_frame_number], axis=0) 78 | fbank = fbank[0:full_frame_number,:] 79 | return fbank 80 | 81 | def voice2face(e_net, g_net, voice_file, vad_obj, mfc_obj, GPU=True): 82 | vad_voice = rm_sil(voice_file, vad_obj) 83 | fbank = get_fbank(vad_voice, mfc_obj) 84 | fbank = fbank.T[np.newaxis, ...] 
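    # Shape note: get_fbank returns a (1000, 64) array (frames x mel filters; nfilt=64 in main()),
    # so the transpose plus new axis yields (1, 64, 1000) = (batch, mel channels, frames),
    # which matches the voice embedding network's input_channel of 64 in NETWORKS_PARAMETERS.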
85 | fbank = torch.from_numpy(fbank.astype('float32')) 86 | 87 | if GPU: 88 | fbank = fbank.cuda() 89 | embedding = e_net(fbank) 90 | embedding = F.normalize(embedding) 91 | face = g_net(embedding) 92 | return face 93 | 94 | def main(): 95 | # recording and save under the root 96 | filename = "audio.wav" 97 | # stream_params = StreamParams() 98 | # recorder = Recorder(stream_params) 99 | # # record for 5 seconds 100 | # recorder.record(5, filename) 101 | 102 | # initialization 103 | # voice activity detector, aggressiveness = 2 104 | vad_obj = webrtcvad.Vad(2) 105 | # Mel-Frequency extractor 106 | mfc_obj = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025) 107 | # net definition 108 | e_net, _ = get_network('e', NETWORKS_PARAMETERS, train=False) 109 | g_net, _ = get_network('g', NETWORKS_PARAMETERS, train=False) 110 | 111 | # building models: unsupervised 112 | image3D = SynergyNet(pretrained=False, last_CN=None).cuda().eval() 113 | backbone_ckpt = torch.load(NETWORKS_PARAMETERS['image3D']['model_path']) 114 | image3D.load_state_dict(backbone_ckpt) 115 | 116 | # SynergyNet pretrained network for getting pose 117 | image3D_pretrained = SynergyNet(pretrained=True).cuda().eval() 118 | 119 | # data and config 120 | up_layer = torch.nn.Upsample((120,120), mode='bilinear', align_corners=True) 121 | tri = sio.loadmat('./train.configs/tri.mat')['tri'] 122 | 123 | # default savepath 124 | FOLDER_ROOT = 'data/results/' 125 | if not os.path.exists(FOLDER_ROOT): 126 | os.makedirs(FOLDER_ROOT) 127 | 128 | with torch.no_grad(): 129 | # voice2face 130 | face_image = voice2face(e_net, g_net, filename, vad_obj, mfc_obj, NETWORKS_PARAMETERS['GPU']) 131 | face_image = up_layer(face_image) 132 | 133 | # Pose from 3DDFA-V2 134 | pose = image3D_pretrained(face_image, return_onlypose=True) 135 | R, off = image3D_pretrained.parse_param_102_pose(pose) 136 | 137 | # Alignment with synthesized image 138 | prediction_fr = image3D(face_image) 139 | prediction = R @ prediction_fr + off 140 | 141 | # calculation between mean male and female shape and classify the gender by meshes 142 | #print(prediction_fr.requires_grad) 143 | prediction_fr_np = prediction_fr.squeeze(0).cpu().numpy() 144 | prediction_fr_np = np.transpose(prediction_fr_np, (1,0)) 145 | mean_male = read_obj('male.obj') # 53215 * 3 146 | mean_female = read_obj('female.obj') # 53215 * 3 147 | N_vertices = prediction_fr_np.shape[0] #53215 148 | error_male = np.linalg.norm(prediction_fr_np - mean_male)/ N_vertices 149 | error_female = np.linalg.norm(prediction_fr_np - mean_female)/ N_vertices 150 | 151 | pred_midD = np.linalg.norm(prediction_fr_np[2130]-prediction_fr_np[15003]) 152 | pred_foreD = np.linalg.norm(prediction_fr_np[1678]-prediction_fr_np[42117]) 153 | pred_cheekD = np.linalg.norm(prediction_fr_np[2294]-prediction_fr_np[13635]) 154 | pred_earD = np.linalg.norm(prediction_fr_np[20636]-prediction_fr_np[34153]) 155 | print("-------------------------") 156 | if error_male < error_female: 157 | print("This is a male's voice") 158 | print("Statistics from the predicted mesh and mean gender mesh") 159 | target_foreD = np.linalg.norm(mean_male[1678]-mean_male[42117]) 160 | target_cheekD = np.linalg.norm(mean_male[2294]-mean_male[13635]) 161 | target_earD = np.linalg.norm(mean_male[20636]-mean_male[34153]) 162 | target_midD = np.linalg.norm(mean_male[2130]-mean_male[15003]) 163 | 164 | ratio_fore = (pred_foreD-target_foreD)/target_foreD 165 | ratio_cheek = (pred_cheekD-target_cheekD)/target_cheekD 166 | ratio_ear = 
(pred_earD-target_earD)/target_earD 167 | ratio_mid = (pred_midD-target_midD)/target_midD 168 | 169 | print(f"The forehead differs by {ratio_fore*100:.2f}% from the mean male shape") 170 | print(f"The cheek-to-cheek distance differs by {ratio_cheek*100:.2f}% from the mean male shape") 171 | print(f"The ear-to-ear distance differs by {ratio_ear*100:.2f}% from the mean male shape") 172 | print(f"The midline differs by {ratio_mid*100:.2f}% from the mean male shape") 173 | else: 174 | print("This is a female's voice") 175 | print("Statistics from the predicted mesh and mean gender mesh") 176 | target_foreD = np.linalg.norm(mean_female[1678]-mean_female[42117]) 177 | target_cheekD = np.linalg.norm(mean_female[2294]-mean_female[13635]) 178 | target_earD = np.linalg.norm(mean_female[20636]-mean_female[34153]) 179 | target_midD = np.linalg.norm(mean_female[2130]-mean_female[15003]) 180 | 181 | ratio_fore = (pred_foreD-target_foreD)/target_foreD 182 | ratio_cheek = (pred_cheekD-target_cheekD)/target_cheekD 183 | ratio_ear = (pred_earD-target_earD)/target_earD 184 | ratio_mid = (pred_midD-target_midD)/target_midD 185 | 186 | print(f"The forehead differs by {ratio_fore*100:.2f}% from the mean female shape") 187 | print(f"The cheek-to-cheek distance differs by {ratio_cheek*100:.2f}% from the mean female shape") 188 | print(f"The ear-to-ear distance differs by {ratio_ear*100:.2f}% from the mean female shape") 189 | print(f"The midline differs by {ratio_mid*100:.2f}% from the mean female shape") 190 | print("-------------------------") 191 | 192 | wide_shape = read_obj('wide.obj') 193 | skinny_shape = read_obj('skinny.obj') 194 | regular_shape = read_obj('regular.obj') 195 | slim_shape = read_obj('slim.obj') 196 | error_wide = np.linalg.norm(prediction_fr_np - wide_shape)/ N_vertices 197 | error_skinny = np.linalg.norm(prediction_fr_np - skinny_shape)/ N_vertices 198 | error_regular = np.linalg.norm(prediction_fr_np - regular_shape)/ N_vertices 199 | error_slim = np.linalg.norm(prediction_fr_np - slim_shape)/ N_vertices 200 | err_type = np.array([error_wide, error_skinny, error_regular, error_slim]) 201 | index = np.argsort(err_type)[0] 202 | 203 | if index == 0: 204 | print("The face shape is closer to WIDE") 205 | elif index == 1: 206 | print(f"The face shape is closer to SKINNY") 207 | elif index == 2: 208 | print(f"The face shape is closer to REGULAR") 209 | elif index == 3: 210 | print(f"The face shape is closer to SLIM") 211 | 212 | print("-------------------------") 213 | 214 | # transform to image coordinate space 215 | prediction[:, 1, :] = 127 - prediction[:, 1, :] 216 | save_name = os.path.join(FOLDER_ROOT, 'micIn') 217 | img = (((face_image[0].clamp(-1,1))*127.5)+128).detach().cpu().numpy().astype(np.uint8) 218 | img = np.transpose(img, (1,2,0)) 219 | img = img[:,:,[2,1,0]] 220 | pred = prediction[0].detach().cpu().numpy() 221 | 222 | # save 223 | cv2.imwrite(save_name+'_image.png', img) 224 | render_vert(img, pred, alpha=1.0, wfp=save_name+'_overlap.png') 225 | 226 | vutils.save_image(face_image.detach().clamp(-1,1), filename.replace('.wav', '.png'), normalize=True) 227 | 228 | 229 | if __name__ == '__main__': 230 | main() -------------------------------------------------------------------------------- /distiller_zoo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class Attention(nn.Module): 7 | def __init__(self, p=2): 8 | super(Attention, self).__init__() 9 | self.p = p 10 | 11 | 12 | def forward(self, f_s, f_t): 13 | if f_s.dim() == 2: 14 | return 
(F.normalize(f_s.pow(self.p))-F.normalize(f_t.pow(self.p))).pow(2).mean() 15 | else: 16 | return (self.at(f_s) - self.at(f_t)).pow(2).mean() 17 | 18 | def at(self, f): 19 | return F.normalize(f.pow(self.p).mean(1).view(f.size(0), -1)) 20 | 21 | class Similarity(nn.Module): 22 | def __init__(self): 23 | super(Similarity, self).__init__() 24 | 25 | 26 | def forward(self, f_s, f_t): 27 | bsz = f_s.shape[0] 28 | f_s = f_s.view(bsz, -1) 29 | f_t = f_t.view(bsz, -1) 30 | G_s = torch.mm(f_s, torch.t(f_s)) 31 | G_s = torch.nn.functional.normalize(G_s) 32 | 33 | G_t = torch.mm(f_t, torch.t(f_t)) 34 | G_t = torch.nn.functional.normalize(G_t) 35 | 36 | G_diff = G_t - G_s 37 | loss = (G_diff*G_diff).view(-1, 1).sum(0)/(bsz*bsz) 38 | return loss 39 | 40 | class Correlation(nn.Module): 41 | def __init__(self): 42 | super(Correlation, self).__init__() 43 | 44 | def forward(self, f_s, f_t): 45 | delta = torch.abs(f_s-f_t) 46 | loss = torch.mean((delta[:-1]*delta[1:]).sum(1)) 47 | return loss 48 | 49 | class NSTLoss(nn.Module): 50 | def __init__(self): 51 | super(NSTLoss, self).__init__() 52 | pass 53 | 54 | def forward(self, f_s, f_t): 55 | 56 | if f_s.dim() == 4: 57 | s_H, t_H = f_s.shape[2], f_t.shape[2] 58 | if s_H > t_H: 59 | f_s = F.adaptive_avg_pool2d(f_s, (t_H, t_H)) 60 | elif s_H < t_H: 61 | f_t = F.adaptive_avg_pool2d(f_t, (s_H, s_H)) 62 | else: 63 | pass 64 | 65 | f_s = f_s.view(f_s.shape[0], f_s.shape[1], -1) 66 | f_s = F.normalize(f_s, dim=2) 67 | f_t = f_t.view(f_t.shape[0], f_t.shape[1], -1) 68 | f_t = F.normalize(f_t, dim=2) 69 | 70 | elif f_s.dim() == 2: 71 | f_s = F.normalize(f_s, dim=1) 72 | f_t = F.normalize(f_t, dim=1) 73 | 74 | full_loss = True 75 | if full_loss: 76 | return (self.poly_kernel(f_t, f_t).mean().detach() + self.poly_kernel(f_s,f_s).mean() - 2 * self.poly_kernel(f_s, f_t).mean()) 77 | else: 78 | return self.poly_kernel(f_s, f_s).mean() 79 | 80 | def poly_kernel(self, a, b): 81 | a = a.unsqueeze(1) 82 | b = b.unsqueeze(2) 83 | res = (a*b).sum(-1).pow(2) 84 | return res 85 | 86 | class RKDLoss(nn.Module): 87 | """Relational Knowledge Disitllation, CVPR2019""" 88 | def __init__(self, w_d=25, w_a=50): 89 | super(RKDLoss, self).__init__() 90 | self.w_d = w_d 91 | self.w_a = w_a 92 | 93 | def forward(self, f_s, f_t): 94 | student = f_s.view(f_s.shape[0], -1) 95 | teacher = f_t.view(f_t.shape[0], -1) 96 | 97 | # RKD distance loss 98 | with torch.no_grad(): 99 | t_d = self.pdist(teacher, squared=False) 100 | mean_td = t_d[t_d > 0].mean() 101 | t_d = t_d / mean_td 102 | 103 | d = self.pdist(student, squared=False) 104 | mean_d = d[d > 0].mean() 105 | d = d / mean_d 106 | 107 | loss_d = F.smooth_l1_loss(d, t_d) 108 | 109 | # RKD Angle loss 110 | with torch.no_grad(): 111 | td = (teacher.unsqueeze(0) - teacher.unsqueeze(1)) 112 | norm_td = F.normalize(td, p=2, dim=2) 113 | t_angle = torch.bmm(norm_td, norm_td.transpose(1, 2)).view(-1) 114 | 115 | sd = (student.unsqueeze(0) - student.unsqueeze(1)) 116 | norm_sd = F.normalize(sd, p=2, dim=2) 117 | s_angle = torch.bmm(norm_sd, norm_sd.transpose(1, 2)).view(-1) 118 | 119 | loss_a = F.smooth_l1_loss(s_angle, t_angle) 120 | 121 | loss = self.w_d * loss_d + self.w_a * loss_a 122 | 123 | return loss 124 | 125 | @staticmethod 126 | def pdist(e, squared=False, eps=1e-12): 127 | e_square = e.pow(2).sum(dim=1) 128 | prod = e @ e.t() 129 | res = (e_square.unsqueeze(1) + e_square.unsqueeze(0) - 2 * prod).clamp(min=eps) 130 | 131 | if not squared: 132 | res = res.sqrt() 133 | 134 | res = res.clone() 135 | res[range(len(e)), range(len(e))] = 0 136 
| return res 137 | 138 | class PKT(nn.Module): 139 | """Probabilistic Knowledge Transfer for deep representation learning 140 | Code from author: https://github.com/passalis/probabilistic_kt""" 141 | def __init__(self): 142 | super(PKT, self).__init__() 143 | 144 | def forward(self, f_s, f_t): 145 | return self.cosine_similarity_loss(f_s, f_t) 146 | 147 | @staticmethod 148 | def cosine_similarity_loss(output_net, target_net, eps=0.0000001): 149 | # Normalize each vector by its norm 150 | output_net_norm = torch.sqrt(torch.sum(output_net ** 2, dim=1, keepdim=True)) 151 | output_net = output_net / (output_net_norm + eps) 152 | output_net[output_net != output_net] = 0 153 | 154 | target_net_norm = torch.sqrt(torch.sum(target_net ** 2, dim=1, keepdim=True)) 155 | target_net = target_net / (target_net_norm + eps) 156 | target_net[target_net != target_net] = 0 157 | 158 | # Calculate the cosine similarity 159 | model_similarity = torch.mm(output_net, output_net.transpose(0, 1)) 160 | target_similarity = torch.mm(target_net, target_net.transpose(0, 1)) 161 | 162 | # Scale cosine similarity to 0..1 163 | model_similarity = (model_similarity + 1.0) / 2.0 164 | target_similarity = (target_similarity + 1.0) / 2.0 165 | 166 | # Transform them into probabilities 167 | model_similarity = model_similarity / torch.sum(model_similarity, dim=1, keepdim=True) 168 | target_similarity = target_similarity / torch.sum(target_similarity, dim=1, keepdim=True) 169 | 170 | # Calculate the KL-divergence 171 | loss = torch.mean(target_similarity * torch.log((target_similarity + eps) / (model_similarity + eps))) 172 | 173 | return loss 174 | 175 | class VIDLoss(nn.Module): 176 | """Variational Information Distillation for Knowledge Transfer (CVPR 2019), 177 | code from author: https://github.com/ssahn0215/variational-information-distillation""" 178 | def __init__(self, 179 | num_input_channels, 180 | num_mid_channel, 181 | num_target_channels, 182 | init_pred_var=5.0, 183 | eps=1e-5): 184 | super(VIDLoss, self).__init__() 185 | 186 | def conv1x1(in_channels, out_channels, stride=1): 187 | return nn.Conv2d( 188 | in_channels, out_channels, 189 | kernel_size=1, padding=0, 190 | bias=False, stride=stride) 191 | 192 | self.regressor = nn.Sequential( 193 | conv1x1(num_input_channels, num_mid_channel), 194 | nn.ReLU(), 195 | conv1x1(num_mid_channel, num_mid_channel), 196 | nn.ReLU(), 197 | conv1x1(num_mid_channel, num_target_channels), 198 | ) 199 | self.log_scale = torch.nn.Parameter( 200 | np.log(np.exp(init_pred_var-eps)-1.0) * torch.ones(num_target_channels) 201 | ) 202 | self.eps = eps 203 | 204 | def forward(self, input, target): 205 | # pool for dimentsion match 206 | 207 | # s_H, t_H = input.shape[2], target.shape[2] 208 | # if s_H > t_H: 209 | # input = F.adaptive_avg_pool2d(input, (t_H, t_H)) 210 | # elif s_H < t_H: 211 | # target = F.adaptive_avg_pool2d(target, (s_H, s_H)) 212 | # else: 213 | # pass 214 | if input.dim() == 2: 215 | input = input.unsqueeze(2).unsqueeze(2) 216 | target = target.unsqueeze(2).unsqueeze(2) 217 | 218 | pred_mean = self.regressor(input) 219 | pred_var = torch.log(1.0+torch.exp(self.log_scale))+self.eps 220 | pred_var = pred_var.view(1, -1, 1, 1) 221 | neg_log_prob = 0.5*( 222 | (pred_mean-target)**2/pred_var+torch.log(pred_var) 223 | ) 224 | loss = torch.mean(neg_log_prob) 225 | 226 | return loss -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: CMP 
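# Conda environment spec for this repo; it can be recreated with the standard
# `conda env create -f environment.yml` (generic conda usage, not a project-specific script).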
2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=4.5=1_gnu 8 | - blas=1.0=mkl 9 | - ca-certificates=2021.7.5=h06a4308_1 10 | - certifi=2021.5.30=py38h06a4308_0 11 | - cudatoolkit=10.2.89=hfd86e86_1 12 | - freetype=2.10.4=h5ab3b9f_0 13 | - intel-openmp=2021.3.0=h06a4308_3350 14 | - jpeg=9b=h024ee3a_2 15 | - lcms2=2.12=h3be6417_0 16 | - ld_impl_linux-64=2.35.1=h7274673_9 17 | - libffi=3.3=he6710b0_2 18 | - libgcc-ng=9.3.0=h5101ec6_17 19 | - libgomp=9.3.0=h5101ec6_17 20 | - libpng=1.6.37=hbc83047_0 21 | - libstdcxx-ng=9.3.0=hd4cf53a_17 22 | - libtiff=4.2.0=h85742a9_0 23 | - libuv=1.40.0=h7b6447c_0 24 | - libwebp-base=1.2.0=h27cfd23_0 25 | - lz4-c=1.9.3=h295c915_1 26 | - mkl=2021.3.0=h06a4308_520 27 | - mkl-service=2.4.0=py38h7f8727e_0 28 | - mkl_fft=1.3.0=py38h42c9631_2 29 | - mkl_random=1.2.2=py38h51133e4_0 30 | - ncurses=6.2=he6710b0_1 31 | - ninja=1.10.2=hff7bd54_1 32 | - numpy=1.20.3=py38hf144106_0 33 | - numpy-base=1.20.3=py38h74d4b33_0 34 | - olefile=0.46=py_0 35 | - openjpeg=2.4.0=h3ad879b_0 36 | - openssl=1.1.1l=h7f8727e_0 37 | - pillow=8.3.1=py38h2c7a002_0 38 | - pip=21.0.1=py38h06a4308_0 39 | - python=3.8.11=h12debd9_0_cpython 40 | - pytorch=1.7.1=py3.8_cuda10.2.89_cudnn7.6.5_0 41 | - readline=8.1=h27cfd23_0 42 | - setuptools=52.0.0=py38h06a4308_0 43 | - six=1.16.0=pyhd3eb1b0_0 44 | - sqlite=3.36.0=hc218d9a_0 45 | - tk=8.6.10=hbc83047_0 46 | - torchaudio=0.7.2=py38 47 | - torchvision=0.8.2=py38_cu102 48 | - typing_extensions=3.10.0.0=pyh06a4308_0 49 | - wheel=0.37.0=pyhd3eb1b0_0 50 | - xz=5.2.5=h7b6447c_0 51 | - zlib=1.2.11=h7b6447c_3 52 | - zstd=1.4.9=haebb681_0 53 | - pip: 54 | - cython==0.29.24 55 | - opencv-python==4.5.3.56 56 | - scipy==1.7.1 57 | - pyaudio 58 | -------------------------------------------------------------------------------- /eval_sup.py: -------------------------------------------------------------------------------- 1 | # This script is for batch processing testing. 
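# Annotation (not in the original file): the script walks data/fbank/, runs each preprocessed
# mel-spectrogram through the voice encoder (e_net) and the supervised Generator1D_directMLP,
# and writes the first three predicted meshes per speaker as .obj files under FOLDER_ROOT
# for later comparison against the fitted ground-truth models.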
2 | 3 | import os 4 | import glob 5 | import torch 6 | import torchvision.utils as vutils 7 | import webrtcvad 8 | import scipy.io as sio 9 | import csv 10 | import numpy as np 11 | 12 | from mfcc import MFCC 13 | from config import NETWORKS_PARAMETERS 14 | from network import get_network, Generator1D_directMLP 15 | from utils import write_obj_with_colors, voice2face_processed_MeshOut 16 | 17 | # initialization 18 | vad_obj = webrtcvad.Vad(2) 19 | mfc_obj = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025) 20 | e_net, _ = get_network('e', NETWORKS_PARAMETERS, train=False) 21 | 22 | g_net = Generator1D_directMLP().cuda().eval() 23 | g_net_ckpt = torch.load(NETWORKS_PARAMETERS['g']['model_path']) 24 | g_net.load_state_dict(g_net_ckpt) 25 | 26 | # test 27 | voice_list = sorted(glob.glob('data/fbank/*')) 28 | tri = sio.loadmat('./train.configs/tri.mat')['tri'] 29 | 30 | id_name = {} 31 | csv_file = open('data/vox1_meta.csv') 32 | rows=csv.reader(csv_file, delimiter=' ') 33 | headers = next(rows) 34 | for row in rows: 35 | id_name.update({row[0]:row[1]}) 36 | available_GT = list(map(lambda k: k.rsplit('/',1)[-1], sorted(glob.glob('data/A2E_val/*')))) 37 | 38 | # [TODO] Change this variable to yout result output folder 39 | FOLDER_ROOT = 'supervised_output/' 40 | 41 | if not os.path.exists(FOLDER_ROOT): 42 | os.mkdir(FOLDER_ROOT) 43 | coll = [] 44 | for folder in voice_list: 45 | index = folder.rsplit('/',1)[-1] 46 | print(index) 47 | if index > 'id10309': # The end of E is 10309 48 | break 49 | corr_name = id_name[index] 50 | if not corr_name in available_GT: #check if the fbank id is in the fitted model database 51 | continue 52 | count = 0 53 | 54 | if not os.path.exists(FOLDER_ROOT+corr_name): 55 | os.mkdir(FOLDER_ROOT + corr_name) 56 | 57 | all_sequences = sorted(glob.glob(folder+'/*')) 58 | 59 | for sequence in all_sequences: 60 | print(sequence) 61 | all_fbanks = sorted(glob.glob(sequence+'/*.npy')) 62 | sequence_name = sequence.rsplit('/',1)[-1] 63 | 64 | for fbank in all_fbanks: 65 | print(fbank) 66 | fbank_name = fbank.rsplit('/',1)[-1][:-4] 67 | prediction = voice2face_processed_MeshOut(e_net, g_net, fbank,NETWORKS_PARAMETERS['GPU']).squeeze().detach().cpu() 68 | save_name = FOLDER_ROOT+ corr_name + '/' + sequence_name + '_' + fbank_name 69 | write_obj_with_colors(save_name+'.obj', prediction, triangles=tri) 70 | 71 | count += 1 72 | # the first three in all the fbank sequences 73 | if count >= 3: 74 | break 75 | 76 | if count >= 3: 77 | break 78 | 79 | -------------------------------------------------------------------------------- /face_types/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/face_types/.placeholder -------------------------------------------------------------------------------- /gan_train_cascade.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import torch.nn.functional as F 5 | import torchvision.utils as vutils 6 | import scipy.io as sio 7 | 8 | from torch.utils.data import DataLoader 9 | from config import DATASET_PARAMETERS, NETWORKS_PARAMETERS 10 | from parse_dataset import get_dataset 11 | from network import get_network, SynergyNet 12 | from utils import Meter, cycle, cycle_4, save_model, read_xyz, voice2face_processed, write_obj_with_colors 13 | from distiller_zoo import PKT 14 | 15 | 
import torch.optim as optim 16 | import glob 17 | import numpy as np 18 | from statistics import mean 19 | import logging 20 | from datetime import datetime 21 | 22 | if not os.path.exists(NETWORKS_PARAMETERS['SAVE_DIR']): 23 | os.makedirs(NETWORKS_PARAMETERS['SAVE_DIR']) 24 | logging.basicConfig( 25 | format='[%(asctime)s] [p%(process)s] [%(pathname)s:%(lineno)d] [%(levelname)s] %(message)s', 26 | level=logging.INFO, 27 | handlers=[ 28 | logging.FileHandler(NETWORKS_PARAMETERS['SAVE_DIR']+'/{:%Y-%m-%d-%H-%M-%S}.log'.format(datetime.now()), mode='w'), 29 | logging.StreamHandler() 30 | ] 31 | ) 32 | logging.info(f'Save the pth at {NETWORKS_PARAMETERS["SAVE_DIR"]}') 33 | 34 | # dataset and dataloader 35 | print('Parsing your dataset...') 36 | voice_list, face_list, id_class_num = get_dataset(DATASET_PARAMETERS) 37 | NETWORKS_PARAMETERS['c']['output_channel'] = id_class_num 38 | 39 | 40 | print('Preparing the datasets...') 41 | voice_dataset = DATASET_PARAMETERS['voice_dataset'](voice_list, 42 | DATASET_PARAMETERS['nframe_range']) 43 | face_dataset = DATASET_PARAMETERS['face_dataset'](face_list) 44 | 45 | print('Preparing the dataloaders...') 46 | collate_fn = DATASET_PARAMETERS['collate_fn'](DATASET_PARAMETERS['nframe_range']) 47 | collate_fn_4 = DATASET_PARAMETERS['collate_fn_4'](DATASET_PARAMETERS['nframe_range']) 48 | voice_loader = DataLoader(voice_dataset, shuffle=True, drop_last=True, 49 | batch_size=DATASET_PARAMETERS['batch_size'], 50 | num_workers=DATASET_PARAMETERS['workers_num'], 51 | collate_fn=collate_fn_4) 52 | face_loader = DataLoader(face_dataset, shuffle=True, drop_last=True, 53 | batch_size=DATASET_PARAMETERS['batch_size'], 54 | num_workers=DATASET_PARAMETERS['workers_num']) 55 | 56 | voice_iterator = iter(cycle_4(voice_loader)) 57 | face_iterator = iter(cycle(face_loader)) 58 | 59 | # networks, Fe, Fg, Fd (f+d), Fc (f+c) 60 | print('Initializing networks...') 61 | e_net, e_optimizer = get_network('e', NETWORKS_PARAMETERS, train=False) 62 | g_net, g_optimizer = get_network('g', NETWORKS_PARAMETERS, train=True) 63 | f_net, f_optimizer = get_network('f', NETWORKS_PARAMETERS, train=True) 64 | d_net, d_optimizer = get_network('d', NETWORKS_PARAMETERS, train=True) 65 | c_net, c_optimizer = get_network('c', NETWORKS_PARAMETERS, train=True) 66 | 67 | # for image to 3D part 68 | image3D_pretrained = SynergyNet(pretrained=True).cuda().eval() 69 | image3D = SynergyNet().cuda() 70 | up_layer = torch.nn.Upsample((120,120), mode='bilinear', align_corners=True) 71 | dis_optimizer = optim.Adam(image3D.parameters(), lr=0.0002, betas=(0.5, 0.999)) 72 | g_optimizer = optim.Adam(list(g_net.parameters())+list(image3D.parameters()), lr=0.0002, betas=(0.5, 0.999)) 73 | voice_list = sorted(glob.glob('data/val_sub/*')) 74 | tri = sio.loadmat('./train.configs/tri.mat')['tri'] 75 | 76 | # distiller zoo- we use PKT here; refer to the zoo for more options. 
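# Annotation (not in the original file): PKT, defined in distiller_zoo.py, turns a batch of
# feature vectors into a pairwise cosine-similarity distribution and penalises the divergence
# between the student's and the teacher's distributions.
# A minimal usage sketch, kept as comments; the shapes below are illustrative assumptions,
# not values taken from the training configuration:
#
#   teacher_feat = torch.randn(4, 128)            # e.g. pooled features from the pretrained SynergyNet
#   student_feat = torch.randn(4, 128)            # matching features from the trainable image3D branch
#   kt_loss = PKT()(student_feat, teacher_feat)   # scalar similarity-matching loss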
77 | distiller = PKT() 78 | tripLoss = torch.nn.TripletMarginLoss() 79 | 80 | # label for real/fake faces 81 | real_label = torch.full((DATASET_PARAMETERS['batch_size'], 1), 1).float() 82 | fake_label = torch.full((DATASET_PARAMETERS['batch_size'], 1), 0).float() 83 | 84 | # Meters for recording the training status 85 | iteration = Meter('Iter', 'sum', ':5d') 86 | data_time = Meter('Data', 'sum', ':4.2f') 87 | batch_time = Meter('Time', 'sum', ':4.2f') 88 | D_real = Meter('D_real', 'avg', ':3.2f') 89 | D_fake = Meter('D_fake', 'avg', ':3.2f') 90 | C_real = Meter('C_real', 'avg', ':3.2f') 91 | GD_fake = Meter('G_D_fake', 'avg', ':3.2f') 92 | GC_fake = Meter('G_C_fake', 'avg', ':3.2f') 93 | Distill = Meter('Distill', 'avg', ':3.2f') 94 | Trip = Meter('Triplet', 'avg', ':3.2f') 95 | 96 | # Validation point set 97 | 98 | print('Training models...') 99 | for it in range(50000): 100 | # data 101 | start_time = time.time() 102 | 103 | voice, voice_label, voice_p, voice_n = next(voice_iterator) 104 | face, face_label = next(face_iterator) 105 | noise = 0.05*torch.randn(DATASET_PARAMETERS['batch_size'], 64, 1, 1) 106 | 107 | # use GPU or not 108 | if NETWORKS_PARAMETERS['GPU']: 109 | voice, voice_label = voice.cuda(), voice_label.cuda() 110 | face, face_label = face.cuda(), face_label.cuda() 111 | real_label, fake_label = real_label.cuda(), fake_label.cuda() 112 | noise = noise.cuda() 113 | voice_p, voice_n = voice_p.cuda(), voice_n.cuda() 114 | data_time.update(time.time() - start_time) 115 | 116 | # get embeddings and generated faces 117 | embeddings = e_net(voice) 118 | embeddings = F.normalize(embeddings) 119 | # introduce some permutations 120 | embeddings = embeddings + noise 121 | embeddings = F.normalize(embeddings) 122 | fake = g_net(embeddings) 123 | 124 | # get embeddings and generated faces 125 | embeddings_p = e_net(voice_p) 126 | embeddings_p = F.normalize(embeddings_p) 127 | # introduce some permutations 128 | embeddings_p = embeddings_p + noise 129 | embeddings_p = F.normalize(embeddings_p) 130 | fake_p = g_net(embeddings_p) 131 | 132 | # get embeddings and generated faces 133 | embeddings_n = e_net(voice_n) 134 | embeddings_n = F.normalize(embeddings_n) 135 | # introduce some permutations 136 | embeddings_n = embeddings_n + noise 137 | embeddings_n = F.normalize(embeddings_n) 138 | fake_n = g_net(embeddings_n) 139 | 140 | # Discriminator 141 | f_optimizer.zero_grad() 142 | d_optimizer.zero_grad() 143 | c_optimizer.zero_grad() 144 | real_score_out = d_net(f_net(face)) 145 | fake_score_out = d_net(f_net(fake.detach())) 146 | real_label_out = c_net(f_net(face)) 147 | D_real_loss = F.binary_cross_entropy(torch.sigmoid(real_score_out), real_label) 148 | D_fake_loss = F.binary_cross_entropy(torch.sigmoid(fake_score_out), fake_label) 149 | C_real_loss = F.nll_loss(F.log_softmax(real_label_out, 1), face_label) 150 | D_real.update(D_real_loss.item()) 151 | D_fake.update(D_fake_loss.item()) 152 | C_real.update(C_real_loss.item()) 153 | (D_real_loss + D_fake_loss + C_real_loss).backward() 154 | f_optimizer.step() 155 | d_optimizer.step() 156 | c_optimizer.step() 157 | 158 | 159 | ## Joint training 160 | g_optimizer.zero_grad() 161 | fake_score_out = d_net(f_net(fake)) 162 | fake_label_out = c_net(f_net(fake)) 163 | face_image = up_layer(fake) 164 | face_image_p = up_layer(fake_p) 165 | face_image_n = up_layer(fake_n) 166 | prediction_pre, pool_pre, inter_pre = image3D_pretrained(face_image, return_interFeature=True) 167 | prediction, pool, inter = image3D(face_image, return_interFeature=True) 
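# Annotation (not in the original file): the pretrained SynergyNet acts as the teacher and the
# trainable image3D branch as the student; their meshes (prediction_pre vs. prediction) and
# intermediate features (pool/inter) are compared in distill_loss below, while the positive and
# negative meshes computed next feed the triplet loss on the generated faces.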
168 | prediction_p = image3D(face_image_p) 169 | prediction_n = image3D(face_image_n) 170 | 171 | GD_fake_loss = F.binary_cross_entropy(torch.sigmoid(fake_score_out), real_label) 172 | GC_fake_loss = F.nll_loss(F.log_softmax(fake_label_out, 1), voice_label) 173 | # distillation loss 174 | distill_loss = 0.5 * F.mse_loss(prediction_pre, prediction) + 10000*(distiller(pool_pre, pool) + distiller(inter_pre.view(inter_pre.shape[0],-1), inter.view(inter.shape[0],-1))) 175 | # triplet loss 176 | triplet_loss = 1.5 * tripLoss(prediction, prediction_p, prediction_n) 177 | (GD_fake_loss + GC_fake_loss + distill_loss + triplet_loss).backward() 178 | GD_fake.update(GD_fake_loss) 179 | GC_fake.update(GC_fake_loss.item()) 180 | Distill.update(distill_loss.item()) 181 | Trip.update(triplet_loss.item()) 182 | g_optimizer.step() 183 | 184 | batch_time.update(time.time() - start_time) 185 | 186 | # print status 187 | if it % 2000 == 0: 188 | msg = str(iteration)+str(data_time)+str(batch_time)+str(D_real)+str(D_fake)+str(C_real)+str(GD_fake)+str(GC_fake)+str(Distill)+str(Trip) 189 | 190 | logging.info(msg) 191 | 192 | data_time.reset() 193 | batch_time.reset() 194 | D_real.reset() 195 | D_fake.reset() 196 | C_real.reset() 197 | GD_fake.reset() 198 | GC_fake.reset() 199 | Distill.reset() 200 | Trip.reset() 201 | 202 | e_net.eval() 203 | g_net.eval() 204 | image3D.eval() 205 | fore_err, cheek_err, ear_err, mid_err = [],[],[],[] 206 | 207 | for folder in voice_list: 208 | name = folder.split('/',1)[-1] 209 | all_fbanks = glob.glob(folder+'/*.npy') 210 | target_pts = read_xyz(glob.glob('data/AtoE_sub/'+name+'/*.xyz')[0]) 211 | 212 | target_OICD = np.linalg.norm(target_pts[2217]-target_pts[14607]) 213 | target_foreD = np.linalg.norm(target_pts[1678]-target_pts[42117]) 214 | target_cheekD = np.linalg.norm(target_pts[2294]-target_pts[13635]) 215 | target_earD = np.linalg.norm(target_pts[20636]-target_pts[34153]) 216 | target_midD = np.linalg.norm(target_pts[2130]-target_pts[15003]) 217 | 218 | target_foreOICD = target_foreD/target_OICD 219 | target_cheekOICD = target_cheekD/target_OICD 220 | target_earOICD = target_earD/target_OICD 221 | target_midOICD = target_midD/target_OICD 222 | 223 | for fbank in all_fbanks: 224 | face_image = voice2face_processed(e_net, g_net, fbank,NETWORKS_PARAMETERS['GPU']) 225 | face_image = up_layer(face_image) 226 | pred_pts = image3D(face_image)[0].squeeze().transpose(1,0).detach().cpu() 227 | 228 | # simple validation 229 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607]) 230 | pred_pts *= (target_OICD/pred_OICD) 231 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607]) 232 | pred_midD = np.linalg.norm(pred_pts[2130]-pred_pts[15003]) 233 | pred_foreD = np.linalg.norm(pred_pts[1678]-pred_pts[42117]) 234 | pred_cheekD = np.linalg.norm(pred_pts[2294]-pred_pts[13635]) 235 | pred_earD = np.linalg.norm(pred_pts[20636]-pred_pts[34153]) 236 | 237 | pred_midOICD = pred_midD/pred_OICD 238 | pred_foreOICD = pred_foreD/pred_OICD 239 | pred_cheekOICD = pred_cheekD/pred_OICD 240 | pred_earOICD = pred_earD/pred_OICD 241 | 242 | fore_err.append(abs(pred_foreOICD-target_foreOICD)) 243 | cheek_err.append(abs(pred_cheekOICD-target_cheekOICD)) 244 | ear_err.append(abs(pred_earOICD-target_earOICD)) 245 | mid_err.append(abs(pred_midOICD-target_midOICD)) 246 | 247 | fore_err_mean, cheek_err_mean, ear_err_mean, mid_err_mean = mean(fore_err), mean(cheek_err), mean(ear_err), mean(mid_err) 248 | val_msg = f'Val forehead: {fore_err_mean:.4f}, cheek: {cheek_err_mean:.4f}, ear: 
{ear_err_mean:.4f}, mid: {mid_err_mean:.4f}' 249 | 250 | logging.info(val_msg) 251 | 252 | # reset to train 253 | e_net.train() 254 | g_net.train() 255 | image3D.train() 256 | 257 | # snapshot 258 | save_model(g_net, NETWORKS_PARAMETERS['g']['model_path'][:-4]+'_'+str(it)+'.pth') 259 | save_model(image3D, NETWORKS_PARAMETERS['image3D']['model_path'][:-4]+'_'+str(it)+'.pth') 260 | 261 | iteration.update(1) 262 | 263 | -------------------------------------------------------------------------------- /mfcc.py: -------------------------------------------------------------------------------- 1 | """ This code is from 2 | https://github.com/skerit/cmusphinx/blob/master/SphinxTrain/python/cmusphinx/mfcc.py 3 | We fix some bugs and modify the pre-emphasis 4 | """ 5 | 6 | # Copyright (c) 2006 Carnegie Mellon University 7 | # 8 | # You may copy and modify this freely under the same terms as 9 | # Sphinx-III 10 | 11 | """Compute MFCC coefficients. 12 | 13 | This module provides functions for computing MFCC (mel-frequency 14 | cepstral coefficients) as used in the Sphinx speech recognition 15 | system. 16 | """ 17 | 18 | __author__ = "David Huggins-Daines " 19 | __version__ = "$Revision$" 20 | 21 | 22 | 23 | import numpy, numpy.fft 24 | 25 | def mel(f): 26 | return 2595. * numpy.log10(1. + f / 700.) 27 | 28 | def melinv(m): 29 | return 700. * (numpy.power(10., m / 2595.) - 1.) 30 | 31 | class MFCC(object): 32 | def __init__(self, nfilt=40, ncep=13, 33 | lowerf=133.3333, upperf=6855.4976, alpha=0.97, 34 | samprate=16000, frate=100, wlen=0.0256, 35 | nfft=512): 36 | # Store parameters 37 | self.lowerf = lowerf 38 | self.upperf = upperf 39 | self.nfft = nfft 40 | self.ncep = ncep 41 | self.nfilt = nfilt 42 | self.frate = frate 43 | self.fshift = float(samprate) / frate 44 | 45 | # Build Hamming window 46 | self.wlen = int(wlen * samprate) 47 | self.win = numpy.hamming(self.wlen) 48 | 49 | # Prior sample for pre-emphasis 50 | self.prior = 0 51 | self.alpha = alpha 52 | 53 | # Build mel filter matrix 54 | self.filters = numpy.zeros((int(nfft/2)+1,nfilt), 'd') 55 | dfreq = float(samprate) / nfft 56 | if upperf > samprate/2: 57 | raise(Exception, 58 | "Upper frequency %f exceeds Nyquist %f" % (upperf, samprate/2)) 59 | melmax = mel(upperf) 60 | melmin = mel(lowerf) 61 | dmelbw = (melmax - melmin) / (nfilt + 1) 62 | # Filter edges, in Hz 63 | filt_edge = melinv(melmin + dmelbw * numpy.arange(nfilt + 2, dtype='d')) 64 | 65 | for whichfilt in range(0, nfilt): 66 | # Filter triangles, in DFT points 67 | leftfr = int(round(filt_edge[whichfilt] / dfreq)) 68 | centerfr = int(round(filt_edge[whichfilt + 1] / dfreq)) 69 | rightfr = int(round(filt_edge[whichfilt + 2] / dfreq)) 70 | # For some reason this is calculated in Hz, though I think 71 | # it doesn't really matter 72 | fwidth = (rightfr - leftfr) * dfreq 73 | height = 2. 
/ fwidth 74 | 75 | if centerfr != leftfr: 76 | leftslope = height / (centerfr - leftfr) 77 | else: 78 | leftslope = 0 79 | freq = leftfr + 1 80 | while freq < centerfr: 81 | self.filters[freq,whichfilt] = (freq - leftfr) * leftslope 82 | freq = freq + 1 83 | if freq == centerfr: # This is always true 84 | self.filters[freq,whichfilt] = height 85 | freq = freq + 1 86 | if centerfr != rightfr: 87 | rightslope = height / (centerfr - rightfr) 88 | while freq < rightfr: 89 | self.filters[freq,whichfilt] = (freq - rightfr) * rightslope 90 | freq = freq + 1 91 | 92 | # Build DCT matrix 93 | self.s2dct = s2dctmat(nfilt, ncep, 1./nfilt) 94 | self.dct = dctmat(nfilt, ncep, numpy.pi/nfilt) 95 | 96 | def sig2s2mfc(self, sig): 97 | nfr = int(len(sig) / self.fshift + 1) 98 | mfcc = numpy.zeros((nfr, self.ncep), 'd') 99 | fr = 0 100 | while fr < nfr: 101 | start = round(fr * self.fshift) 102 | end = min(len(sig), start + self.wlen) 103 | frame = sig[start:end] 104 | if len(frame) < self.wlen: 105 | frame = numpy.resize(frame,self.wlen) 106 | frame[self.wlen:] = 0 107 | mfcc[fr] = self.frame2s2mfc(frame) 108 | fr = fr + 1 109 | return mfcc 110 | 111 | def sig2logspec(self, sig): 112 | nfr = int(len(sig) / self.fshift + 1) 113 | mfcc = numpy.zeros((nfr, self.nfilt), 'd') 114 | fr = 0 115 | while fr < nfr: 116 | start = round(fr * self.fshift) 117 | end = min(len(sig), start + self.wlen) 118 | frame = sig[start:end] 119 | if len(frame) < self.wlen: 120 | frame = numpy.resize(frame,self.wlen) 121 | frame[self.wlen:] = 0 122 | mfcc[fr] = self.frame2logspec(frame) 123 | fr = fr + 1 124 | return mfcc 125 | 126 | def pre_emphasis(self, frame): 127 | ''' 128 | # FIXME: Do this with matrix multiplication 129 | outfr = numpy.empty(len(frame), 'd') 130 | outfr[0] = frame[0] - self.alpha * self.prior 131 | for i in range(1,len(frame)): 132 | outfr[i] = frame[i] - self.alpha * frame[i-1] 133 | self.prior = frame[-1] 134 | ''' 135 | # NOTE: slightly different pre-emphasis for speed up 136 | frame = numpy.insert(frame, 0, self.prior) 137 | self.prior = frame[-1] 138 | return frame[1:] - self.alpha * frame[:-1] 139 | 140 | def frame2logspec(self, frame): 141 | frame = self.pre_emphasis(frame) * self.win 142 | fft = numpy.fft.rfft(frame, self.nfft) 143 | # Square of absolute value 144 | power = fft.real * fft.real + fft.imag * fft.imag 145 | return numpy.log(numpy.dot(power, self.filters).clip(1e-5,numpy.inf)) 146 | 147 | def frame2s2mfc(self, frame): 148 | logspec = self.frame2logspec(frame) 149 | return numpy.dot(logspec, self.s2dct.T) / self.nfilt 150 | 151 | def s2dctmat(nfilt,ncep,freqstep): 152 | """Return the 'legacy' not-quite-DCT matrix used by Sphinx""" 153 | melcos = numpy.empty((ncep, nfilt), 'double') 154 | for i in range(0,ncep): 155 | freq = numpy.pi * float(i) / nfilt 156 | melcos[i] = numpy.cos(freq * numpy.arange(0.5, float(nfilt)+0.5, 1.0, 'double')) 157 | melcos[:,0] = melcos[:,0] * 0.5 158 | return melcos 159 | 160 | def logspec2s2mfc(logspec, ncep=13): 161 | """Convert log-power-spectrum bins to MFCC using the 'legacy' 162 | Sphinx transform""" 163 | nframes, nfilt = logspec.shape 164 | melcos = s2dctmat(nfilt, ncep, 1./nfilt) 165 | return numpy.dot(logspec, melcos.T) / nfilt 166 | 167 | def dctmat(N,K,freqstep,orthogonalize=True): 168 | """Return the orthogonal DCT-II/DCT-III matrix of size NxK. 
169 | For computing or inverting MFCCs, N is the number of 170 | log-power-spectrum bins while K is the number of cepstra.""" 171 | cosmat = numpy.zeros((N, K), 'double') 172 | for n in range(0,N): 173 | for k in range(0, K): 174 | cosmat[n,k] = numpy.cos(freqstep * (n + 0.5) * k) 175 | if orthogonalize: 176 | cosmat[:,0] = cosmat[:,0] * 1./numpy.sqrt(2) 177 | return cosmat 178 | 179 | def dct(input, K=13): 180 | """Convert log-power-spectrum to MFCC using the orthogonal DCT-II""" 181 | nframes, N = input.shape 182 | freqstep = numpy.pi / N 183 | cosmat = dctmat(N,K,freqstep) 184 | return numpy.dot(input, cosmat) * numpy.sqrt(2.0 / N) 185 | 186 | def dct2(input, K=13): 187 | """Convert log-power-spectrum to MFCC using the normalized DCT-II""" 188 | nframes, N = input.shape 189 | freqstep = numpy.pi / N 190 | cosmat = dctmat(N,K,freqstep,False) 191 | return numpy.dot(input, cosmat) * (2.0 / N) 192 | 193 | def idct(input, K=40): 194 | """Convert MFCC to log-power-spectrum using the orthogonal DCT-III""" 195 | nframes, N = input.shape 196 | freqstep = numpy.pi / K 197 | cosmat = dctmat(K,N,freqstep).T 198 | return numpy.dot(input, cosmat) * numpy.sqrt(2.0 / K) 199 | 200 | def dct3(input, K=40): 201 | """Convert MFCC to log-power-spectrum using the unnormalized DCT-III""" 202 | nframes, N = input.shape 203 | freqstep = numpy.pi / K 204 | cosmat = dctmat(K,N,freqstep,False) 205 | cosmat[:,0] = cosmat[:,0] * 0.5 206 | return numpy.dot(input, cosmat.T) 207 | -------------------------------------------------------------------------------- /network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torch.nn.functional as F 5 | from backbone_nets import mobilenetv2_backbone 6 | 7 | class VoiceEmbedNet(nn.Module): 8 | def __init__(self, input_channel, channels, output_channel): 9 | super(VoiceEmbedNet, self).__init__() 10 | self.model = nn.Sequential( 11 | nn.Conv1d(input_channel, channels[0], 3, 2, 1, bias=False), 12 | nn.BatchNorm1d(channels[0], affine=True), 13 | nn.ReLU(inplace=True), 14 | nn.Conv1d(channels[0], channels[1], 3, 2, 1, bias=False), 15 | nn.BatchNorm1d(channels[1], affine=True), 16 | nn.ReLU(inplace=True), 17 | nn.Conv1d(channels[1], channels[2], 3, 2, 1, bias=False), 18 | nn.BatchNorm1d(channels[2], affine=True), 19 | nn.ReLU(inplace=True), 20 | nn.Conv1d(channels[2], channels[3], 3, 2, 1, bias=False), 21 | nn.BatchNorm1d(channels[3], affine=True), 22 | nn.ReLU(inplace=True), 23 | nn.Conv1d(channels[3], output_channel, 3, 2, 1, bias=True), 24 | ) 25 | 26 | def forward(self, x): 27 | x = self.model(x) 28 | x = F.avg_pool1d(x, x.size()[2], stride=1) 29 | x = x.view(x.size()[0], -1, 1, 1) 30 | return x 31 | 32 | class Generator(nn.Module): 33 | def __init__(self, input_channel, channels, output_channel): 34 | super(Generator, self).__init__() 35 | self.model = nn.Sequential( 36 | nn.ConvTranspose2d(input_channel, channels[0], 4, 1, 0, bias=True), 37 | nn.ReLU(inplace=True), 38 | nn.ConvTranspose2d(channels[0], channels[1], 4, 2, 1, bias=True), 39 | nn.ReLU(inplace=True), 40 | nn.ConvTranspose2d(channels[1], channels[2], 4, 2, 1, bias=True), 41 | nn.ReLU(inplace=True), 42 | nn.ConvTranspose2d(channels[2], channels[3], 4, 2, 1, bias=True), 43 | nn.ReLU(inplace=True), 44 | nn.ConvTranspose2d(channels[3], channels[4], 4, 2, 1, bias=True), 45 | nn.ReLU(inplace=True), 46 | nn.ConvTranspose2d(channels[4], output_channel, 1, 1, 0, bias=True), 47 | ) 48 | def forward(self, 
x): 49 | x = self.model(x) 50 | return x 51 | 52 | class FaceEmbedNet(nn.Module): 53 | def __init__(self, input_channel, channels, output_channel): 54 | super(FaceEmbedNet, self).__init__() 55 | self.model = nn.Sequential( 56 | nn.Conv2d(input_channel, channels[0], 1, 1, 0, bias=True), 57 | nn.LeakyReLU(0.2, inplace=True), 58 | nn.Conv2d(channels[0], channels[1], 4, 2, 1, bias=True), 59 | nn.LeakyReLU(0.2, inplace=True), 60 | nn.Conv2d(channels[1], channels[2], 4, 2, 1, bias=True), 61 | nn.LeakyReLU(0.2, inplace=True), 62 | nn.Conv2d(channels[2], channels[3], 4, 2, 1, bias=True), 63 | nn.LeakyReLU(0.2, inplace=True), 64 | nn.Conv2d(channels[3], channels[4], 4, 2, 1, bias=True), 65 | nn.LeakyReLU(0.2, inplace=True), 66 | nn.Conv2d(channels[4], output_channel, 4, 1, 0, bias=True), 67 | ) 68 | 69 | def forward(self, x): 70 | x = self.model(x) 71 | return x 72 | 73 | class Classifier(nn.Module): 74 | def __init__(self, input_channel, channels, output_channel): 75 | super(Classifier, self).__init__() 76 | self.model = nn.Linear(input_channel, output_channel, bias=False) 77 | 78 | def forward(self, x): 79 | x = x.view(x.size()[0], -1) 80 | x = self.model(x) 81 | return x 82 | 83 | def get_network(net_type, params, train=True): 84 | net_params = params[net_type] 85 | net = net_params['network'](net_params['input_channel'], 86 | net_params['channels'], 87 | net_params['output_channel']) 88 | if params['GPU']: 89 | net.cuda() 90 | 91 | if train: 92 | net.train() 93 | optimizer = optim.Adam(net.parameters(), 94 | lr=params['lr'], 95 | betas=(params['beta1'], params['beta2'])) 96 | else: 97 | net.eval() 98 | net.load_state_dict(torch.load(net_params['model_path'])) 99 | optimizer = None 100 | return net, optimizer 101 | 102 | # SynergyNet module definition 103 | class SynergyNet(nn.Module): 104 | '''Definition of the 2D-to-3D part''' 105 | def __init__(self, pretrained=False, last_CN=None): 106 | super(SynergyNet, self).__init__() 107 | self.backbone = getattr(mobilenetv2_backbone, 'mobilenet_v2')(last_CN=last_CN) 108 | 109 | # load the pretrained model for 2D-to-3D 110 | ckpt = torch.load('pretrained_models/2D-to-3D-pretrained.tar')['state_dict'] 111 | model_dict = self.backbone.state_dict() 112 | for k,v in ckpt.items(): 113 | if 'IGM' in k: 114 | name_reduced = k.split('.',3)[-1] 115 | model_dict[name_reduced] = v 116 | 117 | if pretrained: # SynergyNet pretrain 118 | self.backbone.load_state_dict(model_dict) 119 | 120 | # 3DMM parameters and whitening parameters 121 | self.param_std = ckpt['module.param_std'] 122 | self.param_mean = ckpt['module.param_mean'] 123 | self.w_shp = ckpt['module.w_shp'] 124 | self.w_exp = ckpt['module.w_exp'] 125 | self.u = ckpt['module.u'].unsqueeze(0) 126 | 127 | def forward(self, input, return_onlypose=False, return_interFeature=False): 128 | _3D_attr, pool_x, inter = self.backbone(input) 129 | if return_onlypose: 130 | # only return pose 131 | return _3D_attr[:,:12] * self.param_std[:12] + self.param_mean[:12] 132 | else: 133 | # return dense face mesh 134 | _3D_face = self.reconstruct_vertex(_3D_attr, dense=True) 135 | if return_interFeature: 136 | return _3D_face, pool_x, inter 137 | return _3D_face 138 | 139 | def reconstruct_vertex(self, param, whitening=True, dense=False): 140 | ''' 141 | Whitening param -> 3d vertex, based on the 3dmm param: u_base, w_shp, w_exp 142 | dense: if True, return dense vertex, else return 68 sparse landmarks. 143 | Working with batched tensors. Using Fortran-type reshape. 
144 | ''' 145 | # 12 transformation + 40 shape + 10 expr + 40 (discarded) texture 146 | if whitening: 147 | if param.shape[1] == 102: 148 | param_ = param * self.param_std + self.param_mean 149 | else: 150 | raise RuntimeError('length of params mismatch') 151 | p, _, alpha_shp, alpha_exp = self.parse_param_102(param_) 152 | _, s = self.p_to_Rs(p) 153 | 154 | # frontal mesh construction with 53215 vertics (BFM Face) 155 | if dense: 156 | vertex = s.unsqueeze(1).unsqueeze(1)*(self.u + self.w_shp @ alpha_shp + self.w_exp @ alpha_exp).squeeze().contiguous().view(-1, 53215, 3).transpose(1,2) 157 | else: 158 | raise NotImplementedError("Only dense mesh reconstruction supported") 159 | 160 | return vertex 161 | 162 | def parse_param_102(self, param): 163 | ''' Parse param into 3DMM semantics''' 164 | p_ = param[:, :12].reshape(-1, 3, 4) 165 | p = p_[:, :, :3] 166 | offset = p_[:, :, -1].reshape(-1, 3, 1) 167 | alpha_shp = param[:, 12:52].reshape(-1, 40, 1) 168 | alpha_exp = param[:, 52:62].reshape(-1, 10, 1) 169 | return p, offset, alpha_shp, alpha_exp 170 | 171 | def parse_param_102_pose(self, param): 172 | ''' Parse only pose params''' 173 | p_ = param[:, :12].reshape(-1, 3, 4) 174 | p = p_[:, :, :3] 175 | R, s = self.p_to_Rs(p) 176 | offset = p_[:, :, -1].reshape(-1, 3, 1) 177 | return R, offset 178 | 179 | def p_to_Rs(self, R): 180 | '''Convert P to R and s as in 3DDFA-V2''' 181 | s = (R[:, 0, :3].norm(dim=1) + R[:, 1, :3].norm(dim=1))/2.0 182 | return F.normalize(R, p=2, dim=2), s 183 | 184 | class Generator1D_directMLP(nn.Module): 185 | def __init__(self): 186 | super(Generator1D_directMLP, self).__init__() 187 | 188 | # building classifier 189 | self.num_scale = 1 190 | self.num_shape = 40 191 | self.num_exp = 10 192 | self.last_channel = 64 193 | 194 | self.classifier_scale = nn.Sequential( 195 | nn.Dropout(0.2), 196 | nn.Linear(self.last_channel, self.num_scale), 197 | ) 198 | self.classifier_shape = nn.Sequential( 199 | nn.Dropout(0.2), 200 | nn.Linear(self.last_channel, self.num_shape), 201 | ) 202 | self.classifier_exp = nn.Sequential( 203 | nn.Dropout(0.2), 204 | nn.Linear(self.last_channel, self.num_exp), 205 | ) 206 | 207 | ckpt = torch.load('pretrained_models/2D-to-3D-pretrained.tar')['state_dict'] 208 | print('Loading whitening parameters from: models/2D-to-3D-pretrained.tar') 209 | self.param_std = ckpt['module.param_std'] 210 | self.param_mean = ckpt['module.param_mean'] 211 | self.w_shp = ckpt['module.w_shp'] 212 | self.w_exp = ckpt['module.w_exp'] 213 | self.u = ckpt['module.u'].unsqueeze(0) 214 | 215 | def forward_test(self, x): 216 | """return mesh 217 | """ 218 | x = x.reshape(x.shape[0], -1) 219 | x_scale = self.classifier_scale(x) 220 | x_shape = self.classifier_shape(x) 221 | x_exp = self.classifier_exp(x) 222 | _3D_attr = torch.cat((x_scale, x_shape, x_exp), dim=1) 223 | _3D_face = self.reconstruct_vertex_51_onlyDeform(_3D_attr, dense=True) 224 | return _3D_face 225 | 226 | def forward_test_param(self, x): 227 | """return 3dmm parameters 228 | """ 229 | x = x.reshape(x.shape[0], -1) 230 | x_scale = self.classifier_scale(x) 231 | x_shape = self.classifier_shape(x) 232 | x_exp = self.classifier_exp(x) 233 | _3D_attr = torch.cat((x_scale, x_shape, x_exp), dim=1) 234 | return _3D_attr 235 | 236 | def reconstruct_vertex_51_onlyDeform(self, param, whitening=True, dense=False): 237 | """51 = 1 (scale) + 40 (shape) + 10 (expr) 238 | """ 239 | if whitening: 240 | if param.shape[1] == 51: # manually mine out whitening params for scale 241 | s = (param[:, 
0]*1.538597731841497e-05) + 0.0005920184194110334 242 | param_ = param[:, 1:] * self.param_std[12:62] + self.param_mean[12:62] 243 | else: 244 | raise RuntimeError('length of params mismatch') 245 | alpha_shp, alpha_exp = self.parse_param_50(param_) 246 | if dense: 247 | # since we are predicting 3D face from speech 248 | # only use scale, do not use rotation nor translation 249 | vertex = s.unsqueeze(1).unsqueeze(1)*(self.u + self.w_shp @ alpha_shp + self.w_exp @ alpha_exp).squeeze().contiguous().view(-1, 53215, 3).transpose(1,2) 250 | return vertex 251 | 252 | def parse_param_50(self, param): 253 | """Work for only tensor""" 254 | alpha_shp = param[:, :40].reshape(-1, 40, 1) 255 | alpha_exp = param[:, 40:50].reshape(-1, 10, 1) 256 | return alpha_shp, alpha_exp -------------------------------------------------------------------------------- /parse_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def parse_metafile(meta_file): 4 | with open(meta_file, 'r') as f: 5 | lines = f.readlines()[1:] 6 | celeb_ids = {} 7 | for line in lines: 8 | ID, name, _, _, _ = line.rstrip().split('\t') 9 | celeb_ids[ID] = name 10 | return celeb_ids 11 | 12 | def get_labels(voice_list, face_list): 13 | voice_names = {item['name'] for item in voice_list} 14 | face_names = {item['name'] for item in face_list} 15 | names = voice_names & face_names 16 | 17 | voice_list = [item for item in voice_list if item['name'] in names] 18 | face_list = [item for item in face_list if item['name'] in names] 19 | 20 | names = sorted(list(names)) 21 | label_dict = dict(zip(names, range(len(names)))) 22 | for item in voice_list+face_list: 23 | item['label_id'] = label_dict[item['name']] 24 | return voice_list, face_list, len(names) 25 | 26 | 27 | def get_dataset_files(data_dir, data_ext, celeb_ids, split): 28 | data_list = [] 29 | # read data directory 30 | for root, dirs, filenames in os.walk(data_dir): 31 | for filename in filenames: 32 | if filename.endswith(data_ext): 33 | filepath = os.path.join(root, filename) 34 | # so hacky, be careful! 35 | folder = filepath[len(data_dir):].split('/')[1] 36 | celeb_name = celeb_ids.get(folder, folder) 37 | if celeb_name.startswith(tuple(split)): 38 | data_list.append({'filepath': filepath, 'name': celeb_name}) 39 | return data_list 40 | 41 | def get_dataset(data_params): 42 | celeb_ids = parse_metafile(data_params['meta_file']) 43 | 44 | voice_list = get_dataset_files(data_params['voice_dir'], 45 | data_params['voice_ext'], 46 | celeb_ids, 47 | data_params['split']) 48 | face_list = get_dataset_files(data_params['face_dir'], 49 | data_params['face_ext'], 50 | celeb_ids, 51 | data_params['split']) 52 | return get_labels(voice_list, face_list) 53 | 54 | -------------------------------------------------------------------------------- /pyaudio_recording.py: -------------------------------------------------------------------------------- 1 | import wave 2 | from dataclasses import dataclass, asdict 3 | 4 | import pyaudio 5 | 6 | 7 | @dataclass 8 | class StreamParams: 9 | format: int = pyaudio.paInt16 10 | channels: int = 2 11 | rate: int = 44100 12 | frames_per_buffer: int = 1024 13 | input: bool = True 14 | output: bool = False 15 | 16 | def to_dict(self) -> dict: 17 | return asdict(self) 18 | 19 | 20 | class Recorder: 21 | """Recorder uses the blocking I/O facility from pyaudio to record sound 22 | from mic. 
23 | Attributes: 24 | - stream_params: StreamParams object with values for pyaudio Stream 25 | object 26 | """ 27 | def __init__(self, stream_params: StreamParams) -> None: 28 | self.stream_params = stream_params 29 | self._pyaudio = None 30 | self._stream = None 31 | self._wav_file = None 32 | 33 | def record(self, duration: int, save_path: str) -> None: 34 | """Record sound from mic for a given amount of seconds. 35 | :param duration: Number of seconds we want to record for 36 | :param save_path: Where to store recording 37 | """ 38 | print(f"Start recording for {duration} seconds...") 39 | self._create_recording_resources(save_path) 40 | self._write_wav_file_reading_from_stream(duration) 41 | self._close_recording_resources() 42 | print("Stop recording") 43 | 44 | def _create_recording_resources(self, save_path: str) -> None: 45 | self._pyaudio = pyaudio.PyAudio() 46 | self._stream = self._pyaudio.open(**self.stream_params.to_dict()) 47 | self._create_wav_file(save_path) 48 | 49 | def _create_wav_file(self, save_path: str): 50 | self._wav_file = wave.open(save_path, "wb") 51 | self._wav_file.setnchannels(self.stream_params.channels) 52 | self._wav_file.setsampwidth(self._pyaudio.get_sample_size(self.stream_params.format)) 53 | self._wav_file.setframerate(self.stream_params.rate) 54 | 55 | def _write_wav_file_reading_from_stream(self, duration: int) -> None: 56 | for _ in range(int(self.stream_params.rate * duration / self.stream_params.frames_per_buffer)): 57 | audio_data = self._stream.read(self.stream_params.frames_per_buffer) 58 | self._wav_file.writeframes(audio_data) 59 | 60 | def _close_recording_resources(self) -> None: 61 | self._wav_file.close() 62 | self._stream.close() 63 | self._pyaudio.terminate() 64 | 65 | 66 | if __name__ == "__main__": 67 | stream_params = StreamParams() 68 | recorder = Recorder(stream_params) 69 | recorder.record(5, "audio.wav") 70 | -------------------------------------------------------------------------------- /utilf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/utilf/__init__.py -------------------------------------------------------------------------------- /utilf/render.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append('..') 4 | 5 | import cv2 6 | import numpy as np 7 | import scipy.io as sio 8 | 9 | from Sim3DR import RenderPipeline 10 | 11 | # to continuous 12 | def _to_ctype(arr): 13 | if not arr.flags.c_contiguous: 14 | return arr.copy(order='C') 15 | return arr 16 | 17 | # load BFM connectivity of triangles 18 | tri = sio.loadmat('./train.configs/tri.mat')['tri'] - 1 19 | tri = _to_ctype(tri.T).astype(np.int32) 20 | 21 | # Sim3DR definition 22 | cfg = { 23 | 'intensity_ambient': 0.3, 24 | 'color_ambient': (1, 1, 1), 25 | 'intensity_directional': 0.6, 26 | 'color_directional': (1, 1, 1), 27 | 'intensity_specular': 0.1, 28 | 'specular_exp': 5, 29 | 'light_pos': (0, 0, 5), 30 | 'view_pos': (0, 0, 5) 31 | } 32 | 33 | render_app = RenderPipeline(**cfg) 34 | 35 | def render_vert(img, vert, alpha=1.0, wfp=None): 36 | print(f'Save visualization result to {wfp}') 37 | overlap = img.copy() 38 | vert = vert.astype(np.float32) 39 | ver = _to_ctype(vert.T) # transpose 40 | overlap = render_app(ver, tri, overlap) 41 | overlap = cv2.addWeighted(img, 1 - alpha, overlap, alpha, 0) 42 | cv2.imwrite(wfp[:-4]+'.png', overlap) 43 | 44 
| -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import shutil 4 | import numpy as np 5 | import torch.nn.functional as F 6 | import pickle 7 | import os.path as osp 8 | 9 | from PIL import Image 10 | from scipy.io import wavfile 11 | from torch.utils.data.dataloader import default_collate 12 | from vad import read_wave, write_wave, frame_generator, vad_collector 13 | 14 | def make_abs_path(d): 15 | return osp.join(osp.dirname(osp.realpath(__file__)), d) 16 | 17 | def _get_suffix(filename): 18 | """a.jpg -> jpg""" 19 | pos = filename.rfind('.') 20 | if pos == -1: 21 | return '' 22 | return filename[pos + 1:] 23 | 24 | def _load(fp): 25 | suffix = _get_suffix(fp) 26 | if suffix == 'npy': 27 | return np.load(fp) 28 | elif suffix == 'pkl': 29 | return pickle.load(open(fp, 'rb')) 30 | 31 | def _load_tensor(fp, mode='cpu'): 32 | if mode.lower() == 'cpu': 33 | return torch.from_numpy(_load(fp)) 34 | elif mode.lower() == 'gpu': 35 | return torch.from_numpy(_load(fp)).cuda() 36 | 37 | def parse_param_102(param): 38 | """Work for only tensor""" 39 | p_ = param[:, :12].reshape(-1, 3, 4) 40 | p = p_[:, :, :3] 41 | offset = p_[:, :, -1].reshape(-1, 3, 1) 42 | alpha_shp = param[:, 12:52].reshape(-1, 40, 1) 43 | alpha_exp = param[:, 52:62].reshape(-1, 10, 1) 44 | alpha_tex = param[:, 62:102].reshape(-1, 40, 1) 45 | return p, offset, alpha_shp, alpha_exp, alpha_tex 46 | 47 | def to_rotation_mat_renorm(R): 48 | s = (R[:, 0, :3].norm(dim=1) + R[:, 1, :3].norm(dim=1))/2.0 49 | return F.normalize(R, p=2, dim=2), s 50 | 51 | class ParamsPack(): 52 | """3DMM configuration data loading from ./train.configs""" 53 | def __init__(self, version): 54 | data_ver = version 55 | 56 | d = make_abs_path('./train.configs') 57 | 58 | # PCA basis for shape, expression, texture 59 | self.w_shp = _load_tensor(osp.join(d, 'w_shp_{}.npy'.format(data_ver)), mode='gpu') 60 | self.w_exp = _load_tensor(osp.join(d, 'w_exp_{}.npy'.format(data_ver)), mode='gpu') 61 | #self.w_tex = torch.from_numpy(_load(osp.join(d, 'w_tex_sim.npy'))[:,:40]).cuda() 62 | 63 | # param_mean and param_std are used for re-whitening 64 | meta = _load(osp.join(d, 'param_whitening_{}.pkl'.format(data_ver))) 65 | self.param_mean = torch.from_numpy(meta.get('param_mean')).float().cuda() 66 | self.param_std = torch.from_numpy(meta.get('param_std')).float().cuda() 67 | 68 | # mean values 69 | self.u_shp = _load_tensor(osp.join(d, 'u_shp.npy'), mode='gpu') 70 | self.u_exp = _load_tensor(osp.join(d, 'u_exp.npy'), mode='gpu') 71 | #self.u_tex = _load_tensor(osp.join(d, 'u_tex.npy'), mode='gpu') 72 | self.u = self.u_shp + self.u_exp 73 | self.w = torch.cat((self.w_shp, self.w_exp), dim=1) 74 | 75 | # base vector for landmarks 76 | self.std_size = 120 77 | self.dim = self.w_shp.shape[0] // 3 78 | 79 | param_pack = ParamsPack('v201') 80 | 81 | class Meter(object): 82 | # Computes and stores the average and current value 83 | def __init__(self, name, display, fmt=':f'): 84 | self.name = name 85 | self.display = display 86 | self.fmt = fmt 87 | self.reset() 88 | 89 | def reset(self): 90 | self.val = 0 91 | self.avg = 0 92 | self.sum = 0 93 | self.count = 0 94 | 95 | def update(self, val, n=1): 96 | self.val = val 97 | self.sum += val * n 98 | self.count += n 99 | self.avg = self.sum / self.count 100 | 101 | def __str__(self): 102 | fmtstr = '{name}:{' + self.display + self.fmt + '},' 103 | return 
fmtstr.format(**self.__dict__) 104 | 105 | def get_collate_fn(nframe_range): 106 | def collate_fn(batch): 107 | min_nframe, max_nframe = nframe_range 108 | assert min_nframe <= max_nframe 109 | num_frame = np.random.randint(min_nframe, max_nframe+1) 110 | pt = np.random.randint(0, max_nframe-num_frame+1) 111 | batch = [(item[0][..., pt:pt+num_frame], item[1]) 112 | for item in batch] 113 | return default_collate(batch) 114 | return collate_fn 115 | 116 | def get_collate_fn_4(nframe_range): 117 | def collate_fn(batch): 118 | min_nframe, max_nframe = nframe_range 119 | assert min_nframe <= max_nframe 120 | num_frame = np.random.randint(min_nframe, max_nframe+1) 121 | pt = np.random.randint(0, max_nframe-num_frame+1) 122 | batch = [(item[0][..., pt:pt+num_frame], item[1], item[2][..., pt:pt+num_frame], item[3][..., pt:pt+num_frame]) for item in batch] 123 | return default_collate(batch) 124 | return collate_fn 125 | 126 | def cycle(dataloader): 127 | while True: 128 | for data, label in dataloader: 129 | yield data, label 130 | 131 | def cycle_4(dataloader): 132 | while True: 133 | for data, label, data_p, data_n in dataloader: 134 | yield data, label, data_p, data_n 135 | 136 | def save_model(net, model_path): 137 | model_dir = os.path.dirname(model_path) 138 | if not os.path.exists(model_dir): 139 | os.makedirs(model_dir) 140 | torch.save(net.state_dict(), model_path) 141 | 142 | def rm_sil(voice_file, vad_obj): 143 | """ 144 | This code snippet is basically taken from the repository 145 | 'https://github.com/wiseman/py-webrtcvad' 146 | 147 | It removes the silence clips in a speech recording 148 | """ 149 | audio, sample_rate = read_wave(voice_file) 150 | frames = frame_generator(20, audio, sample_rate) 151 | frames = list(frames) 152 | segments = vad_collector(sample_rate, 20, 50, vad_obj, frames) 153 | 154 | if os.path.exists('tmp/'): 155 | shutil.rmtree('tmp/') 156 | os.makedirs('tmp/') 157 | 158 | wave_data = [] 159 | for i, segment in enumerate(segments): 160 | segment_file = 'tmp/' + str(i) + '.wav' 161 | write_wave(segment_file, segment, sample_rate) 162 | wave_data.append(wavfile.read(segment_file)[1]) 163 | shutil.rmtree('tmp/') 164 | 165 | if wave_data: 166 | vad_voice = np.concatenate(wave_data).astype('int16') 167 | return vad_voice 168 | 169 | def get_fbank(voice, mfc_obj): 170 | # Extract log mel-spectrogra 171 | fbank = mfc_obj.sig2logspec(voice).astype('float32') 172 | 173 | # print(fbank.shape) 174 | # m=fbank.mean(axis=0) 175 | # print(m.shape) 176 | # exit() 177 | 178 | # Mean and variance normalization of each mel-frequency 179 | fbank = fbank - fbank.mean(axis=0) 180 | fbank = fbank / (fbank.std(axis=0)+np.finfo(np.float32).eps) 181 | 182 | # If the duration of a voice recording is less than 10 seconds (1000 frames), 183 | # repeat the recording until it is longer than 10 seconds and crop. 184 | full_frame_number = 1000 185 | init_frame_number = fbank.shape[0] 186 | while fbank.shape[0] < full_frame_number: 187 | fbank = np.append(fbank, fbank[0:init_frame_number], axis=0) 188 | fbank = fbank[0:full_frame_number,:] 189 | return fbank 190 | 191 | 192 | def voice2face(e_net, g_net, voice_file, vad_obj, mfc_obj, GPU=True): 193 | vad_voice = rm_sil(voice_file, vad_obj) 194 | fbank = get_fbank(vad_voice, mfc_obj) 195 | fbank = fbank.T[np.newaxis, ...] 
196 | fbank = torch.from_numpy(fbank.astype('float32')) 197 | 198 | if GPU: 199 | fbank = fbank.cuda() 200 | embedding = e_net(fbank) 201 | embedding = F.normalize(embedding) 202 | face = g_net(embedding) 203 | return face 204 | 205 | 206 | def voice2face_processed(e_net, g_net, fbank_obj, GPU=True, return_embeddings=False): 207 | fbank = np.load(fbank_obj) 208 | fbank = fbank.T[np.newaxis, ...] 209 | fbank = torch.from_numpy(fbank.astype('float32')) 210 | 211 | if GPU: 212 | fbank = fbank.cuda() 213 | embedding = e_net(fbank) 214 | embedding = F.normalize(embedding) 215 | 216 | face = g_net(embedding) 217 | 218 | if return_embeddings: 219 | return face, embedding 220 | 221 | return face 222 | 223 | def voice2face_processed_ParamOut(e_net, g_net, fbank_obj, GPU=True): 224 | fbank = np.load(fbank_obj) 225 | fbank = fbank.T[np.newaxis, ...] 226 | fbank = torch.from_numpy(fbank.astype('float32')) 227 | 228 | if GPU: 229 | fbank = fbank.cuda() 230 | embedding = e_net(fbank) 231 | embedding = F.normalize(embedding) 232 | face = g_net.forward_test(embedding) 233 | 234 | return face 235 | 236 | def voice2face_processed_MeshOut(e_net, g_net, fbank_obj, GPU=True): 237 | fbank = np.load(fbank_obj) 238 | fbank = fbank.T[np.newaxis, ...] 239 | fbank = torch.from_numpy(fbank.astype('float32')) 240 | 241 | if GPU: 242 | fbank = fbank.cuda() 243 | embedding = e_net(fbank) 244 | embedding = F.normalize(embedding) 245 | face = g_net.forward_test(embedding) 246 | 247 | return face 248 | 249 | def write_obj_with_colors(obj_name, vertices, triangles): 250 | """ 251 | write out obj mesh files. 252 | """ 253 | if obj_name.split('.')[-1] != 'obj': 254 | obj_name = obj_name + '.obj' 255 | 256 | # write obj 257 | with open(obj_name, 'w') as f: 258 | # write vertices & colors 259 | for i in range(vertices.shape[1]): 260 | s = 'v {} {} {}\n'.format(vertices[0, i], vertices[1, i], vertices[2, i]) 261 | f.write(s) 262 | 263 | # write f: ver ind/ uv ind 264 | for i in range(triangles.shape[1]): 265 | s = 'f {} {} {}\n'.format(triangles[0, i], triangles[1, i], triangles[2, i]) 266 | f.write(s) 267 | 268 | def read_obj(filename): 269 | f = open(filename) 270 | lines = f.readlines() 271 | coll = [] 272 | for l in lines: 273 | if l[0] != 'v': 274 | break 275 | comp = l.split()[1:] 276 | comp = list(map(float, comp)) 277 | coll.append(comp) 278 | 279 | a = np.asarray(coll) 280 | return a 281 | 282 | def read_xyz(filename): 283 | f = open(filename) 284 | lines = f.readlines() 285 | coll = [] 286 | for l in lines: 287 | comp = l.split() 288 | comp = list(map(float, comp)) 289 | coll.append(comp) 290 | a=np.asarray(coll) 291 | return a 292 | -------------------------------------------------------------------------------- /vad.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import contextlib 3 | import sys 4 | import wave 5 | 6 | def read_wave(path): 7 | with contextlib.closing(wave.open(path, 'rb')) as wf: 8 | num_channels = wf.getnchannels() 9 | assert num_channels == 1 10 | sample_width = wf.getsampwidth() 11 | assert sample_width == 2 12 | sample_rate = wf.getframerate() 13 | assert sample_rate in (8000, 16000, 32000) 14 | pcm_data = wf.readframes(wf.getnframes()) 15 | return pcm_data, sample_rate 16 | 17 | 18 | def write_wave(path, audio, sample_rate): 19 | with contextlib.closing(wave.open(path, 'wb')) as wf: 20 | wf.setnchannels(1) 21 | wf.setsampwidth(2) 22 | wf.setframerate(sample_rate) 23 | wf.writeframes(audio) 24 | 25 | 26 | class Frame(object): 27 | 
def __init__(self, bytes, timestamp, duration): 28 | self.bytes = bytes 29 | self.timestamp = timestamp 30 | self.duration = duration 31 | 32 | 33 | def frame_generator(frame_duration_ms, audio, sample_rate): 34 | n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) 35 | offset = 0 36 | timestamp = 0.0 37 | duration = (float(n) / sample_rate) / 2.0 38 | while offset + n < len(audio): 39 | yield Frame(audio[offset:offset + n], timestamp, duration) 40 | timestamp += duration 41 | offset += n 42 | 43 | 44 | def vad_collector(sample_rate, frame_duration_ms, 45 | padding_duration_ms, vad, frames): 46 | num_padding_frames = int(padding_duration_ms / frame_duration_ms) 47 | ring_buffer = collections.deque(maxlen=num_padding_frames) 48 | triggered = False 49 | voiced_frames = [] 50 | for frame in frames: 51 | #sys.stdout.write( 52 | # '1' if vad.is_speech(frame.bytes, sample_rate) else '0') 53 | if not triggered: 54 | ring_buffer.append(frame) 55 | num_voiced = len([f for f in ring_buffer 56 | if vad.is_speech(f.bytes, sample_rate)]) 57 | if num_voiced > 0.9 * ring_buffer.maxlen: 58 | # sys.stdout.write('+(%s)' % (ring_buffer[0].timestamp,)) 59 | triggered = True 60 | voiced_frames.extend(ring_buffer) 61 | ring_buffer.clear() 62 | else: 63 | voiced_frames.append(frame) 64 | ring_buffer.append(frame) 65 | num_unvoiced = len([f for f in ring_buffer 66 | if not vad.is_speech(f.bytes, sample_rate)]) 67 | if num_unvoiced > 0.9 * ring_buffer.maxlen: 68 | #sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) 69 | triggered = False 70 | yield b''.join([f.bytes for f in voiced_frames]) 71 | ring_buffer.clear() 72 | voiced_frames = [] 73 | if voiced_frames: 74 | yield b''.join([f.bytes for f in voiced_frames]) 75 | --------------------------------------------------------------------------------