├── .gitignore
├── LICENSE
├── README.md
├── Sim3DR
│   ├── .gitignore
│   ├── Sim3DR.py
│   ├── __init__.py
│   ├── _init_paths.py
│   ├── build_sim3dr.sh
│   ├── lib
│   │   ├── rasterize.h
│   │   ├── rasterize.pyx
│   │   └── rasterize_kernel.cpp
│   ├── lighting.py
│   ├── readme.md
│   ├── setup.py
│   └── tests
│       ├── .gitignore
│       ├── CMakeLists.txt
│       ├── io.cpp
│       ├── io.h
│       └── test.cpp
├── backbone_nets
│   └── mobilenetv2_backbone.py
├── cal_size.sh
├── cal_size_ARE.py
├── cal_size_kpts.py
├── config.py
├── data
│   ├── preprocessed_MFCC
│   │   ├── rand_id00001
│   │   │   └── 1TmvLk8sB-g
│   │   │       ├── 00001.npy
│   │   │       ├── 00002.npy
│   │   │       └── 00003.npy
│   │   ├── rand_id00002
│   │   │   └── 0XmNeUnOnlg
│   │   │       ├── 00001.npy
│   │   │       ├── 00002.npy
│   │   │       └── 00003.npy
│   │   ├── rand_id00003
│   │   │   └── 1M4q6CQM5pA
│   │   │       ├── 00001.npy
│   │   │       ├── 00002.npy
│   │   │       └── 00003.npy
│   │   ├── rand_id00004
│   │   │   └── _2wZVvsQYFg
│   │   │       ├── 00001.npy
│   │   │       ├── 00002.npy
│   │   │       └── 00003.npy
│   │   └── rand_id00005
│   │       └── 0nH78dDh0N0
│   │           ├── 00001.npy
│   │           ├── 00002.npy
│   │           └── 00003.npy
│   ├── results
│   │   ├── rand_id00001
│   │   │   ├── 1TmvLk8sB-g_00001_image.png
│   │   │   ├── 1TmvLk8sB-g_00001_overlap.png
│   │   │   ├── 1TmvLk8sB-g_00002_image.png
│   │   │   ├── 1TmvLk8sB-g_00002_overlap.png
│   │   │   ├── 1TmvLk8sB-g_00003_image.png
│   │   │   └── 1TmvLk8sB-g_00003_overlap.png
│   │   ├── rand_id00002
│   │   │   ├── 0XmNeUnOnlg_00001_image.png
│   │   │   ├── 0XmNeUnOnlg_00001_overlap.png
│   │   │   ├── 0XmNeUnOnlg_00002_image.png
│   │   │   ├── 0XmNeUnOnlg_00002_overlap.png
│   │   │   ├── 0XmNeUnOnlg_00003_image.png
│   │   │   └── 0XmNeUnOnlg_00003_overlap.png
│   │   ├── rand_id00003
│   │   │   ├── 1M4q6CQM5pA_00001_image.png
│   │   │   ├── 1M4q6CQM5pA_00001_overlap.png
│   │   │   ├── 1M4q6CQM5pA_00002_image.png
│   │   │   ├── 1M4q6CQM5pA_00002_overlap.png
│   │   │   ├── 1M4q6CQM5pA_00003_image.png
│   │   │   └── 1M4q6CQM5pA_00003_overlap.png
│   │   ├── rand_id00004
│   │   │   ├── _2wZVvsQYFg_00001_image.png
│   │   │   ├── _2wZVvsQYFg_00001_overlap.png
│   │   │   ├── _2wZVvsQYFg_00002_image.png
│   │   │   ├── _2wZVvsQYFg_00002_overlap.png
│   │   │   ├── _2wZVvsQYFg_00003_image.png
│   │   │   └── _2wZVvsQYFg_00003_overlap.png
│   │   └── rand_id00005
│   │       ├── 0nH78dDh0N0_00001_image.png
│   │       ├── 0nH78dDh0N0_00001_overlap.png
│   │       ├── 0nH78dDh0N0_00002_image.png
│   │       ├── 0nH78dDh0N0_00002_overlap.png
│   │       ├── 0nH78dDh0N0_00003_image.png
│   │       └── 0nH78dDh0N0_00003_overlap.png
│   ├── results_reference
│   │   ├── Asa_Butterfield
│   │   │   ├── 1TmvLk8sB-g_00001_img.png
│   │   │   ├── 1TmvLk8sB-g_00001_overlap.png
│   │   │   ├── 1TmvLk8sB-g_00002_img.png
│   │   │   ├── 1TmvLk8sB-g_00002_overlap.png
│   │   │   ├── 1TmvLk8sB-g_00003_img.png
│   │   │   └── 1TmvLk8sB-g_00003_overlap.png
│   │   ├── Ashley_Greene
│   │   │   ├── 0XmNeUnOnlg_00001_img.png
│   │   │   ├── 0XmNeUnOnlg_00001_overlap.png
│   │   │   ├── 0XmNeUnOnlg_00002_img.png
│   │   │   ├── 0XmNeUnOnlg_00002_overlap.png
│   │   │   ├── 0XmNeUnOnlg_00003_img.png
│   │   │   └── 0XmNeUnOnlg_00003_overlap.png
│   │   ├── Bellamy_Young
│   │   │   ├── 1M4q6CQM5pA_00001_img.png
│   │   │   ├── 1M4q6CQM5pA_00001_overlap.png
│   │   │   ├── 1M4q6CQM5pA_00002_img.png
│   │   │   ├── 1M4q6CQM5pA_00002_overlap.png
│   │   │   ├── 1M4q6CQM5pA_00003_img.png
│   │   │   └── 1M4q6CQM5pA_00003_overlap.png
│   │   ├── Bethany_Mota
│   │   │   ├── _2wZVvsQYFg_00001_img.png
│   │   │   ├── _2wZVvsQYFg_00001_overlap.png
│   │   │   ├── _2wZVvsQYFg_00002_img.png
│   │   │   ├── _2wZVvsQYFg_00002_overlap.png
│   │   │   ├── _2wZVvsQYFg_00003_img.png
│   │   │   └── _2wZVvsQYFg_00003_overlap.png
│   │   └── Eva_Longoria
│   │       ├── 0nH78dDh0N0_00001_img.png
│   │       ├── 0nH78dDh0N0_00001_overlap.png
│   │       ├── 0nH78dDh0N0_00002_img.png
│   │       ├── 0nH78dDh0N0_00002_overlap.png
│   │       ├── 0nH78dDh0N0_00003_img.png
│   │       └── 0nH78dDh0N0_00003_overlap.png
│   └── vox1_meta.csv
├── dataset.py
├── demo.py
├── demo
│   ├── coherence.png
│   ├── overall_purpose.png
│   └── supervised_comp.png
├── demo_mic.py
├── distiller_zoo.py
├── environment.yml
├── eval_sup.py
├── face_types
│   └── .placeholder
├── gan_train_cascade.py
├── mfcc.py
├── network.py
├── parse_dataset.py
├── pyaudio_recording.py
├── utilf
│   ├── __init__.py
│   └── render.py
├── utils.py
└── vad.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | #lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | pretrained_models/
132 | train.configs/
133 | data/results/
134 | *.obj
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Cho Ying Wu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cross-Modal Perceptionist
2 | Code repository for the CVPR 2022 paper "Cross-Modal Perceptionist: Can Face Geometry be Gleaned from Voices?"
3 |
4 | Cho-Ying Wu, Chin-Cheng Hsu, Ulrich Neumann, University of Southern California
5 |
6 | [Paper] [Project page] [Voxceleb-3D Data]
7 |
8 | Check the project page for the introduction of this cool work!
9 |
10 | Update: 2022/12/01 Added evaluation code, pretrained model, and execution script for the supervised framework. Organized the data structure of Voxceleb-3D.
11 |
12 | Voxceleb-3D:
13 |
14 | (1) [Here] contains data with names starting from F-Z as the training set. 100G zipped file, ~250G after unzipping. This set contains pointclouds (.xyz), reconstructed meshes overlapped on images from VGGFace (_b.jpg), and 199-dim 3DMM parameters using the BFM Face 2009 basis. This is in contrast to the simplified 3DMM basis with a 40-dim shape code followed by a 10-dim expression code. (SynergyNet follows 3DDFA-v2 and uses the first 40 of the 199 dimensions as the shape code, so the first 40 dims here correspond to SynergyNet's 40-dim shape code.) You can download the full basis from the BFM-2009 official website. There are multiple 3D faces for an identity. (A sketch of how the 199-dim parameters map to a mesh is given after item (5) below.)
15 |
16 | (2) [Here] contains data with names starting from A-E as the validation set. 300M. The format is the same except there is only one 3D face for an identity as groundtruth.
17 |
18 | (3) [Here] contains images from VGGFace we used to reconstruct 3D faces for (1) and (2)
19 |
20 | (4) [Here] contains preprocessed voice data (MFCC features) from Voxceleb for all the identities. 38G zipped file. Refer to this [meta file] to map id to name.
21 |
22 | (5) [Here] contains preprocessed voice data (MFCC features) from Voxceleb for the testing subset (A-E). You can download it for inference purposes; see the later sections.
23 |
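For illustration, a minimal sketch of how a 199-dim shape code maps to mesh vertices through a full BFM-2009 basis. The file names (`bfm_shape_mean.npy`, `bfm_shape_basis.npy`, `some_identity_params.npy`) and array shapes are assumptions made only for this example; they are not files shipped with this repository.

```python
import numpy as np

# Hypothetical file names; the actual BFM-2009 basis must be obtained from its official website.
mu = np.load('bfm_shape_mean.npy')            # assumed shape: (3*N,)  flattened mean face vertices
basis = np.load('bfm_shape_basis.npy')        # assumed shape: (3*N, 199)  full shape basis
params = np.load('some_identity_params.npy')  # hypothetical 3DMM parameter file from Voxceleb-3D
alpha = params[:199]                          # the first 199 dims are the shape code

vertices = (mu + basis @ alpha).reshape(-1, 3)           # (N, 3) reconstructed mesh vertices
# Using only the first 40 basis columns corresponds to SynergyNet's 40-dim shape code.
vertices_40 = (mu + basis[:, :40] @ alpha[:40]).reshape(-1, 3)
```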
24 |
25 |
26 |
27 |
28 |
29 |
30 | We study cross-modal learning and analyze the correlation between voices and 3D face geometry. Unlike previous methods that study the correlation between voices and faces only in the 2D domain, we choose a 3D representation that can better validate the supportive physiological evidence that voices correlate with skeletal and articulator structures, which potentially affect facial geometry.
31 |
32 | Comparison of recovered 3D face meshes with the baseline.
33 |
34 |
35 |
36 |
37 |
38 | Consistency for the same identity using different utterances.
39 |
40 |
41 |
42 |
43 |
44 | ## Demo: Preprocessed fbank
45 |
46 | We test on Ubuntu 16.04 LTS with an NVIDIA 2080 Ti (only GPU execution is supported) and use Anaconda to install packages.
47 |
48 | Install packages
49 |
50 | 1. `conda create --name CMP python=3.8`
51 | 2. Install a PyTorch build compatible with your system; we test on PyTorch v1.9 (it should be compatible with other 1.0+ versions)
52 | 3. Install other dependencies: opencv-python, scipy, Pillow (PIL), Cython, pyaudio
53 |
54 | Or use the environment.yml we provide instead:
55 | - `conda env create -f environment.yml`
56 | - `conda activate CMP`
57 |
58 | 4. Build the rendering toolkit (written in C++ and Cython) for overlapping 3D meshes on images:
59 |
60 | ```
61 | cd Sim3DR
62 | bash build_sim3dr.sh
63 | cd ..
64 | ```
65 |
66 | Download pretrained models and 3DMM configuration data
67 |
68 | 5. Download from [here] (~160M) and unzip under the root folder. This will create 'pretrained_models' (trained by unsupervised CMP) and 'train.configs' (3DMM config data) under the root folder.
69 |
70 | Read the preprocessed fbank for inference
71 |
72 | 6. `python demo.py` (This will fetch the preprocessed MFCC and use them as network inputs)
73 | 7. Results will be generated under `data/results/` (pre-generated references are under `data/results_reference`)
74 |
75 | More preprocessed MFCC and 3D mesh (3DMM params) pairs can be downloaded: [Voxceleb-3D Data] (about 100G).
76 |
77 | ## Demo: :laughing: Try it! Use device mic input
78 |
79 | 1. Do steps 1-5 above. In addition, download the face-type meshes and extract them under ./face_types
80 |
81 | 2. `python demo_mic.py` The demo will record 5 seconds of audio from your device and predict the face mesh.
82 |
83 | We perform unsupervised gender classification based on the mean male and female shapes and compute statistics between the predicted face and each mean shape. We also calculate the distance between the prediction and the four face types (Regular, Slim, Skinny, Wide) and indicate which type the voice is closest to (a minimal sketch of this comparison is given after this list).
84 |
85 | 3. Results will be generated under data/results
86 |
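A minimal sketch of the face-type comparison mentioned in step 2, assuming the meshes extracted under ./face_types are available as (N, 3) vertex arrays; the file names and the plain per-vertex L2 distance are illustrative assumptions, not necessarily the exact statistics demo_mic.py computes.

```python
import numpy as np

def nearest_face_type(pred_vertices, type_dir='face_types'):
    """Return the face type whose mean per-vertex distance to the prediction is smallest."""
    types = ['Regular', 'Slim', 'Skinny', 'Wide']
    dists = {}
    for name in types:
        ref = np.load(f'{type_dir}/{name}.npy')  # assumed (N, 3) vertex array per face type
        dists[name] = float(np.mean(np.linalg.norm(pred_vertices - ref, axis=1)))
    return min(dists, key=dists.get), dists
```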
87 | ## Inference from supervised framework
88 |
89 | 1. Do steps 1-5 in the Demo section. Download the pretrained supervised model [here]. Download the voice data (A-E) for inference [here], the [meta file], and the [groundtruth]. Put the pretrained model under './pretrained_models/supervised_64'. Put the voice data and meta file under './data'. Put the groundtruth under './data' and extract it.
90 |
91 |
92 | 2. Edit config.py Line 6: change to 'pretrained_models/supervised_64'
93 |
94 | 3.
95 | ```
96 | python eval_sup.py
97 | ```
98 |
99 | This will match identities from voice IDs with the available 3D faces reconstructed from VGGFace via the meta file, predict 3D faces only for the matched IDs, and save all mesh .obj files under './data/supervised_output/'. A sketch of the ID-to-name mapping through the meta file is given below.
100 |
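A minimal sketch of mapping VoxCeleb IDs to identity names through the meta file, assuming the first two columns of data/vox1_meta.csv are the ID and the name; verify the delimiter (tab vs. comma) and header of the bundled file before relying on it.

```python
import csv

def load_id_to_name(meta_path='data/vox1_meta.csv'):
    # Assumes column 0 is the voice ID and column 1 the identity name.
    id_to_name = {}
    with open(meta_path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # skip the header row, if present
        for row in reader:
            if len(row) >= 2:
                id_to_name[row[0].strip()] = row[1].strip()
    return id_to_name
```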
101 | ## Evaluation
102 |
103 | 1. Do steps 1-5 in the Demo section. Download the generated and saved meshes for the validation set (names starting from A-E in Voxceleb-3D). From supervised CMP: https://drive.google.com/file/d/1_xobyRM-abjfrvzjbF7uwMVPFPfeKZC9/view?usp=share_link;
104 |
105 | (The same as groundtruth in the supervised inference) Voxceleb-3D validation set: https://drive.google.com/file/d/1NdkqlCPhl-mvPU9TYlPgHE_FaNJjAysf/view?usp=share_link. Put them under './data' and extract.
106 |
107 | The validation set for each identity contains an image (.jpg), mesh (.obj), pointcloud (.xyz), image overlapped with the mesh (_b.jpg), and 3DMM parameters (.npy) (199-dim for shape and 29-dim for expression; this is in contrast to the simplified 3DMM basis with 40-dim shape and 10-dim expression. You can download the full basis from the BFM-2009 official website; otherwise, use the reconstructed meshes we already provide in .obj format).
108 |
109 | 2.
110 | ```
111 | bash cal_size.sh
112 | ```
113 |
114 | This will run and report the ARE metric and keypoint error metrics (a generic sketch of a keypoint error computation is given below).
115 |
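For reference, a generic keypoint-error computation of the kind such scripts typically report (mean per-landmark Euclidean distance on aligned meshes); this is an illustration under assumed inputs, not the exact logic of cal_size_kpts.py or cal_size_ARE.py.

```python
import numpy as np

def mean_keypoint_error(pred_vertices, gt_vertices, landmark_idx):
    """Mean Euclidean distance over landmark vertices.

    pred_vertices, gt_vertices: (N, 3) arrays of aligned mesh vertices.
    landmark_idx: indices of the landmark vertices (e.g. 68 facial keypoints).
    """
    diff = pred_vertices[landmark_idx] - gt_vertices[landmark_idx]
    return float(np.mean(np.linalg.norm(diff, axis=1)))
```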
116 | ## Training
117 |
118 | 1. Train the unsupervised framework
119 |
120 | -- Download 'Voxceleb-3D' data (2), (3), and (4). They are the validation set, training images, and training voice banks. Extract them and put them under './data'
121 |
122 | -- Download a much smaller set [here] for fast online validation
123 |
124 | -- `python gan_train_cascade.py`
125 |
126 | ## Citation
127 | If you find our work useful, please consider citing us.
128 |
129 | @inproceedings{wu2022cross,
130 | title={Cross-Modal Perceptionist: Can Face Geometry be Gleaned from Voices?},
131 | author={Wu, Cho-Ying and Hsu, Chin-Cheng and Neumann, Ulrich},
132 | booktitle={CVPR},
133 | year={2022}
134 | }
135 |
136 |
137 | This project is developed based on [SynergyNet], [3DDFA-V2], and [reconstruction-faces-from-voice].
138 |
--------------------------------------------------------------------------------
/Sim3DR/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | cmake-build-debug/
3 | .idea/
4 | build/
5 | *.so
6 | data/
7 |
8 | lib/rasterize.cpp
--------------------------------------------------------------------------------
/Sim3DR/Sim3DR.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from . import _init_paths
4 | import numpy as np
5 | import Sim3DR_Cython
6 |
7 |
8 | def get_normal(vertices, triangles):
9 | normal = np.zeros_like(vertices, dtype=np.float32)
10 | Sim3DR_Cython.get_normal(normal, vertices, triangles, vertices.shape[0], triangles.shape[0])
11 | return normal
12 |
13 |
14 | def rasterize(vertices, triangles, colors, bg=None,
15 | height=None, width=None, channel=None,
16 | reverse=False):
17 | if bg is not None:
18 | height, width, channel = bg.shape
19 | else:
20 | assert height is not None and width is not None and channel is not None
21 | bg = np.zeros((height, width, channel), dtype=np.float32)
22 |
23 | buffer = np.zeros((height, width), dtype=np.float32) - 1e8
24 |
25 | if colors.dtype != np.float32:
26 | colors = colors.astype(np.float32)
27 | Sim3DR_Cython.rasterize(bg, vertices, triangles, colors, buffer, triangles.shape[0], height, width, channel,
28 | reverse=reverse)
29 | return bg
30 |
--------------------------------------------------------------------------------
/Sim3DR/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from .Sim3DR import get_normal, rasterize
4 | from .lighting import RenderPipeline
5 |
--------------------------------------------------------------------------------
/Sim3DR/_init_paths.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import os.path as osp
4 | import sys
5 |
6 |
7 | def add_path(path):
8 | if path not in sys.path:
9 | sys.path.insert(0, path)
10 |
11 |
12 | this_dir = osp.dirname(__file__)
13 | lib_path = osp.join(this_dir, '.')
14 | add_path(lib_path)
15 |
--------------------------------------------------------------------------------
/Sim3DR/build_sim3dr.sh:
--------------------------------------------------------------------------------
1 | python3 setup.py build_ext --inplace
--------------------------------------------------------------------------------
/Sim3DR/lib/rasterize.h:
--------------------------------------------------------------------------------
1 | #ifndef MESH_CORE_HPP_
2 | #define MESH_CORE_HPP_
3 |
4 | #include <stdio.h>
5 | #include <cmath>
6 | #include <algorithm>
7 | #include <string>
8 | #include <iostream>
9 | #include <fstream>
10 |
11 | using namespace std;
12 |
13 | class Point3D {
14 | public:
15 | float x;
16 | float y;
17 | float z;
18 |
19 | public:
20 | Point3D() : x(0.f), y(0.f), z(0.f) {}
21 | Point3D(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
22 |
23 | void initialize(float x_, float y_, float z_){
24 | this->x = x_; this->y = y_; this->z = z_;
25 | }
26 |
27 | Point3D cross(Point3D &p){
28 | Point3D c;
29 | c.x = this->y * p.z - this->z * p.y;
30 | c.y = this->z * p.x - this->x * p.z;
31 | c.z = this->x * p.y - this->y * p.x;
32 | return c;
33 | }
34 |
35 | float dot(Point3D &p) {
36 | return this->x * p.x + this->y * p.y + this->z * p.z;
37 | }
38 |
39 | Point3D operator-(const Point3D &p) {
40 | Point3D np;
41 | np.x = this->x - p.x;
42 | np.y = this->y - p.y;
43 | np.z = this->z - p.z;
44 | return np;
45 | }
46 |
47 | };
48 |
49 | class Point {
50 | public:
51 | float x;
52 | float y;
53 |
54 | public:
55 | Point() : x(0.f), y(0.f) {}
56 | Point(float x_, float y_) : x(x_), y(y_) {}
57 | float dot(Point p) {
58 | return this->x * p.x + this->y * p.y;
59 | }
60 |
61 | Point operator-(const Point &p) {
62 | Point np;
63 | np.x = this->x - p.x;
64 | np.y = this->y - p.y;
65 | return np;
66 | }
67 |
68 | Point operator+(const Point &p) {
69 | Point np;
70 | np.x = this->x + p.x;
71 | np.y = this->y + p.y;
72 | return np;
73 | }
74 |
75 | Point operator*(float s) {
76 | Point np;
77 | np.x = s * this->x;
78 | np.y = s * this->y;
79 | return np;
80 | }
81 | };
82 |
83 |
84 | bool is_point_in_tri(Point p, Point p0, Point p1, Point p2);
85 |
86 | void get_point_weight(float *weight, Point p, Point p0, Point p1, Point p2);
87 |
88 | void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int ntri, bool norm_flg);
89 |
90 | void _get_ver_normal(float *ver_normal, float *tri_normal, int *triangles, int nver, int ntri);
91 |
92 | void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri);
93 |
94 | void _rasterize_triangles(
95 | float *vertices, int *triangles, float *depth_buffer, int *triangle_buffer, float *barycentric_weight,
96 | int ntri, int h, int w);
97 |
98 | void _rasterize(
99 | unsigned char *image, float *vertices, int *triangles, float *colors,
100 | float *depth_buffer, int ntri, int h, int w, int c, float alpha, bool reverse);
101 |
102 | void _render_texture_core(
103 | float *image, float *vertices, int *triangles,
104 | float *texture, float *tex_coords, int *tex_triangles,
105 | float *depth_buffer,
106 | int nver, int tex_nver, int ntri,
107 | int h, int w, int c,
108 | int tex_h, int tex_w, int tex_c,
109 | int mapping_type);
110 |
111 | void _write_obj_with_colors_texture(string filename, string mtl_name,
112 | float *vertices, int *triangles, float *colors, float *uv_coords,
113 | int nver, int ntri, int ntexver);
114 |
115 | #endif
116 |
--------------------------------------------------------------------------------
/Sim3DR/lib/rasterize.pyx:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | cimport numpy as np
3 | # from libcpp.string cimport string
4 | cimport cython
5 | from libcpp cimport bool
6 |
7 | # from cpython import bool
8 |
9 | # use the Numpy-C-API from Cython
10 | np.import_array()
11 |
12 | # cdefine the signature of our c function
13 | cdef extern from "rasterize.h":
14 | void _rasterize_triangles(
15 | float*vertices, int*triangles, float*depth_buffer, int*triangle_buffer, float*barycentric_weight,
16 | int ntri, int h, int w
17 | )
18 |
19 | void _rasterize(
20 | unsigned char*image, float*vertices, int*triangles, float*colors, float*depth_buffer,
21 | int ntri, int h, int w, int c, float alpha, bool reverse
22 | )
23 |
24 | # void _render_texture_core(
25 | # float* image, float* vertices, int* triangles,
26 | # float* texture, float* tex_coords, int* tex_triangles,
27 | # float* depth_buffer,
28 | # int nver, int tex_nver, int ntri,
29 | # int h, int w, int c,
30 | # int tex_h, int tex_w, int tex_c,
31 | # int mapping_type)
32 |
33 | void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int nver, bool norm_flg)
34 | void _get_ver_normal(float *ver_normal, float*tri_normal, int*triangles, int nver, int ntri)
35 | void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri)
36 |
37 |
38 | # void _write_obj_with_colors_texture(string filename, string mtl_name,
39 | # float* vertices, int* triangles, float* colors, float* uv_coords,
40 | # int nver, int ntri, int ntexver)
41 |
42 | @cython.boundscheck(False)
43 | @cython.wraparound(False)
44 | def get_tri_normal(np.ndarray[float, ndim=2, mode="c"] tri_normal not None,
45 | np.ndarray[float, ndim=2, mode = "c"] vertices not None,
46 | np.ndarray[int, ndim=2, mode="c"] triangles not None,
47 | int ntri, bool norm_flg = False):
48 | _get_tri_normal(<float*> np.PyArray_DATA(tri_normal), <float*> np.PyArray_DATA(vertices),
49 | <int*> np.PyArray_DATA(triangles), ntri, norm_flg)
50 |
51 | @cython.boundscheck(False) # turn off bounds-checking for entire function
52 | @cython.wraparound(False) # turn off negative index wrapping for entire function
53 | def get_ver_normal(np.ndarray[float, ndim=2, mode = "c"] ver_normal not None,
54 | np.ndarray[float, ndim=2, mode = "c"] tri_normal not None,
55 | np.ndarray[int, ndim=2, mode="c"] triangles not None,
56 | int nver, int ntri):
57 | _get_ver_normal(
58 | <float*> np.PyArray_DATA(ver_normal), <float*> np.PyArray_DATA(tri_normal), <int*> np.PyArray_DATA(triangles),
59 | nver, ntri)
60 |
61 | @cython.boundscheck(False) # turn off bounds-checking for entire function
62 | @cython.wraparound(False) # turn off negative index wrapping for entire function
63 | def get_normal(np.ndarray[float, ndim=2, mode = "c"] ver_normal not None,
64 | np.ndarray[float, ndim=2, mode = "c"] vertices not None,
65 | np.ndarray[int, ndim=2, mode="c"] triangles not None,
66 | int nver, int ntri):
67 | _get_normal(
68 | <float*> np.PyArray_DATA(ver_normal), <float*> np.PyArray_DATA(vertices), <int*> np.PyArray_DATA(triangles),
69 | nver, ntri)
70 |
71 |
72 | @cython.boundscheck(False) # turn off bounds-checking for entire function
73 | @cython.wraparound(False) # turn off negative index wrapping for entire function
74 | def rasterize_triangles(
75 | np.ndarray[float, ndim=2, mode = "c"] vertices not None,
76 | np.ndarray[int, ndim=2, mode="c"] triangles not None,
77 | np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None,
78 | np.ndarray[int, ndim=2, mode = "c"] triangle_buffer not None,
79 | np.ndarray[float, ndim=2, mode = "c"] barycentric_weight not None,
80 | int ntri, int h, int w
81 | ):
82 | _rasterize_triangles(
83 | <float*> np.PyArray_DATA(vertices), <int*> np.PyArray_DATA(triangles),
84 | <float*> np.PyArray_DATA(depth_buffer), <int*> np.PyArray_DATA(triangle_buffer),
85 | <float*> np.PyArray_DATA(barycentric_weight),
86 | ntri, h, w)
87 |
88 | @cython.boundscheck(False) # turn off bounds-checking for entire function
89 | @cython.wraparound(False) # turn off negative index wrapping for entire function
90 | def rasterize(np.ndarray[unsigned char, ndim=3, mode = "c"] image not None,
91 | np.ndarray[float, ndim=2, mode = "c"] vertices not None,
92 | np.ndarray[int, ndim=2, mode="c"] triangles not None,
93 | np.ndarray[float, ndim=2, mode = "c"] colors not None,
94 | np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None,
95 | int ntri, int h, int w, int c, float alpha = 1, bool reverse = False
96 | ):
97 | _rasterize(
98 | <unsigned char*> np.PyArray_DATA(image), <float*> np.PyArray_DATA(vertices),
99 | <int*> np.PyArray_DATA(triangles),
100 | <float*> np.PyArray_DATA(colors),
101 | <float*> np.PyArray_DATA(depth_buffer),
102 | ntri, h, w, c, alpha, reverse)
103 |
104 | # def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None,
105 | # np.ndarray[float, ndim=2, mode = "c"] vertices not None,
106 | # np.ndarray[int, ndim=2, mode="c"] triangles not None,
107 | # np.ndarray[float, ndim=3, mode = "c"] texture not None,
108 | # np.ndarray[float, ndim=2, mode = "c"] tex_coords not None,
109 | # np.ndarray[int, ndim=2, mode="c"] tex_triangles not None,
110 | # np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None,
111 | # int nver, int tex_nver, int ntri,
112 | # int h, int w, int c,
113 | # int tex_h, int tex_w, int tex_c,
114 | # int mapping_type
115 | # ):
116 | # _render_texture_core(
117 | # np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles),
118 | # np.PyArray_DATA(texture), np.PyArray_DATA(tex_coords), np.PyArray_DATA(tex_triangles),
119 | # np.PyArray_DATA(depth_buffer),
120 | # nver, tex_nver, ntri,
121 | # h, w, c,
122 | # tex_h, tex_w, tex_c,
123 | # mapping_type)
124 | #
125 | # def write_obj_with_colors_texture_core(string filename, string mtl_name,
126 | # np.ndarray[float, ndim=2, mode = "c"] vertices not None,
127 | # np.ndarray[int, ndim=2, mode="c"] triangles not None,
128 | # np.ndarray[float, ndim=2, mode = "c"] colors not None,
129 | # np.ndarray[float, ndim=2, mode = "c"] uv_coords not None,
130 | # int nver, int ntri, int ntexver
131 | # ):
132 | # _write_obj_with_colors_texture(filename, mtl_name,
133 | # np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), np.PyArray_DATA(colors), np.PyArray_DATA(uv_coords),
134 | # nver, ntri, ntexver)
135 |
--------------------------------------------------------------------------------
/Sim3DR/lib/rasterize_kernel.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | /*
4 | Author: Yao Feng
5 | Modified by Jianzhu Guo
6 | Functions that cannot be optimized by vectorization in Python:
7 | 1. rasterization (needs to process each triangle)
8 | 2. normal of each vertex (uses the one-ring; needs to process each vertex)
9 | 3. write obj (it seems this could be vectorized? anyway, writing it in C++ is simple, so the function is also added here. --> however, why is writing it in C++ still slow?)
10 | */
11 |
12 | #include "rasterize.h"
13 |
14 |
15 | /* Judge whether the Point is in the triangle
16 | Method:
17 | http://blackpawn.com/texts/pointinpoly/
18 | Args:
19 | Point: [x, y]
20 | tri_points: three vertices(2d points) of a triangle. 2 coords x 3 vertices
21 | Returns:
22 | bool: true for in triangle
23 | */
24 | bool is_point_in_tri(Point p, Point p0, Point p1, Point p2) {
25 | // vectors
26 | Point v0, v1, v2;
27 | v0 = p2 - p0;
28 | v1 = p1 - p0;
29 | v2 = p - p0;
30 |
31 | // dot products
32 | float dot00 = v0.dot(v0); //v0.x * v0.x + v0.y * v0.y //np.dot(v0.T, v0)
33 | float dot01 = v0.dot(v1); //v0.x * v1.x + v0.y * v1.y //np.dot(v0.T, v1)
34 | float dot02 = v0.dot(v2); //v0.x * v2.x + v0.y * v2.y //np.dot(v0.T, v2)
35 | float dot11 = v1.dot(v1); //v1.x * v1.x + v1.y * v1.y //np.dot(v1.T, v1)
36 | float dot12 = v1.dot(v2); //v1.x * v2.x + v1.y * v2.y//np.dot(v1.T, v2)
37 |
38 | // barycentric coordinates
39 | float inverDeno;
40 | if (dot00 * dot11 - dot01 * dot01 == 0)
41 | inverDeno = 0;
42 | else
43 | inverDeno = 1 / (dot00 * dot11 - dot01 * dot01);
44 |
45 | float u = (dot11 * dot02 - dot01 * dot12) * inverDeno;
46 | float v = (dot00 * dot12 - dot01 * dot02) * inverDeno;
47 |
48 | // check if Point in triangle
49 | return (u >= 0) && (v >= 0) && (u + v < 1);
50 | }
51 |
52 | void get_point_weight(float *weight, Point p, Point p0, Point p1, Point p2) {
53 | // vectors
54 | Point v0, v1, v2;
55 | v0 = p2 - p0;
56 | v1 = p1 - p0;
57 | v2 = p - p0;
58 |
59 | // dot products
60 | float dot00 = v0.dot(v0); //v0.x * v0.x + v0.y * v0.y //np.dot(v0.T, v0)
61 | float dot01 = v0.dot(v1); //v0.x * v1.x + v0.y * v1.y //np.dot(v0.T, v1)
62 | float dot02 = v0.dot(v2); //v0.x * v2.x + v0.y * v2.y //np.dot(v0.T, v2)
63 | float dot11 = v1.dot(v1); //v1.x * v1.x + v1.y * v1.y //np.dot(v1.T, v1)
64 | float dot12 = v1.dot(v2); //v1.x * v2.x + v1.y * v2.y//np.dot(v1.T, v2)
65 |
66 | // barycentric coordinates
67 | float inverDeno;
68 | if (dot00 * dot11 - dot01 * dot01 == 0)
69 | inverDeno = 0;
70 | else
71 | inverDeno = 1 / (dot00 * dot11 - dot01 * dot01);
72 |
73 | float u = (dot11 * dot02 - dot01 * dot12) * inverDeno;
74 | float v = (dot00 * dot12 - dot01 * dot02) * inverDeno;
75 |
76 | // weight
77 | weight[0] = 1 - u - v;
78 | weight[1] = v;
79 | weight[2] = u;
80 | }
81 |
82 | /*
83 | * Get normals of triangles.
84 | */
85 | void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int ntri, bool norm_flg) {
86 | int tri_p0_ind, tri_p1_ind, tri_p2_ind;
87 | float v1x, v1y, v1z, v2x, v2y, v2z;
88 |
89 | for (int i = 0; i < ntri; i++) {
90 | tri_p0_ind = triangles[3 * i];
91 | tri_p1_ind = triangles[3 * i + 1];
92 | tri_p2_ind = triangles[3 * i + 2];
93 |
94 | // counter clockwise order
95 | v1x = vertices[3 * tri_p1_ind] - vertices[3 * tri_p0_ind];
96 | v1y = vertices[3 * tri_p1_ind + 1] - vertices[3 * tri_p0_ind + 1];
97 | v1z = vertices[3 * tri_p1_ind + 2] - vertices[3 * tri_p0_ind + 2];
98 |
99 | v2x = vertices[3 * tri_p2_ind] - vertices[3 * tri_p0_ind];
100 | v2y = vertices[3 * tri_p2_ind + 1] - vertices[3 * tri_p0_ind + 1];
101 | v2z = vertices[3 * tri_p2_ind + 2] - vertices[3 * tri_p0_ind + 2];
102 |
103 | if (norm_flg) {
104 | float c1 = v1y * v2z - v1z * v2y;
105 | float c2 = v1z * v2x - v1x * v2z;
106 | float c3 = v1x * v2y - v1y * v2x;
107 | float det = sqrt(c1 * c1 + c2 * c2 + c3 * c3);
108 | if (det <= 0) det = 1e-6;
109 | tri_normal[3 * i] = c1 / det;
110 | tri_normal[3 * i + 1] = c2 / det;
111 | tri_normal[3 * i + 2] = c3 / det;
112 | } else {
113 | tri_normal[3 * i] = v1y * v2z - v1z * v2y;
114 | tri_normal[3 * i + 1] = v1z * v2x - v1x * v2z;
115 | tri_normal[3 * i + 2] = v1x * v2y - v1y * v2x;
116 | }
117 | }
118 | }
119 |
120 | /*
121 | * Get normal vector of vertices using triangle normals
122 | */
123 | void _get_ver_normal(float *ver_normal, float *tri_normal, int *triangles, int nver, int ntri) {
124 | int tri_p0_ind, tri_p1_ind, tri_p2_ind;
125 |
126 | for (int i = 0; i < ntri; i++) {
127 | tri_p0_ind = triangles[3 * i];
128 | tri_p1_ind = triangles[3 * i + 1];
129 | tri_p2_ind = triangles[3 * i + 2];
130 |
131 | for (int j = 0; j < 3; j++) {
132 | ver_normal[3 * tri_p0_ind + j] += tri_normal[3 * i + j];
133 | ver_normal[3 * tri_p1_ind + j] += tri_normal[3 * i + j];
134 | ver_normal[3 * tri_p2_ind + j] += tri_normal[3 * i + j];
135 | }
136 | }
137 |
138 | // normalizing
139 | float nx, ny, nz, det;
140 | for (int i = 0; i < nver; ++i) {
141 | nx = ver_normal[3 * i];
142 | ny = ver_normal[3 * i + 1];
143 | nz = ver_normal[3 * i + 2];
144 |
145 | det = sqrt(nx * nx + ny * ny + nz * nz);
146 | if (det <= 0) det = 1e-6;
147 | ver_normal[3 * i] = nx / det;
148 | ver_normal[3 * i + 1] = ny / det;
149 | ver_normal[3 * i + 2] = nz / det;
150 | }
151 | }
152 |
153 | /*
154 | * Directly get normal of vertices, which can be regraded as a combination of _get_tri_normal and _get_ver_normal
155 | */
156 | void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri) {
157 | int tri_p0_ind, tri_p1_ind, tri_p2_ind;
158 | float v1x, v1y, v1z, v2x, v2y, v2z;
159 |
160 | // get tri_normal
161 | // float tri_normal[3 * ntri];
162 | float* tri_normal;
163 | tri_normal = new float [3 * ntri];
164 | for (int i = 0; i < ntri; i++) {
165 | tri_p0_ind = triangles[3 * i];
166 | tri_p1_ind = triangles[3 * i + 1];
167 | tri_p2_ind = triangles[3 * i + 2];
168 |
169 | // counter clockwise order
170 | v1x = vertices[3 * tri_p1_ind] - vertices[3 * tri_p0_ind];
171 | v1y = vertices[3 * tri_p1_ind + 1] - vertices[3 * tri_p0_ind + 1];
172 | v1z = vertices[3 * tri_p1_ind + 2] - vertices[3 * tri_p0_ind + 2];
173 |
174 | v2x = vertices[3 * tri_p2_ind] - vertices[3 * tri_p0_ind];
175 | v2y = vertices[3 * tri_p2_ind + 1] - vertices[3 * tri_p0_ind + 1];
176 | v2z = vertices[3 * tri_p2_ind + 2] - vertices[3 * tri_p0_ind + 2];
177 |
178 |
179 | tri_normal[3 * i] = v1y * v2z - v1z * v2y;
180 | tri_normal[3 * i + 1] = v1z * v2x - v1x * v2z;
181 | tri_normal[3 * i + 2] = v1x * v2y - v1y * v2x;
182 |
183 | }
184 |
185 | // get ver_normal
186 | for (int i = 0; i < ntri; i++) {
187 | tri_p0_ind = triangles[3 * i];
188 | tri_p1_ind = triangles[3 * i + 1];
189 | tri_p2_ind = triangles[3 * i + 2];
190 |
191 | for (int j = 0; j < 3; j++) {
192 | ver_normal[3 * tri_p0_ind + j] += tri_normal[3 * i + j];
193 | ver_normal[3 * tri_p1_ind + j] += tri_normal[3 * i + j];
194 | ver_normal[3 * tri_p2_ind + j] += tri_normal[3 * i + j];
195 | }
196 | }
197 |
198 | // normalizing
199 | float nx, ny, nz, det;
200 | for (int i = 0; i < nver; ++i) {
201 | nx = ver_normal[3 * i];
202 | ny = ver_normal[3 * i + 1];
203 | nz = ver_normal[3 * i + 2];
204 |
205 | det = sqrt(nx * nx + ny * ny + nz * nz);
206 | // if (det <= 0) det = 1e-6;
207 | ver_normal[3 * i] = nx / det;
208 | ver_normal[3 * i + 1] = ny / det;
209 | ver_normal[3 * i + 2] = nz / det;
210 | }
211 |
212 | delete[] tri_normal;
213 | }
214 |
215 | // rasterization by Z-Buffer with optimization
216 | // Complexity: < ntri * h * w * c
217 | void _rasterize(
218 | unsigned char *image, float *vertices, int *triangles, float *colors, float *depth_buffer,
219 | int ntri, int h, int w, int c, float alpha, bool reverse) {
220 | int x, y, k;
221 | int tri_p0_ind, tri_p1_ind, tri_p2_ind;
222 | Point p0, p1, p2, p;
223 | int x_min, x_max, y_min, y_max;
224 | float p_depth, p0_depth, p1_depth, p2_depth;
225 | float p_color, p0_color, p1_color, p2_color;
226 | float weight[3];
227 |
228 | for (int i = 0; i < ntri; i++) {
229 | tri_p0_ind = triangles[3 * i];
230 | tri_p1_ind = triangles[3 * i + 1];
231 | tri_p2_ind = triangles[3 * i + 2];
232 |
233 | p0.x = vertices[3 * tri_p0_ind];
234 | p0.y = vertices[3 * tri_p0_ind + 1];
235 | p0_depth = vertices[3 * tri_p0_ind + 2];
236 | p1.x = vertices[3 * tri_p1_ind];
237 | p1.y = vertices[3 * tri_p1_ind + 1];
238 | p1_depth = vertices[3 * tri_p1_ind + 2];
239 | p2.x = vertices[3 * tri_p2_ind];
240 | p2.y = vertices[3 * tri_p2_ind + 1];
241 | p2_depth = vertices[3 * tri_p2_ind + 2];
242 |
243 | x_min = max((int) floor(min(p0.x, min(p1.x, p2.x))), 0);
244 | x_max = min((int) ceil(max(p0.x, max(p1.x, p2.x))), w - 1);
245 |
246 | y_min = max((int) floor(min(p0.y, min(p1.y, p2.y))), 0);
247 | y_max = min((int) ceil(max(p0.y, max(p1.y, p2.y))), h - 1);
248 |
249 | if (x_max < x_min || y_max < y_min) {
250 | continue;
251 | }
252 |
253 | for (y = y_min; y <= y_max; y++) {
254 | for (x = x_min; x <= x_max; x++) {
255 | p.x = x;
256 | p.y = y;
257 | if (is_point_in_tri(p, p0, p1, p2)) {
258 | get_point_weight(weight, p, p0, p1, p2);
259 | p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth;
260 |
261 | if ((p_depth > depth_buffer[y * w + x])) {
262 | for (k = 0; k < c; k++) {
263 | p0_color = colors[c * tri_p0_ind + k];
264 | p1_color = colors[c * tri_p1_ind + k];
265 | p2_color = colors[c * tri_p2_ind + k];
266 |
267 | p_color = weight[0] * p0_color + weight[1] * p1_color + weight[2] * p2_color;
268 | if (reverse) {
269 | image[(h - 1 - y) * w * c + x * c + k] = (unsigned char) (
270 | (1 - alpha) * image[(h - 1 - y) * w * c + x * c + k] + alpha * 255 * p_color);
271 | // image[(h - 1 - y) * w * c + x * c + k] = (unsigned char) (255 * p_color);
272 | } else {
273 | image[y * w * c + x * c + k] = (unsigned char) (
274 | (1 - alpha) * image[y * w * c + x * c + k] + alpha * 255 * p_color);
275 | // image[y * w * c + x * c + k] = (unsigned char) (255 * p_color);
276 | }
277 | }
278 |
279 | depth_buffer[y * w + x] = p_depth;
280 | }
281 | }
282 | }
283 | }
284 | }
285 | }
286 |
287 |
288 | void _rasterize_triangles(
289 | float *vertices, int *triangles, float *depth_buffer, int *triangle_buffer, float *barycentric_weight,
290 | int ntri, int h, int w) {
291 | int i;
292 | int x, y, k;
293 | int tri_p0_ind, tri_p1_ind, tri_p2_ind;
294 | Point p0, p1, p2, p;
295 | int x_min, x_max, y_min, y_max;
296 | float p_depth, p0_depth, p1_depth, p2_depth;
297 | float weight[3];
298 |
299 | for (i = 0; i < ntri; i++) {
300 | tri_p0_ind = triangles[3 * i];
301 | tri_p1_ind = triangles[3 * i + 1];
302 | tri_p2_ind = triangles[3 * i + 2];
303 |
304 | p0.x = vertices[3 * tri_p0_ind];
305 | p0.y = vertices[3 * tri_p0_ind + 1];
306 | p0_depth = vertices[3 * tri_p0_ind + 2];
307 | p1.x = vertices[3 * tri_p1_ind];
308 | p1.y = vertices[3 * tri_p1_ind + 1];
309 | p1_depth = vertices[3 * tri_p1_ind + 2];
310 | p2.x = vertices[3 * tri_p2_ind];
311 | p2.y = vertices[3 * tri_p2_ind + 1];
312 | p2_depth = vertices[3 * tri_p2_ind + 2];
313 |
314 | x_min = max((int) ceil(min(p0.x, min(p1.x, p2.x))), 0);
315 | x_max = min((int) floor(max(p0.x, max(p1.x, p2.x))), w - 1);
316 |
317 | y_min = max((int) ceil(min(p0.y, min(p1.y, p2.y))), 0);
318 | y_max = min((int) floor(max(p0.y, max(p1.y, p2.y))), h - 1);
319 |
320 | if (x_max < x_min || y_max < y_min) {
321 | continue;
322 | }
323 |
324 | for (y = y_min; y <= y_max; y++) //h
325 | {
326 | for (x = x_min; x <= x_max; x++) //w
327 | {
328 | p.x = x;
329 | p.y = y;
330 | // if (p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || is_point_in_tri(p, p0, p1, p2)) {
331 | if (is_point_in_tri(p, p0, p1, p2)) {
332 | get_point_weight(weight, p, p0, p1, p2);
333 | p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth;
334 |
335 | if ((p_depth > depth_buffer[y * w + x])) {
336 | depth_buffer[y * w + x] = p_depth;
337 | triangle_buffer[y * w + x] = i;
338 | for (k = 0; k < 3; k++) {
339 | barycentric_weight[y * w * 3 + x * 3 + k] = weight[k];
340 | }
341 | }
342 | }
343 | }
344 | }
345 | }
346 | }
347 |
348 |
349 | // Depth-Buffer algorithm
350 | // https://blog.csdn.net/Jurbo/article/details/75007260
351 | void _render_texture_core(
352 | float *image, float *vertices, int *triangles,
353 | float *texture, float *tex_coords, int *tex_triangles,
354 | float *depth_buffer,
355 | int nver, int tex_nver, int ntri,
356 | int h, int w, int c,
357 | int tex_h, int tex_w, int tex_c,
358 | int mapping_type) {
359 | int i;
360 | int x, y, k;
361 | int tri_p0_ind, tri_p1_ind, tri_p2_ind;
362 | int tex_tri_p0_ind, tex_tri_p1_ind, tex_tri_p2_ind;
363 | Point p0, p1, p2, p;
364 | Point tex_p0, tex_p1, tex_p2, tex_p;
365 | int x_min, x_max, y_min, y_max;
366 | float weight[3];
367 | float p_depth, p0_depth, p1_depth, p2_depth;
368 | float xd, yd;
369 | float ul, ur, dl, dr;
370 | for (i = 0; i < ntri; i++) {
371 | // mesh
372 | tri_p0_ind = triangles[3 * i];
373 | tri_p1_ind = triangles[3 * i + 1];
374 | tri_p2_ind = triangles[3 * i + 2];
375 |
376 | p0.x = vertices[3 * tri_p0_ind];
377 | p0.y = vertices[3 * tri_p0_ind + 1];
378 | p0_depth = vertices[3 * tri_p0_ind + 2];
379 | p1.x = vertices[3 * tri_p1_ind];
380 | p1.y = vertices[3 * tri_p1_ind + 1];
381 | p1_depth = vertices[3 * tri_p1_ind + 2];
382 | p2.x = vertices[3 * tri_p2_ind];
383 | p2.y = vertices[3 * tri_p2_ind + 1];
384 | p2_depth = vertices[3 * tri_p2_ind + 2];
385 |
386 | // texture
387 | tex_tri_p0_ind = tex_triangles[3 * i];
388 | tex_tri_p1_ind = tex_triangles[3 * i + 1];
389 | tex_tri_p2_ind = tex_triangles[3 * i + 2];
390 |
391 | tex_p0.x = tex_coords[3 * tex_tri_p0_ind];
392 | tex_p0.y = tex_coords[3 * tri_p0_ind + 1];
393 | tex_p1.x = tex_coords[3 * tex_tri_p1_ind];
394 | tex_p1.y = tex_coords[3 * tri_p1_ind + 1];
395 | tex_p2.x = tex_coords[3 * tex_tri_p2_ind];
396 | tex_p2.y = tex_coords[3 * tri_p2_ind + 1];
397 |
398 |
399 | x_min = max((int) ceil(min(p0.x, min(p1.x, p2.x))), 0);
400 | x_max = min((int) floor(max(p0.x, max(p1.x, p2.x))), w - 1);
401 |
402 | y_min = max((int) ceil(min(p0.y, min(p1.y, p2.y))), 0);
403 | y_max = min((int) floor(max(p0.y, max(p1.y, p2.y))), h - 1);
404 |
405 |
406 | if (x_max < x_min || y_max < y_min) {
407 | continue;
408 | }
409 |
410 | for (y = y_min; y <= y_max; y++) //h
411 | {
412 | for (x = x_min; x <= x_max; x++) //w
413 | {
414 | p.x = x;
415 | p.y = y;
416 | if (p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || is_point_in_tri(p, p0, p1, p2)) {
417 | get_point_weight(weight, p, p0, p1, p2);
418 | p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth;
419 |
420 | if ((p_depth > depth_buffer[y * w + x])) {
421 | // -- color from texture
422 | // cal weight in mesh tri
423 | get_point_weight(weight, p, p0, p1, p2);
424 | // cal coord in texture
425 | tex_p = tex_p0 * weight[0] + tex_p1 * weight[1] + tex_p2 * weight[2];
426 | tex_p.x = max(min(tex_p.x, float(tex_w - 1)), float(0));
427 | tex_p.y = max(min(tex_p.y, float(tex_h - 1)), float(0));
428 |
429 | yd = tex_p.y - floor(tex_p.y);
430 | xd = tex_p.x - floor(tex_p.x);
431 | for (k = 0; k < c; k++) {
432 | if (mapping_type == 0)// nearest
433 | {
434 | image[y * w * c + x * c + k] = texture[int(round(tex_p.y)) * tex_w * tex_c +
435 | int(round(tex_p.x)) * tex_c + k];
436 | } else//bilinear interp
437 | {
438 | ul = texture[(int) floor(tex_p.y) * tex_w * tex_c + (int) floor(tex_p.x) * tex_c + k];
439 | ur = texture[(int) floor(tex_p.y) * tex_w * tex_c + (int) ceil(tex_p.x) * tex_c + k];
440 | dl = texture[(int) ceil(tex_p.y) * tex_w * tex_c + (int) floor(tex_p.x) * tex_c + k];
441 | dr = texture[(int) ceil(tex_p.y) * tex_w * tex_c + (int) ceil(tex_p.x) * tex_c + k];
442 |
443 | image[y * w * c + x * c + k] =
444 | ul * (1 - xd) * (1 - yd) + ur * xd * (1 - yd) + dl * (1 - xd) * yd +
445 | dr * xd * yd;
446 | }
447 |
448 | }
449 |
450 | depth_buffer[y * w + x] = p_depth;
451 | }
452 | }
453 | }
454 | }
455 | }
456 | }
457 |
458 |
459 | // ------------------------------------------------- write
460 | // obj write
461 | // Ref: https://github.com/patrikhuber/eos/blob/master/include/eos/core/Mesh.hpp
462 | void _write_obj_with_colors_texture(string filename, string mtl_name,
463 | float *vertices, int *triangles, float *colors, float *uv_coords,
464 | int nver, int ntri, int ntexver) {
465 | int i;
466 |
467 | ofstream obj_file(filename);
468 |
469 | // first line of the obj file: the mtl name
470 | obj_file << "mtllib " << mtl_name << endl;
471 |
472 | // write vertices
473 | for (i = 0; i < nver; ++i) {
474 | obj_file << "v " << vertices[3 * i] << " " << vertices[3 * i + 1] << " " << vertices[3 * i + 2] << " " << colors[3 * i]
475 | << " " << colors[3 * i + 1] << " " << colors[3 * i + 2] << endl;
476 | }
477 |
478 | // write uv coordinates
479 | for (i = 0; i < ntexver; ++i) {
480 | //obj_file << "vt " << uv_coords[2*i] << " " << (1 - uv_coords[2*i + 1]) << endl;
481 | obj_file << "vt " << uv_coords[2 * i] << " " << uv_coords[2 * i + 1] << endl;
482 | }
483 |
484 | obj_file << "usemtl FaceTexture" << endl;
485 | // write triangles
486 | for (i = 0; i < ntri; ++i) {
487 | // obj_file << "f " << triangles[3*i] << "/" << triangles[3*i] << " " << triangles[3*i + 1] << "/" << triangles[3*i + 1] << " " << triangles[3*i + 2] << "/" << triangles[3*i + 2] << endl;
488 | obj_file << "f " << triangles[3 * i + 2] << "/" << triangles[3 * i + 2] << " " << triangles[3 * i + 1] << "/"
489 | << triangles[3 * i + 1] << " " << triangles[3 * i] << "/" << triangles[3 * i] << endl;
490 | }
491 |
492 | }
493 |
--------------------------------------------------------------------------------
/Sim3DR/lighting.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import numpy as np
4 | from .Sim3DR import get_normal, rasterize
5 |
6 | _norm = lambda arr: arr / np.sqrt(np.sum(arr ** 2, axis=1))[:, None]
7 |
8 |
9 | def norm_vertices(vertices):
10 | vertices -= vertices.min(0)[None, :]
11 | vertices /= vertices.max()
12 | vertices *= 2
13 | vertices -= vertices.max(0)[None, :] / 2
14 | return vertices
15 |
16 |
17 | def convert_type(obj):
18 | if isinstance(obj, tuple) or isinstance(obj, list):
19 | return np.array(obj, dtype=np.float32)[None, :]
20 | return obj
21 |
22 |
23 | class RenderPipeline(object):
24 | def __init__(self, **kwargs):
25 | self.intensity_ambient = convert_type(kwargs.get('intensity_ambient', 0.3))
26 | self.intensity_directional = convert_type(kwargs.get('intensity_directional', 0.6))
27 | self.intensity_specular = convert_type(kwargs.get('intensity_specular', 0.1))
28 | self.specular_exp = kwargs.get('specular_exp', 5)
29 | self.color_ambient = convert_type(kwargs.get('color_ambient', (1, 1, 1)))
30 | self.color_directional = convert_type(kwargs.get('color_directional', (1, 1, 1)))
31 | self.light_pos = convert_type(kwargs.get('light_pos', (0, 0, 5)))
32 | self.view_pos = convert_type(kwargs.get('view_pos', (0, 0, 5)))
33 |
34 | def update_light_pos(self, light_pos):
35 | self.light_pos = convert_type(light_pos)
36 |
37 | def __call__(self, vertices, triangles, bg, texture=None):
38 | normal = get_normal(vertices, triangles)
39 |
40 | # 2. lighting
41 | light = np.zeros_like(vertices, dtype=np.float32)
42 | # ambient component
43 | if self.intensity_ambient > 0:
44 | light += self.intensity_ambient * self.color_ambient
45 |
46 | vertices_n = norm_vertices(vertices.copy())
47 | if self.intensity_directional > 0:
48 | # diffuse component
49 | direction = _norm(self.light_pos - vertices_n)
50 | cos = np.sum(normal * direction, axis=1)[:, None]
51 | # cos = np.clip(cos, 0, 1)
52 | # todo: check below
53 | light += self.intensity_directional * (self.color_directional * np.clip(cos, 0, 1))
54 |
55 | # specular component
56 | if self.intensity_specular > 0:
57 | v2v = _norm(self.view_pos - vertices_n)
58 | reflection = 2 * cos * normal - direction
59 | spe = np.sum((v2v * reflection) ** self.specular_exp, axis=1)[:, None]
60 | spe = np.where(cos != 0, np.clip(spe, 0, 1), np.zeros_like(spe))
61 | light += self.intensity_specular * self.color_directional * np.clip(spe, 0, 1)
62 | light = np.clip(light, 0, 1)
63 |
64 | # 2. rasterization, [0, 1]
65 | if texture is None:
66 | render_img = rasterize(vertices, triangles, light, bg=bg)
67 | return render_img
68 | else:
69 | texture *= light
70 | render_img = rasterize(vertices, triangles, texture, bg=bg)
71 | return render_img
72 |
73 |
74 | def main():
75 | pass
76 |
77 |
78 | if __name__ == '__main__':
79 | main()
80 |
--------------------------------------------------------------------------------
/Sim3DR/readme.md:
--------------------------------------------------------------------------------
1 | ## Sim3DR
2 | This is a simple 3D renderer, written in C++ and Cython.
3 |
4 | ### Build Sim3DR
5 |
6 | ```shell script
7 | python3 setup.py build_ext --inplace
8 | ```
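
### Usage (sketch)

A minimal example of the exposed API (`rasterize` and `RenderPipeline`), using a dummy triangle; real meshes come as float32 (N, 3) vertex/color arrays with int32 (M, 3) triangle indices, and the background must be a C-contiguous uint8 image.

```python
import numpy as np
from Sim3DR import RenderPipeline, rasterize

# A single dummy triangle; replace with a real mesh (float32 vertices/colors, int32 triangles).
vertices = np.array([[10., 10., 0.], [100., 10., 0.], [50., 90., 5.]], dtype=np.float32)
triangles = np.array([[0, 1, 2]], dtype=np.int32)
colors = np.full_like(vertices, 0.8)          # per-vertex colors in [0, 1]
bg = np.zeros((120, 120, 3), dtype=np.uint8)  # uint8 background image to draw on

overlap = rasterize(vertices, triangles, colors, bg=bg.copy())  # plain rasterization onto bg

app = RenderPipeline()                        # ambient/diffuse/specular lighting, then rasterization
lit = app(vertices, triangles, bg.copy())
```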
--------------------------------------------------------------------------------
/Sim3DR/setup.py:
--------------------------------------------------------------------------------
1 | '''
2 | python setup.py build_ext -i
3 | to compile
4 | '''
5 |
6 | from distutils.core import setup, Extension
7 | from Cython.Build import cythonize
8 | from Cython.Distutils import build_ext
9 | import numpy
10 |
11 | setup(
12 | name='Sim3DR_Cython', # not the package name
13 | cmdclass={'build_ext': build_ext},
14 | ext_modules=[Extension("Sim3DR_Cython",
15 | sources=["lib/rasterize.pyx", "lib/rasterize_kernel.cpp"],
16 | language='c++',
17 | include_dirs=[numpy.get_include()],
18 | extra_compile_args=["-std=c++11"])],
19 | )
20 |
--------------------------------------------------------------------------------
/Sim3DR/tests/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 |
--------------------------------------------------------------------------------
/Sim3DR/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.8)
2 |
3 | set(TARGET test)
4 | project(${TARGET})
5 |
6 | #find_package( OpenCV REQUIRED )
7 | #include_directories( ${OpenCV_INCLUDE_DIRS} )
8 |
9 | #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -O3")
10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -std=c++11")
11 | add_executable(${TARGET} test.cpp rasterize_kernel.cpp io.cpp)
12 | target_include_directories(${TARGET} PRIVATE ${PROJECT_SOURCE_DIR})
13 |
--------------------------------------------------------------------------------
/Sim3DR/tests/io.cpp:
--------------------------------------------------------------------------------
1 | #include "io.h"
2 |
3 | //void load_obj(const string obj_fp, float* vertices, float* colors, float* triangles){
4 | // string line;
5 | // ifstream in(obj_fp);
6 | //
7 | // if(in.is_open()){
8 | // while (getline(in, line)){
9 | // stringstream ss(line);
10 | //
11 | // char t; // type: v, f
12 | // ss >> t;
13 | // if (t == 'v'){
14 | //
15 | // }
16 | // }
17 | // }
18 | //}
19 |
20 | void load_obj(const char *obj_fp, float *vertices, float *colors, int *triangles, int nver, int ntri) {
21 | FILE *fp;
22 | fp = fopen(obj_fp, "r");
23 |
24 | char t; // type: v or f
25 | if (fp != nullptr) {
26 | for (int i = 0; i < nver; ++i) {
27 | fscanf(fp, "%c", &t);
28 | for (int j = 0; j < 3; ++j)
29 | fscanf(fp, " %f", &vertices[3 * i + j]);
30 | for (int j = 0; j < 3; ++j)
31 | fscanf(fp, " %f", &colors[3 * i + j]);
32 | fscanf(fp, "\n");
33 | }
34 | // fscanf(fp, "%c", &t);
35 | for (int i = 0; i < ntri; ++i) {
36 | fscanf(fp, "%c", &t);
37 | for (int j = 0; j < 3; ++j) {
38 | fscanf(fp, " %d", &triangles[3 * i + j]);
39 | triangles[3 * i + j] -= 1;
40 | }
41 | fscanf(fp, "\n");
42 | }
43 |
44 | fclose(fp);
45 | }
46 | }
47 |
48 | void load_ply(const char *ply_fp, float *vertices, int *triangles, int nver, int ntri) {
49 | FILE *fp;
50 | fp = fopen(ply_fp, "r");
51 |
52 | // char s[256];
53 | char t;
54 | if (fp != nullptr) {
55 | // for (int i = 0; i < 9; ++i)
56 | // fscanf(fp, "%s", s);
57 | for (int i = 0; i < nver; ++i)
58 | fscanf(fp, "%f %f %f\n", &vertices[3 * i], &vertices[3 * i + 1], &vertices[3 * i + 2]);
59 |
60 | for (int i = 0; i < ntri; ++i)
61 | fscanf(fp, "%c %d %d %d\n", &t, &triangles[3 * i], &triangles[3 * i + 1], &triangles[3 * i + 2]);
62 |
63 | fclose(fp);
64 | }
65 | }
66 |
67 | void write_ppm(const char *filename, unsigned char *img, int h, int w, int c) {
68 | FILE *fp;
69 | //open file for output
70 | fp = fopen(filename, "wb");
71 | if (!fp) {
72 | fprintf(stderr, "Unable to open file '%s'\n", filename);
73 | exit(1);
74 | }
75 |
76 | //write the header file
77 | //image format
78 | fprintf(fp, "P6\n");
79 |
80 | //image size
81 | fprintf(fp, "%d %d\n", w, h);
82 |
83 | // rgb component depth
84 | fprintf(fp, "%d\n", MAX_PXL_VALUE);
85 |
86 | // pixel data
87 | fwrite(img, sizeof(unsigned char), size_t(h * w * c), fp);
88 | fclose(fp);
89 | }
--------------------------------------------------------------------------------
/Sim3DR/tests/io.h:
--------------------------------------------------------------------------------
1 | #ifndef IO_H_
2 | #define IO_H_
3 |
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <string>
7 | #include <fstream>
8 | #include <sstream>
9 |
10 | using namespace std;
11 |
12 | #define MAX_PXL_VALUE 255
13 |
14 | void load_obj(const char* obj_fp, float* vertices, float* colors, int* triangles, int nver, int ntri);
15 | void load_ply(const char* ply_fp, float* vertices, int* triangles, int nver, int ntri);
16 |
17 |
18 | void write_ppm(const char *filename, unsigned char *img, int h, int w, int c);
19 |
20 | #endif
--------------------------------------------------------------------------------
/Sim3DR/tests/test.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Testing cases
3 | */
4 |
5 | #include <iostream>
6 | #include <ctime>
7 | #include "rasterize.h"
8 | #include "io.h"
9 |
10 | void test_isPointInTri() {
11 | Point p0(0, 0);
12 | Point p1(1, 0);
13 | Point p2(1, 1);
14 |
15 | Point p(0.2, 0.2);
16 |
17 | if (is_point_in_tri(p, p0, p1, p2))
18 | std::cout << "In";
19 | else
20 | std::cout << "Out";
21 | std::cout << std::endl;
22 | }
23 |
24 | void test_getPointWeight() {
25 | Point p0(0, 0);
26 | Point p1(1, 0);
27 | Point p2(1, 1);
28 |
29 | Point p(0.2, 0.2);
30 |
31 | float weight[3];
32 | get_point_weight(weight, p, p0, p1, p2);
33 | std::cout << weight[0] << " " << weight[1] << " " << weight[2] << std::endl;
34 | }
35 |
36 | void test_get_tri_normal() {
37 | float tri_normal[3];
38 | // float vertices[9] = {1, 0, 0, 0, 0, 0, 0, 1, 0};
39 | float vertices[9] = {1, 1.1, 0, 0, 0, 0, 0, 0.6, 0.7};
40 | int triangles[3] = {0, 1, 2};
41 | int ntri = 1;
42 |
43 | _get_tri_normal(tri_normal, vertices, triangles, ntri, true);  // norm_flg = true to get a unit normal
44 |
45 | for (int i = 0; i < 3; ++i)
46 | std::cout << tri_normal[i] << ", ";
47 | std::cout << std::endl;
48 | }
49 |
50 | void test_load_obj() {
51 | const char *fp = "../data/vd005_mesh.obj";
52 | int nver = 35709;
53 | int ntri = 70789;
54 |
55 | auto *vertices = new float[3 * nver];   // 3 floats per vertex
56 | auto *colors = new float[3 * nver];
57 | auto *triangles = new int[3 * ntri];    // 3 indices per triangle
58 | load_obj(fp, vertices, colors, triangles, nver, ntri);
59 |
60 | delete[] vertices;
61 | delete[] colors;
62 | delete[] triangles;
63 | }
64 |
65 | void test_render() {
66 | // 1. loading obj
67 | // const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/vd005_mesh.obj";
68 | const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/face1.obj";
69 | int nver = 35709; //53215; //35709;
70 | int ntri = 70789; //105840;//70789;
71 |
72 | auto *vertices = new float[3 * nver];
73 | auto *colors = new float[3 * nver];
74 | auto *triangles = new int[3 * ntri];
75 | load_obj(fp, vertices, colors, triangles, nver, ntri);
76 |
77 | // 2. rendering
78 | int h = 224, w = 224, c = 3;
79 |
80 | // enlarging
81 | int scale = 4;
82 | h *= scale;
83 | w *= scale;
84 | for (int i = 0; i < nver * 3; ++i) vertices[i] *= scale;
85 |
86 | auto *image = new unsigned char[h * w * c]();
87 | auto *depth_buffer = new float[h * w]();
88 |
89 | for (int i = 0; i < h * w; ++i) depth_buffer[i] = -999999;
90 |
91 | clock_t t;
92 | t = clock();
93 |
94 | _rasterize(image, vertices, triangles, colors, depth_buffer, ntri, h, w, c, 1.0f, true);  // alpha = 1.0, reverse = true
95 | t = clock() - t;
96 | double time_taken = ((double) t) / CLOCKS_PER_SEC; // in seconds
97 | printf("Render took %f seconds to execute \n", time_taken);
98 |
99 |
100 | // auto *image_char = new u_char[h * w * c]();
101 | // for (int i = 0; i < h * w * c; ++i)
102 | // image_char[i] = u_char(255 * image[i]);
103 | write_ppm("res.ppm", image, h, w, c);
104 |
105 | // delete[] image_char;
106 | delete[] vertices;
107 | delete[] colors;
108 | delete[] triangles;
109 | delete[] image;
110 | delete[] depth_buffer;
111 | }
112 |
113 | void test_light() {
114 | // 1. loading obj
115 | const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/emma_input_0_noheader.ply";
116 | int nver = 53215; //35709;
117 | int ntri = 105840; //70789;
118 |
119 | auto *vertices = new float[3 * nver];
120 | auto *colors = new float[3 * nver];
121 | auto *triangles = new int[3 * ntri];
122 | load_ply(fp, vertices, triangles, nver, ntri);
123 |
124 | // 2. rendering
125 | // int h = 1901, w = 3913, c = 3;
126 | int h = 2000, w = 4000, c = 3;
127 |
128 | // enlarging
129 | // int scale = 1;
130 | // h *= scale;
131 | // w *= scale;
132 | // for (int i = 0; i < nver * 3; ++i) vertices[i] *= scale;
133 |
134 | auto *image = new unsigned char[h * w * c]();
135 | auto *depth_buffer = new float[h * w]();
136 |
137 | for (int i = 0; i < h * w; ++i) depth_buffer[i] = -999999;
138 | for (int i = 0; i < 3 * nver; ++i) colors[i] = 0.8;
139 |
140 | clock_t t;
141 | t = clock();
142 |
143 | _rasterize(image, vertices, triangles, colors, depth_buffer, ntri, h, w, c, 1.0f, true);  // alpha = 1.0, reverse = true
144 | t = clock() - t;
145 | double time_taken = ((double) t) / CLOCKS_PER_SEC; // in seconds
146 | printf("Render took %f seconds to execute \n", time_taken);
147 |
148 |
149 | // auto *image_char = new u_char[h * w * c]();
150 | // for (int i = 0; i < h * w * c; ++i)
151 | // image_char[i] = u_char(255 * image[i]);
152 | write_ppm("emma.ppm", image, h, w, c);
153 |
154 | // delete[] image_char;
155 | delete[] vertices;
156 | delete[] colors;
157 | delete[] triangles;
158 | delete[] image;
159 | delete[] depth_buffer;
160 | }
161 |
162 | int main(int argc, char *argv[]) {
163 | // std::cout << "Hello CMake!" << std::endl;
164 |
165 | // test_isPointInTri();
166 | // test_getPointWeight();
167 | // test_get_tri_normal();
168 | // test_load_obj();
169 | // test_render();
170 | test_light();
171 | return 0;
172 | }
--------------------------------------------------------------------------------
/backbone_nets/mobilenetv2_backbone.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | __all__ = ['MobileNetV2', 'mobilenet_v2']
6 |
7 |
8 | model_urls = {
9 | 'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
10 | }
11 |
12 |
13 | def _make_divisible(v, divisor, min_value=None):
14 | """
15 | This function is taken from the original tf repo.
16 | It ensures that all layers have a channel number that is divisible by 8
17 | It can be seen here:
18 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
19 | :param v:
20 | :param divisor:
21 | :param min_value:
22 | :return:
23 | """
24 | if min_value is None:
25 | min_value = divisor
26 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
27 | # Make sure that round down does not go down by more than 10%.
28 | if new_v < 0.9 * v:
29 | new_v += divisor
30 | return new_v
31 |
32 |
33 | class ConvBNReLU(nn.Sequential):
34 | def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None):
35 | padding = (kernel_size - 1) // 2
36 | if norm_layer is None:
37 | norm_layer = nn.BatchNorm2d
38 | super(ConvBNReLU, self).__init__(
39 | nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
40 | norm_layer(out_planes),
41 | nn.ReLU6(inplace=True)
42 | )
43 |
44 |
45 | class InvertedResidual(nn.Module):
46 | def __init__(self, inp, oup, stride, expand_ratio, norm_layer=None):
47 | super(InvertedResidual, self).__init__()
48 | self.stride = stride
49 | assert stride in [1, 2]
50 |
51 | if norm_layer is None:
52 | norm_layer = nn.BatchNorm2d
53 |
54 | hidden_dim = int(round(inp * expand_ratio))
55 | self.use_res_connect = self.stride == 1 and inp == oup
56 |
57 | layers = []
58 | if expand_ratio != 1:
59 | # pw
60 | layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1, norm_layer=norm_layer))
61 | layers.extend([
62 | # dw
63 | ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim, norm_layer=norm_layer),
64 | # pw-linear
65 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
66 | norm_layer(oup),
67 | ])
68 | self.conv = nn.Sequential(*layers)
69 |
70 | def forward(self, x):
71 | if self.use_res_connect:
72 | return x + self.conv(x)
73 | else:
74 | return self.conv(x)
75 |
76 |
77 | class MobileNetV2(nn.Module):
78 | def __init__(self,
79 | num_classes=1000,
80 | width_mult=1.0,
81 | inverted_residual_setting=None,
82 | round_nearest=8,
83 | block=None,
84 | norm_layer=None,
85 | last_CN=None):
86 | """
87 | MobileNet V2 main class
88 | Args:
89 | num_classes (int): Number of classes
90 | width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
91 | inverted_residual_setting: Network structure
92 | round_nearest (int): Round the number of channels in each layer to be a multiple of this number
93 | Set to 1 to turn off rounding
94 | block: Module specifying inverted residual building block for mobilenet
95 | norm_layer: Module specifying the normalization layer to use
96 | """
97 | super(MobileNetV2, self).__init__()
98 |
99 | if block is None:
100 | block = InvertedResidual
101 |
102 | if norm_layer is None:
103 | norm_layer = nn.BatchNorm2d
104 |
105 | input_channel = 32
106 | last_channel = 1280
107 |
108 | if inverted_residual_setting is None:
109 | inverted_residual_setting = [
110 | # t, c, n, s
111 | [1, 16, 1, 1],
112 | [6, 24, 2, 2],
113 | [6, 32, 3, 2],
114 | [6, 64, 4, 2],
115 | [6, 96, 3, 1],
116 | [6, 160, 3, 2],
117 | [6, 320, 1, 1],
118 | ]
119 |
120 | # only check the first element, assuming user knows t,c,n,s are required
121 | if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
122 | raise ValueError("inverted_residual_setting should be a non-empty "
123 | "list of 4-element lists, got {}".format(inverted_residual_setting))
124 |
125 | # building first layer
126 | input_channel = _make_divisible(input_channel * width_mult, round_nearest)
127 | self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
128 | features = [ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer)]
129 | # building inverted residual blocks
130 | total = 0
131 | for t, c, n, s in inverted_residual_setting:
132 | output_channel = _make_divisible(c * width_mult, round_nearest)
133 | for i in range(n):
134 | total += 1
135 | stride = s if i == 0 else 1
136 | features.append(block(input_channel, output_channel, stride, expand_ratio=t, norm_layer=norm_layer))
137 | input_channel = output_channel
138 | # building last several layers
139 | features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer))
140 | # make it nn.Sequential
141 | self.features = nn.Sequential(*features)
142 | self.features_first = self.features[:9]
143 | self.features_second = self.features[9:]
144 |
145 | if not last_CN:
146 | self.last_CN = self.last_channel
147 | else:
148 | self.last_CN = last_CN
149 |
150 | # building classifier
151 |
152 | self.num_ori = 12
153 | self.num_shape = 40
154 | self.num_exp = 10
155 | self.num_texture = 40
156 | self.num_bin = 121
157 | self.num_scale = 1
158 | self.num_trans = 3
159 |
160 | if last_CN is not None:
161 | self.connector = nn.Sequential(
162 | nn.Linear(self.last_CN, self.last_CN//16),
163 | nn.ReLU6(inplace=True),
164 | nn.Linear(self.last_CN//16, self.last_CN),
165 | nn.ReLU6(inplace=True),
166 | nn.Sigmoid()
167 | )
168 | self.adjuster = nn.Sequential(
169 | nn.Linear(self.last_CN, self.last_CN),
170 | nn.BatchNorm1d(self.last_CN))
171 |
172 | self.classifier_ori = nn.Sequential(
173 | nn.Dropout(0.2),
174 | nn.Linear(self.last_CN, self.num_ori),
175 | )
176 | self.classifier_shape = nn.Sequential(
177 | nn.Dropout(0.2),
178 | nn.Linear(self.last_CN, self.num_shape),
179 | )
180 | self.classifier_exp = nn.Sequential(
181 | nn.Dropout(0.2),
182 | nn.Linear(self.last_CN, self.num_exp),
183 | )
184 | self.classifier_texture = nn.Sequential(
185 | nn.Dropout(0.2),
186 | nn.Linear(self.last_CN, self.num_texture),
187 | )
188 |
189 | # weight initialization
190 | for m in self.modules():
191 | if isinstance(m, nn.Conv2d):
192 | nn.init.kaiming_normal_(m.weight, mode='fan_out')
193 | if m.bias is not None:
194 | nn.init.zeros_(m.bias)
195 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
196 | nn.init.ones_(m.weight)
197 | nn.init.zeros_(m.bias)
198 | elif isinstance(m, nn.Linear):
199 | nn.init.normal_(m.weight, 0, 0.01)
200 | nn.init.zeros_(m.bias)
201 |
202 | def _forward_impl(self, x):
203 | # This exists since TorchScript doesn't support inheritance, so the superclass method
204 | # (this one) needs to have a name other than `forward` that can be accessed in a subclass
205 | inter = self.features_first(x)
206 | x = self.features_second(inter)
207 |
208 | x = nn.functional.adaptive_avg_pool2d(x, 1)
209 | x = x.reshape(x.shape[0], -1)
210 |
211 | pool_x = x.clone()
212 | x_ori = self.classifier_ori(x)
213 | x_shape = self.classifier_shape(x)
214 | x_exp = self.classifier_exp(x)
215 | x_tex = self.classifier_texture(x)
216 | x = torch.cat((x_ori, x_shape, x_exp, x_tex), dim=1)
217 |
218 | return x, pool_x, inter
219 |
220 | def forward(self, x):
221 | return self._forward_impl(x)
222 |
223 |
224 | def mobilenet_v2(pretrained=False, progress=True, **kwargs):
225 | """
226 | Constructs a MobileNetV2 architecture from
227 | `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
228 | Args:
229 | pretrained (bool): If True, returns a model pre-trained on ImageNet
230 | progress (bool): If True, displays a progress bar of the download to stderr
231 | """
232 | model = MobileNetV2(**kwargs)
233 | if pretrained:
234 | state_dict = torch.hub.load_state_dict_from_url(model_urls['mobilenet_v2'],
235 | progress=progress)
236 | model.load_state_dict(state_dict)
237 | return model
--------------------------------------------------------------------------------
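A minimal sanity-check sketch for the backbone above (not part of the repository; assumes PyTorch is installed and the script is run from the repository root). It exercises the three outputs of MobileNetV2._forward_impl: the concatenated 102-dim parameter vector (12 orientation + 40 shape + 10 expression + 40 texture), the pooled 1280-dim feature, and the intermediate feature map from the first nine blocks; the shapes in the comments assume a 120x120 input, the size the demos upsample synthesized faces to.

import torch
from backbone_nets.mobilenetv2_backbone import mobilenet_v2

model = mobilenet_v2(pretrained=False).eval()
x = torch.randn(1, 3, 120, 120)
with torch.no_grad():
    params, pooled, inter = model(x)
print(params.shape)  # torch.Size([1, 102])
print(pooled.shape)  # torch.Size([1, 1280])
print(inter.shape)   # torch.Size([1, 64, 8, 8])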
/cal_size.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Make sure you have downloaded the saved meshes from supervised learning and the validation set used to compute the error.
4 |
5 | python cal_size_ARE.py
6 | python cal_size_kpts.py
--------------------------------------------------------------------------------
/cal_size_ARE.py:
--------------------------------------------------------------------------------
1 | # This script calculates the point-to-point face size error (ARE)
2 |
3 | import numpy as np
4 | import glob
5 | from statistics import mean
6 |
7 | def read_obj(filename):
8 | f = open(filename)
9 | lines = f.readlines()
10 | coll = []
11 | for l in lines:
12 | if l[0] != 'v':
13 | break
14 | comp = l.split()[1:]
15 | comp = list(map(float, comp))
16 | coll.append(comp)
17 |
18 | a = np.asarray(coll)
19 | return a
20 |
21 | def read_xyz(filename):
22 | f = open(filename)
23 | lines = f.readlines()
24 | coll = []
25 | for l in lines:
26 | comp = l.split()
27 | comp = list(map(float, comp))
28 | coll.append(comp)
29 | a=np.asarray(coll)
30 | return a
31 |
32 | folders = glob.glob('data/all_test_result_3PerP_supervised_64/*')
33 | fore_name, cheek_name, ear_name, mid_name = [], [], [], []
34 |
35 | for folder in folders:
36 | folder_name = folder.rsplit('/',1)[-1]
37 | print("Evaluating: ", folder_name)
38 | all_predictions = glob.glob(folder+'/*.obj')
39 | target_pts = read_xyz(glob.glob('data/A2E_val/'+folder_name+'/*.xyz')[0])
40 | target_OICD = np.linalg.norm(target_pts[2217]-target_pts[14607])
41 | target_foreD = np.linalg.norm(target_pts[1678]-target_pts[42117])
42 | target_cheekD = np.linalg.norm(target_pts[2294]-target_pts[13635])
43 | target_earD = np.linalg.norm(target_pts[20636]-target_pts[34153])
44 | target_midD = np.linalg.norm(target_pts[2130]-target_pts[15003])
45 |
46 | target_foreOICD = target_foreD/target_OICD
47 | target_cheekOICD = target_cheekD/target_OICD
48 | target_earOICD = target_earD/target_OICD
49 | target_midOICD = target_midD/target_OICD
50 |
51 | fore_err, cheek_err, ear_err, mid_err = [],[],[],[]
52 |
53 | for pred in all_predictions:
54 | pred_pts = read_obj(pred)
55 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607])
56 | pred_pts *= (target_OICD/pred_OICD)
57 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607])
58 | pred_midD = np.linalg.norm(pred_pts[2130]-pred_pts[15003])
59 | pred_foreD = np.linalg.norm(pred_pts[1678]-pred_pts[42117])
60 | pred_cheekD = np.linalg.norm(pred_pts[2294]-pred_pts[13635])
61 | pred_earD = np.linalg.norm(pred_pts[20636]-pred_pts[34153])
62 |
63 | pred_midOICD = pred_midD/pred_OICD
64 | pred_foreOICD = pred_foreD/pred_OICD
65 | pred_cheekOICD = pred_cheekD/pred_OICD
66 | pred_earOICD = pred_earD/pred_OICD
67 |
68 | fore_err.append(abs(pred_foreOICD-target_foreOICD))
69 | cheek_err.append(abs(pred_cheekOICD-target_cheekOICD))
70 | ear_err.append(abs(pred_earOICD-target_earOICD))
71 | mid_err.append(abs(pred_midOICD-target_midOICD))
72 |
73 |
74 | fore_err_mean, cheek_err_mean, ear_err_mean, mid_err_mean = mean(fore_err), mean(cheek_err), mean(ear_err), mean(mid_err)
75 | fore_name.append(fore_err_mean)
76 | cheek_name.append(cheek_err_mean)
77 | mid_name.append(mid_err_mean)
78 | ear_name.append(ear_err_mean)
79 |
80 | print("Summary of the ARE:")
81 | print("-----------------------")
82 | print("Fore ratio error", mean(fore_name))
83 | print("Cheek ratio error", mean(cheek_name))
84 | print("Ear ratio error", mean(ear_name))
85 | print("Mid ratio error", mean(mid_name))
86 |
--------------------------------------------------------------------------------
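Each term in the ARE above normalizes a measured facial distance by the distance between vertices 2217 and 14607 (the OICD in the script) before comparing prediction and target. A minimal sketch of that per-distance computation; the helper name ratio_error is illustrative, not from the repository.

import numpy as np

def ratio_error(pred_pts, target_pts, idx_a, idx_b, oicd=(2217, 14607)):
    # absolute difference of the OICD-normalized distance between two vertices
    dist = lambda pts, i, j: np.linalg.norm(pts[i] - pts[j])
    pred_ratio = dist(pred_pts, idx_a, idx_b) / dist(pred_pts, *oicd)
    target_ratio = dist(target_pts, idx_a, idx_b) / dist(target_pts, *oicd)
    return abs(pred_ratio - target_ratio)

# e.g. the forehead term used above: ratio_error(pred_pts, target_pts, 1678, 42117)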
/cal_size_kpts.py:
--------------------------------------------------------------------------------
1 | # This script calculates the point-to-point face size error on keypoints
2 |
3 | import numpy as np
4 | import glob
5 | from statistics import mean
6 |
7 | def read_obj(filename):
8 | f = open(filename)
9 | lines = f.readlines()
10 | coll = []
11 | for l in lines:
12 | if l[0] != 'v':
13 | break
14 | comp = l.split()[1:]
15 | comp = list(map(float, comp))
16 | coll.append(comp)
17 |
18 | a = np.asarray(coll)
19 | return a
20 |
21 | def read_xyz(filename):
22 | f = open(filename)
23 | lines = f.readlines()
24 | coll = []
25 | for l in lines:
26 | comp = l.split()
27 | comp = list(map(float, comp))
28 | coll.append(comp)
29 | a=np.asarray(coll)
30 | return a
31 |
32 |
33 | kpts = np.load('train.configs/keypoints_sim.npy')
34 | folders = glob.glob('data/all_test_result_3PerP_supervised_64/*')
35 | kpts_name = []
36 |
37 | for folder in folders:
38 | folder_name = folder.rsplit('/',1)[-1]
39 | print("Evaluating: ", folder_name)
40 | all_predictions = glob.glob(folder+'/*.obj')
41 | target_pts = read_xyz(glob.glob('data/A2E_val/'+folder_name+'/*.xyz')[0])
42 | target_OICD = np.linalg.norm(target_pts[2217]-target_pts[14607])
43 |
44 | RMSE_col = []
45 |
46 | for pred in all_predictions:
47 | pred_pts = read_obj(pred)
48 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607])
49 | pred_pts *= (target_OICD/pred_OICD)
50 | pred_pts_flat = pred_pts.flatten(order='C')
51 | target_pts_flat = target_pts.flatten(order='C')
52 |
53 | size_R, size_C = target_pts[:,1].max()-target_pts[:,1].min(), target_pts[:,0].max()-target_pts[:,0].min()
54 | pred_kpts, target_kpts = pred_pts_flat[kpts], target_pts_flat[kpts]
55 | RMSE = np.linalg.norm(pred_kpts-target_kpts)/np.sqrt(size_R*size_C)
56 | RMSE_col.append(RMSE)
57 |
58 | kpts_name_mean = mean(RMSE_col)
59 | kpts_name.append(kpts_name_mean)
60 |
61 | print("Keypoints error: ", mean(kpts_name))
--------------------------------------------------------------------------------
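The keypoint metric above is the L2 norm of the landmark coordinate differences (called RMSE in the script), where keypoints_sim.npy holds indices into the row-major flattened vertex array, normalized by the square root of the target face's bounding-box area. A compact restatement; the helper name keypoint_error is illustrative, not from the repository.

import numpy as np

def keypoint_error(pred_pts, target_pts, kpts):
    # kpts indexes the flattened (N*3,) vertex arrays, as in the loop above
    size_r = target_pts[:, 1].max() - target_pts[:, 1].min()
    size_c = target_pts[:, 0].max() - target_pts[:, 0].min()
    diff = pred_pts.flatten(order='C')[kpts] - target_pts.flatten(order='C')[kpts]
    return np.linalg.norm(diff) / np.sqrt(size_r * size_c)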
/config.py:
--------------------------------------------------------------------------------
1 | import string
2 | from dataset import VoiceDataset, FaceDataset
3 | from network import VoiceEmbedNet, Generator, FaceEmbedNet, Classifier
4 | from utils import get_collate_fn, get_collate_fn_4
5 | import os
6 |
7 | DATASET_PARAMETERS = {
8 | # meta data provided by voxceleb1 dataset
9 | 'meta_file': 'data/vox1_meta.csv',
10 |
11 | # voice dataset
12 | 'voice_dir': 'data/fbank',
13 | 'voice_ext': 'npy',
14 |
15 | # face dataset
16 | 'face_dir': 'data/VGG_ALL_FRONTAL',
17 | 'face_ext': '.jpg',
18 |
19 | # train data includes the identities
20 | # whose names start with the characters of 'FGH...XYZ'
21 | 'split': string.ascii_uppercase[5:],
22 |
23 | # dataloader
24 | 'voice_dataset': VoiceDataset,
25 | 'face_dataset': FaceDataset,
26 | 'batch_size': 64,
27 | 'nframe_range': [300, 800],
28 | 'workers_num': 1,
29 | 'collate_fn': get_collate_fn,
30 | 'collate_fn_4': get_collate_fn_4,
31 |
32 | # test data
33 | 'test_data': 'data/test_data/'
34 | }
35 |
36 | SAVE_DIR = 'pretrained_models/'
37 | NUM_EPOCH = 48000 #49999
38 |
39 | if not os.path.exists(SAVE_DIR):
40 | os.makedirs(SAVE_DIR)
41 |
42 | NETWORKS_PARAMETERS = {
43 |
44 | 'SAVE_DIR': SAVE_DIR,
45 |
46 | # VOICE EMBEDDING NETWORK (e)
47 | 'e': {
48 | 'network': VoiceEmbedNet,
49 | 'input_channel': 64,
50 | 'channels': [256, 384, 576, 864],
51 | 'output_channel': 64, # the embedding dimension
52 | 'model_path': 'pretrained_models/voice_embedding.pth',
53 | },
54 | # GENERATOR (g)
55 | 'g': {
56 | 'network': Generator,
57 | 'input_channel': 64,
58 | 'channels': [1024, 512, 256, 128, 64], # channels for deconvolutional layers
59 | 'output_channel': 3, # images with RGB channels
60 | 'model_path': f'{SAVE_DIR}/generator_{NUM_EPOCH}.pth'
61 | },
62 | # FACE EMBEDDING NETWORK (f)
63 | 'f': {
64 | 'network': FaceEmbedNet,
65 | 'input_channel': 3,
66 | 'channels': [32, 64, 128, 256, 512],
67 | 'output_channel': 64,
68 | 'model_path': 'models/face_embedding.pth',
69 | },
70 | # DISCRIMINATOR (d)
71 | 'd': {
72 | 'network': Classifier, # the discriminator is a special Classifier with a single output
73 | 'input_channel': 64,
74 | 'channels': [],
75 | 'output_channel': 1,
76 | 'model_path': 'models/discriminator.pth',
77 | },
78 | # CLASSIFIER (c)
79 | 'c': {
80 | 'network': Classifier,
81 | 'input_channel': 64,
82 | 'channels': [],
83 | 'output_channel': -1, # this parameter depends on the dataset used
84 | 'model_path': 'models/classifier.pth',
85 | },
86 | # OPTIMIZER PARAMETERS
87 | 'lr': 0.0002,
88 | 'beta1': 0.5,
89 | 'beta2': 0.999,
90 |
91 | # MODE, use GPU or not
92 | 'GPU': True,
93 |
94 | 'image3D':{
95 | 'model_path': f'{SAVE_DIR}/image3D_{NUM_EPOCH}.pth'
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
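One small wrinkle in the checkpoint paths above: SAVE_DIR already ends with '/', so the f-strings for the generator and image3D entries produce a doubled slash (harmless on POSIX filesystems, where both forms resolve to the same file). A sketch of the difference; os.path.join is shown only as an alternative, not as a change to the repository.

import os

SAVE_DIR = 'pretrained_models/'
NUM_EPOCH = 48000
print(f'{SAVE_DIR}/generator_{NUM_EPOCH}.pth')               # pretrained_models//generator_48000.pth
print(os.path.join(SAVE_DIR, f'generator_{NUM_EPOCH}.pth'))  # pretrained_models/generator_48000.pth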
/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00001.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00001.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00002.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00002.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00003.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00001/1TmvLk8sB-g/00003.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00001.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00001.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00002.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00002.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00003.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00002/0XmNeUnOnlg/00003.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00001.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00001.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00002.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00002.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00003.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00003/1M4q6CQM5pA/00003.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00001.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00001.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00002.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00002.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00003.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00004/_2wZVvsQYFg/00003.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00001.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00001.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00002.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00002.npy
--------------------------------------------------------------------------------
/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00003.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/preprocessed_MFCC/rand_id00005/0nH78dDh0N0/00003.npy
--------------------------------------------------------------------------------
/data/results/rand_id00001/1TmvLk8sB-g_00001_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00001_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00001/1TmvLk8sB-g_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00001_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00001/1TmvLk8sB-g_00002_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00002_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00001/1TmvLk8sB-g_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00002_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00001/1TmvLk8sB-g_00003_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00003_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00001/1TmvLk8sB-g_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00001/1TmvLk8sB-g_00003_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00002/0XmNeUnOnlg_00001_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00001_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00002/0XmNeUnOnlg_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00001_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00002/0XmNeUnOnlg_00002_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00002_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00002/0XmNeUnOnlg_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00002_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00002/0XmNeUnOnlg_00003_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00003_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00002/0XmNeUnOnlg_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00002/0XmNeUnOnlg_00003_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00003/1M4q6CQM5pA_00001_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00001_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00003/1M4q6CQM5pA_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00001_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00003/1M4q6CQM5pA_00002_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00002_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00003/1M4q6CQM5pA_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00002_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00003/1M4q6CQM5pA_00003_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00003_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00003/1M4q6CQM5pA_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00003/1M4q6CQM5pA_00003_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00004/_2wZVvsQYFg_00001_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00001_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00004/_2wZVvsQYFg_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00001_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00004/_2wZVvsQYFg_00002_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00002_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00004/_2wZVvsQYFg_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00002_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00004/_2wZVvsQYFg_00003_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00003_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00004/_2wZVvsQYFg_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00004/_2wZVvsQYFg_00003_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00005/0nH78dDh0N0_00001_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00001_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00005/0nH78dDh0N0_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00001_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00005/0nH78dDh0N0_00002_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00002_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00005/0nH78dDh0N0_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00002_overlap.png
--------------------------------------------------------------------------------
/data/results/rand_id00005/0nH78dDh0N0_00003_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00003_image.png
--------------------------------------------------------------------------------
/data/results/rand_id00005/0nH78dDh0N0_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results/rand_id00005/0nH78dDh0N0_00003_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00001_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00001_img.png
--------------------------------------------------------------------------------
/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00001_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00002_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00002_img.png
--------------------------------------------------------------------------------
/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00002_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00003_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00003_img.png
--------------------------------------------------------------------------------
/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Asa_Butterfield/1TmvLk8sB-g_00003_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00001_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00001_img.png
--------------------------------------------------------------------------------
/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00001_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00002_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00002_img.png
--------------------------------------------------------------------------------
/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00002_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00003_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00003_img.png
--------------------------------------------------------------------------------
/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Ashley_Greene/0XmNeUnOnlg_00003_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00001_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00001_img.png
--------------------------------------------------------------------------------
/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00001_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00002_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00002_img.png
--------------------------------------------------------------------------------
/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00002_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00003_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00003_img.png
--------------------------------------------------------------------------------
/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bellamy_Young/1M4q6CQM5pA_00003_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00001_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00001_img.png
--------------------------------------------------------------------------------
/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00001_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00002_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00002_img.png
--------------------------------------------------------------------------------
/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00002_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00003_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00003_img.png
--------------------------------------------------------------------------------
/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Bethany_Mota/_2wZVvsQYFg_00003_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Eva_Longoria/0nH78dDh0N0_00001_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00001_img.png
--------------------------------------------------------------------------------
/data/results_reference/Eva_Longoria/0nH78dDh0N0_00001_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00001_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Eva_Longoria/0nH78dDh0N0_00002_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00002_img.png
--------------------------------------------------------------------------------
/data/results_reference/Eva_Longoria/0nH78dDh0N0_00002_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00002_overlap.png
--------------------------------------------------------------------------------
/data/results_reference/Eva_Longoria/0nH78dDh0N0_00003_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00003_img.png
--------------------------------------------------------------------------------
/data/results_reference/Eva_Longoria/0nH78dDh0N0_00003_overlap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/data/results_reference/Eva_Longoria/0nH78dDh0N0_00003_overlap.png
--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from PIL import Image
4 | from torch.utils.data import Dataset
5 | import random
6 |
7 | def load_voice(voice_item):
8 | voice_data = np.load(voice_item['filepath'])
9 | voice_data = voice_data.T.astype('float32')
10 | voice_label = voice_item['label_id']
11 | return voice_data, voice_label
12 |
13 | def load_face(face_item):
14 | face_data = Image.open(face_item['filepath']).convert('RGB').resize([64, 64])
15 | face_data = np.transpose(np.array(face_data), (2, 0, 1))
16 | face_data = ((face_data - 127.5) / 127.5).astype('float32')
17 | face_label = face_item['label_id']
18 | return face_data, face_label
19 |
20 | class VoiceDataset(Dataset):
21 | def __init__(self, voice_list, nframe_range):
22 | self.voice_list = voice_list
23 | self.crop_nframe = nframe_range[1]
24 | self.length = len(self.voice_list)
25 |
26 | def __getitem__(self, index):
27 | ranidx = random.randint(0, self.length-1)
28 | voice_data, voice_label = load_voice(self.voice_list[index])
29 | if index == self.length-1:
30 | p_ind = index-1
31 | else:
32 | p_ind = index+1
33 | voice_data_p, _ = load_voice(self.voice_list[p_ind])
34 | voice_data_n, _ = load_voice(self.voice_list[ranidx])
35 | assert self.crop_nframe <= voice_data.shape[1]
36 | pt = np.random.randint(voice_data.shape[1] - self.crop_nframe + 1)
37 | voice_data = voice_data[:, pt:pt+self.crop_nframe]
38 | pt_p = np.random.randint(voice_data_p.shape[1] - self.crop_nframe + 1)
39 | voice_data_p = voice_data_p[:, pt_p:pt_p+self.crop_nframe]
40 | pt_n = np.random.randint(voice_data_n.shape[1] - self.crop_nframe + 1)
41 | voice_data_n = voice_data_n[:, pt_n:pt_n+self.crop_nframe]
42 | return voice_data, voice_label, voice_data_p, voice_data_n
43 |
44 | def __len__(self):
45 | return len(self.voice_list)
46 |
47 | class FaceDataset(Dataset):
48 | def __init__(self, face_list):
49 | self.face_list = face_list
50 |
51 | def __getitem__(self, index):
52 | face_data, face_label = load_face(self.face_list[index])
53 | if np.random.random() > 0.5:
54 | face_data = np.flip(face_data, axis=2).copy()
55 | return face_data, face_label
56 |
57 | def __len__(self):
58 | return len(self.face_list)
59 |
--------------------------------------------------------------------------------
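A minimal sketch of driving VoiceDataset above; the file paths are hypothetical. Each list entry needs a 'filepath' pointing to an [nframe, 64] fbank .npy with nframe >= nframe_range[1] (the crop length) and a 'label_id'.

from dataset import VoiceDataset

voice_list = [
    {'filepath': 'data/fbank/id_a/clip1.npy', 'label_id': 0},  # hypothetical paths
    {'filepath': 'data/fbank/id_a/clip2.npy', 'label_id': 0},
    {'filepath': 'data/fbank/id_b/clip1.npy', 'label_id': 1},
]
dataset = VoiceDataset(voice_list, nframe_range=[300, 800])

# the anchor comes from the indexed entry, the second clip from the adjacent
# list entry, and the third from a random entry; each is a (64, 800) crop
anchor, label, clip_p, clip_n = dataset[0]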
/demo.py:
--------------------------------------------------------------------------------
1 | # This script demos with pre-processed MFCC.
2 | import os
3 | import glob
4 | import torch
5 | import scipy.io as sio
6 | import numpy as np
7 | import cv2
8 |
9 | from config import NETWORKS_PARAMETERS
10 | from network import get_network, SynergyNet
11 | from utils import voice2face_processed
12 | from utilf.render import render_vert
13 |
14 | # initialization
15 | e_net, _ = get_network('e', NETWORKS_PARAMETERS, train=False)
16 | g_net, _ = get_network('g', NETWORKS_PARAMETERS, train=False)
17 |
18 | # building models: unsupervised
19 | image3D = SynergyNet(pretrained=False, last_CN=None).cuda().eval()
20 | backbone_ckpt = torch.load(NETWORKS_PARAMETERS['image3D']['model_path'])
21 | image3D.load_state_dict(backbone_ckpt)
22 |
23 | # SynergyNet pretrained network for getting pose
24 | image3D_pretrained = SynergyNet(pretrained=True).cuda().eval()
25 |
26 | # data and config
27 | voice_list = sorted(glob.glob('data/preprocessed_MFCC/*'))
28 | up_layer = torch.nn.Upsample((120,120), mode='bilinear', align_corners=True)
29 | tri = sio.loadmat('./train.configs/tri.mat')['tri']
30 |
31 | # [TODO] Change this variable to your result output folder
32 | FOLDER_ROOT = 'data/results/'
33 |
34 | if not os.path.exists(FOLDER_ROOT):
35 | os.mkdir(FOLDER_ROOT)
36 |
37 | for folder in voice_list:
38 | index = folder.rsplit('/',1)[-1]
39 | print(index)
40 |
41 | if not os.path.exists(FOLDER_ROOT+index):
42 | os.mkdir(FOLDER_ROOT + index)
43 |
44 | all_sequences = sorted(glob.glob(folder+'/*'))
45 | for sequence in all_sequences:
46 | all_fbanks = sorted(glob.glob(sequence+'/*.npy'))
47 | sequence_name = sequence.rsplit('/',1)[-1]
48 |
49 | for fbank in all_fbanks:
50 | fbank_name = fbank.rsplit('/',1)[-1][:-4]
51 |
52 | with torch.no_grad():
53 | # voice2face
54 | face_image = voice2face_processed(e_net, g_net, fbank, NETWORKS_PARAMETERS['GPU'])
55 | face_image = up_layer(face_image)
56 |
57 | # Pose from 3DDFA-V2
58 | pose = image3D_pretrained(face_image, return_onlypose=True)
59 | R, off = image3D_pretrained.parse_param_102_pose(pose)
60 |
61 | # Alignment with synthesized image
62 | prediction = image3D(face_image)
63 | prediction = R @ prediction + off
64 |
65 | # transform to image coordinate space
66 | prediction[:, 1, :] = 127 - prediction[:, 1, :]
67 | save_name = FOLDER_ROOT+ index + '/' + sequence_name + '_' + fbank_name
68 | img = (((face_image[0].clamp(-1,1))*127.5)+128).detach().cpu().numpy().astype(np.uint8)
69 | img = np.transpose(img, (1,2,0))
70 | img = img[:,:,[2,1,0]]
71 | pred = prediction[0].detach().cpu().numpy()
72 | # save
73 | cv2.imwrite(save_name+'_image.png', img)
74 | render_vert(img, pred, alpha=1.0, wfp=save_name+'_overlap.png')
75 |
76 |
--------------------------------------------------------------------------------
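The synthesized face tensor above is CHW, RGB, in [-1, 1]; lines 68-70 convert it to the uint8 HWC BGR image that cv2.imwrite expects. The same conversion as a small stand-alone helper (illustrative, not part of the repository):

import numpy as np

def to_cv2_image(face_tensor):
    # face_tensor: (3, H, W) torch tensor in [-1, 1]; returns (H, W, 3) uint8 BGR
    img = ((face_tensor.clamp(-1, 1) * 127.5) + 128).detach().cpu().numpy().astype(np.uint8)
    img = np.transpose(img, (1, 2, 0))  # CHW -> HWC
    return img[:, :, [2, 1, 0]]         # RGB -> BGR for OpenCV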
/demo/coherence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/demo/coherence.png
--------------------------------------------------------------------------------
/demo/overall_purpose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/demo/overall_purpose.png
--------------------------------------------------------------------------------
/demo/supervised_comp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/demo/supervised_comp.png
--------------------------------------------------------------------------------
/demo_mic.py:
--------------------------------------------------------------------------------
1 |
2 | import cv2
3 | from dataclasses import dataclass, asdict
4 | import glob
5 | import numpy as np
6 | import os
7 | import pyaudio
8 | import scipy.io as sio
9 | from scipy.io import wavfile
10 | import shutil
11 | import torch
12 | import torch.nn.functional as F
13 | import torchvision.utils as vutils
14 | import webrtcvad
15 |
16 | from mfcc import MFCC
17 | from config import NETWORKS_PARAMETERS
18 | from network import get_network, SynergyNet
19 | from utils import voice2face, read_obj
20 | from vad import read_wave, write_wave, frame_generator, vad_collector
21 | from pyaudio_recording import Recorder
22 | from utilf.render import render_vert
23 |
24 | @dataclass
25 | class StreamParams:
26 | format: int = pyaudio.paInt16
27 | channels: int = 1
28 | rate: int = 16000
29 | frames_per_buffer: int = 1024
30 | input: bool = True
31 | output: bool = False
32 |
33 | def to_dict(self) -> dict:
34 | return asdict(self)
35 |
36 |
37 | def rm_sil(voice_file, vad_obj):
38 | """
39 | remove silence
40 | """
41 | audio, sample_rate = read_wave(voice_file)
42 | frames = frame_generator(20, audio, sample_rate)
43 | frames = list(frames)
44 | segments = vad_collector(sample_rate, 20, 50, vad_obj, frames)
45 |
46 | if os.path.exists('tmp/'):
47 | shutil.rmtree('tmp/')
48 | os.makedirs('tmp/')
49 |
50 | wave_data = []
51 | for i, segment in enumerate(segments):
52 | segment_file = 'tmp/' + str(i) + '.wav'
53 | write_wave(segment_file, segment, sample_rate)
54 | wave_data.append(wavfile.read(segment_file)[1])
55 | shutil.rmtree('tmp/')
56 |
57 | if wave_data:
58 | vad_voice = np.concatenate(wave_data).astype('int16')
59 | return vad_voice
60 |
61 | def get_fbank(voice, mfc_obj):
62 | """
63 | process audio and create mel-spectrogram
64 | """
65 | # Extract log mel-spectrogram
66 | fbank = mfc_obj.sig2logspec(voice).astype('float32')
67 |
68 | # Mean and variance normalization of each mel-frequency
69 | fbank = fbank - fbank.mean(axis=0)
70 | fbank = fbank / (fbank.std(axis=0)+np.finfo(np.float32).eps)
71 |
72 | # If the duration of a voice recording is less than 10 seconds (1000 frames),
73 | # repeat the recording until it is longer than 10 seconds and crop.
74 | full_frame_number = 1000
75 | init_frame_number = fbank.shape[0]
76 | while fbank.shape[0] < full_frame_number:
77 | fbank = np.append(fbank, fbank[0:init_frame_number], axis=0)
78 | fbank = fbank[0:full_frame_number,:]
79 | return fbank
80 |
81 | def voice2face(e_net, g_net, voice_file, vad_obj, mfc_obj, GPU=True):  # note: shadows the voice2face imported from utils
82 | vad_voice = rm_sil(voice_file, vad_obj)
83 | fbank = get_fbank(vad_voice, mfc_obj)
84 | fbank = fbank.T[np.newaxis, ...]
85 | fbank = torch.from_numpy(fbank.astype('float32'))
86 |
87 | if GPU:
88 | fbank = fbank.cuda()
89 | embedding = e_net(fbank)
90 | embedding = F.normalize(embedding)
91 | face = g_net(embedding)
92 | return face
93 |
94 | def main():
95 | # record audio and save it under the project root
96 | filename = "audio.wav"
97 | # stream_params = StreamParams()
98 | # recorder = Recorder(stream_params)
99 | # # record for 5 seconds
100 | # recorder.record(5, filename)
101 |
102 | # initialization
103 | # voice activity detector, aggressiveness = 2
104 | vad_obj = webrtcvad.Vad(2)
105 | # Mel-Frequency extractor
106 | mfc_obj = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025)
107 | # net definition
108 | e_net, _ = get_network('e', NETWORKS_PARAMETERS, train=False)
109 | g_net, _ = get_network('g', NETWORKS_PARAMETERS, train=False)
110 |
111 | # building models: unsupervised
112 | image3D = SynergyNet(pretrained=False, last_CN=None).cuda().eval()
113 | backbone_ckpt = torch.load(NETWORKS_PARAMETERS['image3D']['model_path'])
114 | image3D.load_state_dict(backbone_ckpt)
115 |
116 | # SynergyNet pretrained network for getting pose
117 | image3D_pretrained = SynergyNet(pretrained=True).cuda().eval()
118 |
119 | # data and config
120 | up_layer = torch.nn.Upsample((120,120), mode='bilinear', align_corners=True)
121 | tri = sio.loadmat('./train.configs/tri.mat')['tri']
122 |
123 | # default savepath
124 | FOLDER_ROOT = 'data/results/'
125 | if not os.path.exists(FOLDER_ROOT):
126 | os.makedirs(FOLDER_ROOT)
127 |
128 | with torch.no_grad():
129 | # voice2face
130 | face_image = voice2face(e_net, g_net, filename, vad_obj, mfc_obj, NETWORKS_PARAMETERS['GPU'])
131 | face_image = up_layer(face_image)
132 |
133 | # Pose from 3DDFA-V2
134 | pose = image3D_pretrained(face_image, return_onlypose=True)
135 | R, off = image3D_pretrained.parse_param_102_pose(pose)
136 |
137 | # Alignment with synthesized image
138 | prediction_fr = image3D(face_image)
139 | prediction = R @ prediction_fr + off
140 |
141 | # compare the predicted mesh with the mean male and female shapes to classify gender
142 | #print(prediction_fr.requires_grad)
143 | prediction_fr_np = prediction_fr.squeeze(0).cpu().numpy()
144 | prediction_fr_np = np.transpose(prediction_fr_np, (1,0))
145 | mean_male = read_obj('male.obj') # 53215 * 3
146 | mean_female = read_obj('female.obj') # 53215 * 3
147 | N_vertices = prediction_fr_np.shape[0] #53215
148 | error_male = np.linalg.norm(prediction_fr_np - mean_male)/ N_vertices
149 | error_female = np.linalg.norm(prediction_fr_np - mean_female)/ N_vertices
150 |
151 | pred_midD = np.linalg.norm(prediction_fr_np[2130]-prediction_fr_np[15003])
152 | pred_foreD = np.linalg.norm(prediction_fr_np[1678]-prediction_fr_np[42117])
153 | pred_cheekD = np.linalg.norm(prediction_fr_np[2294]-prediction_fr_np[13635])
154 | pred_earD = np.linalg.norm(prediction_fr_np[20636]-prediction_fr_np[34153])
155 | print("-------------------------")
156 | if error_male < error_female:
157 | print("This is a male's voice")
158 | print("Statistics from the predicted mesh and mean gender mesh")
159 | target_foreD = np.linalg.norm(mean_male[1678]-mean_male[42117])
160 | target_cheekD = np.linalg.norm(mean_male[2294]-mean_male[13635])
161 | target_earD = np.linalg.norm(mean_male[20636]-mean_male[34153])
162 | target_midD = np.linalg.norm(mean_male[2130]-mean_male[15003])
163 |
164 | ratio_fore = (pred_foreD-target_foreD)/target_foreD
165 | ratio_cheek = (pred_cheekD-target_cheekD)/target_cheekD
166 | ratio_ear = (pred_earD-target_earD)/target_earD
167 | ratio_mid = (pred_midD-target_midD)/target_midD
168 |
169 | print(f"The forehead is {ratio_fore*100}% than the mean male shape")
170 | print(f"The cheek-to-cheek is {ratio_cheek*100}% than the mean male shape")
171 | print(f"The ear-to-ear is {ratio_ear*100}% than the mean male shape")
172 | print(f"The midline is {ratio_mid*100}% than the mean male shape")
173 | else:
174 | print("This is a female's voice")
175 | print("Statistics from the predicted mesh and mean gender mesh")
176 | target_foreD = np.linalg.norm(mean_female[1678]-mean_female[42117])
177 | target_cheekD = np.linalg.norm(mean_female[2294]-mean_female[13635])
178 | target_earD = np.linalg.norm(mean_female[20636]-mean_female[34153])
179 | target_midD = np.linalg.norm(mean_female[2130]-mean_female[15003])
180 |
181 | ratio_fore = (pred_foreD-target_foreD)/target_foreD
182 | ratio_cheek = (pred_cheekD-target_cheekD)/target_cheekD
183 | ratio_ear = (pred_earD-target_earD)/target_earD
184 | ratio_mid = (pred_midD-target_midD)/target_midD
185 |
186 | print(f"The forehead is {ratio_fore*100}% than the mean female shape")
187 | print(f"The cheek-to-cheek is {ratio_cheek*100}% than the femean male shape")
188 | print(f"The ear-to-ear is {ratio_ear*100}% than the mean female shape")
189 | print(f"The midline is {ratio_mid*100}% than the mean female shape")
190 | print("-------------------------")
191 |
192 | wide_shape = read_obj('wide.obj')
193 | skinny_shape = read_obj('skinny.obj')
194 | regular_shape = read_obj('regular.obj')
195 | slim_shape = read_obj('slim.obj')
196 | error_wide = np.linalg.norm(prediction_fr_np - wide_shape)/ N_vertices
197 | error_skinny = np.linalg.norm(prediction_fr_np - skinny_shape)/ N_vertices
198 | error_regular = np.linalg.norm(prediction_fr_np - regular_shape)/ N_vertices
199 | error_slim = np.linalg.norm(prediction_fr_np - slim_shape)/ N_vertices
200 | err_type = np.array([error_wide, error_skinny, error_regular, error_slim])
201 | index = np.argmin(err_type)
202 |
203 | if index == 0:
204 | print("The face shape is closer to WIDE")
205 | elif index == 1:
206 | print(f"The face shape is closer to SKINNY")
207 | elif index == 2:
208 | print(f"The face shape is closer to REGULAR")
209 | elif index == 3:
210 | print(f"The face shape is closer to SLIM")
211 |
212 | print("-------------------------")
213 |
214 | # transform to image coordinate space
215 | prediction[:, 1, :] = 127 - prediction[:, 1, :]
216 | save_name = os.path.join(FOLDER_ROOT, 'micIn')
217 | img = (((face_image[0].clamp(-1,1))*127.5)+128).detach().cpu().numpy().astype(np.uint8)
218 | img = np.transpose(img, (1,2,0))
219 | img = img[:,:,[2,1,0]]
220 | pred = prediction[0].detach().cpu().numpy()
221 |
222 | # save
223 | cv2.imwrite(save_name+'_image.png', img)
224 | render_vert(img, pred, alpha=1.0, wfp=save_name+'_overlap.png')
225 |
226 | vutils.save_image(face_image.detach().clamp(-1,1), filename.replace('.wav', '.png'), normalize=True)
227 |
228 |
229 | if __name__ == '__main__':
230 | main()
--------------------------------------------------------------------------------
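Note on /demo_mic.py: the live-recording lines in main() are left commented out, so the script expects an existing audio.wav. A minimal sketch of enabling microphone capture (assuming a working input device and the 16 kHz mono StreamParams dataclass defined in this file) would be:

    # inside main(), before the inference block
    stream_params = StreamParams()          # 16 kHz, mono, paInt16, 1024-sample buffers
    recorder = Recorder(stream_params)      # Recorder is imported from pyaudio_recording
    recorder.record(5, filename)            # blocks for ~5 seconds, then writes audio.wav
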
/distiller_zoo.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 |
6 | class Attention(nn.Module):
7 | def __init__(self, p=2):
8 | super(Attention, self).__init__()
9 | self.p = p
10 |
11 |
12 | def forward(self, f_s, f_t):
13 | if f_s.dim() == 2:
14 | return (F.normalize(f_s.pow(self.p))-F.normalize(f_t.pow(self.p))).pow(2).mean()
15 | else:
16 | return (self.at(f_s) - self.at(f_t)).pow(2).mean()
17 |
18 | def at(self, f):
19 | return F.normalize(f.pow(self.p).mean(1).view(f.size(0), -1))
20 |
21 | class Similarity(nn.Module):
22 | def __init__(self):
23 | super(Similarity, self).__init__()
24 |
25 |
26 | def forward(self, f_s, f_t):
27 | bsz = f_s.shape[0]
28 | f_s = f_s.view(bsz, -1)
29 | f_t = f_t.view(bsz, -1)
30 | G_s = torch.mm(f_s, torch.t(f_s))
31 | G_s = torch.nn.functional.normalize(G_s)
32 |
33 | G_t = torch.mm(f_t, torch.t(f_t))
34 | G_t = torch.nn.functional.normalize(G_t)
35 |
36 | G_diff = G_t - G_s
37 | loss = (G_diff*G_diff).view(-1, 1).sum(0)/(bsz*bsz)
38 | return loss
39 |
40 | class Correlation(nn.Module):
41 | def __init__(self):
42 | super(Correlation, self).__init__()
43 |
44 | def forward(self, f_s, f_t):
45 | delta = torch.abs(f_s-f_t)
46 | loss = torch.mean((delta[:-1]*delta[1:]).sum(1))
47 | return loss
48 |
49 | class NSTLoss(nn.Module):
50 | def __init__(self):
51 | super(NSTLoss, self).__init__()
52 | pass
53 |
54 | def forward(self, f_s, f_t):
55 |
56 | if f_s.dim() == 4:
57 | s_H, t_H = f_s.shape[2], f_t.shape[2]
58 | if s_H > t_H:
59 | f_s = F.adaptive_avg_pool2d(f_s, (t_H, t_H))
60 | elif s_H < t_H:
61 | f_t = F.adaptive_avg_pool2d(f_t, (s_H, s_H))
62 | else:
63 | pass
64 |
65 | f_s = f_s.view(f_s.shape[0], f_s.shape[1], -1)
66 | f_s = F.normalize(f_s, dim=2)
67 | f_t = f_t.view(f_t.shape[0], f_t.shape[1], -1)
68 | f_t = F.normalize(f_t, dim=2)
69 |
70 | elif f_s.dim() == 2:
71 | f_s = F.normalize(f_s, dim=1)
72 | f_t = F.normalize(f_t, dim=1)
73 |
74 | full_loss = True
75 | if full_loss:
76 | return (self.poly_kernel(f_t, f_t).mean().detach() + self.poly_kernel(f_s,f_s).mean() - 2 * self.poly_kernel(f_s, f_t).mean())
77 | else:
78 | return self.poly_kernel(f_s, f_s).mean()
79 |
80 | def poly_kernel(self, a, b):
81 | a = a.unsqueeze(1)
82 | b = b.unsqueeze(2)
83 | res = (a*b).sum(-1).pow(2)
84 | return res
85 |
86 | class RKDLoss(nn.Module):
87 | """Relational Knowledge Disitllation, CVPR2019"""
88 | def __init__(self, w_d=25, w_a=50):
89 | super(RKDLoss, self).__init__()
90 | self.w_d = w_d
91 | self.w_a = w_a
92 |
93 | def forward(self, f_s, f_t):
94 | student = f_s.view(f_s.shape[0], -1)
95 | teacher = f_t.view(f_t.shape[0], -1)
96 |
97 | # RKD distance loss
98 | with torch.no_grad():
99 | t_d = self.pdist(teacher, squared=False)
100 | mean_td = t_d[t_d > 0].mean()
101 | t_d = t_d / mean_td
102 |
103 | d = self.pdist(student, squared=False)
104 | mean_d = d[d > 0].mean()
105 | d = d / mean_d
106 |
107 | loss_d = F.smooth_l1_loss(d, t_d)
108 |
109 | # RKD Angle loss
110 | with torch.no_grad():
111 | td = (teacher.unsqueeze(0) - teacher.unsqueeze(1))
112 | norm_td = F.normalize(td, p=2, dim=2)
113 | t_angle = torch.bmm(norm_td, norm_td.transpose(1, 2)).view(-1)
114 |
115 | sd = (student.unsqueeze(0) - student.unsqueeze(1))
116 | norm_sd = F.normalize(sd, p=2, dim=2)
117 | s_angle = torch.bmm(norm_sd, norm_sd.transpose(1, 2)).view(-1)
118 |
119 | loss_a = F.smooth_l1_loss(s_angle, t_angle)
120 |
121 | loss = self.w_d * loss_d + self.w_a * loss_a
122 |
123 | return loss
124 |
125 | @staticmethod
126 | def pdist(e, squared=False, eps=1e-12):
127 | e_square = e.pow(2).sum(dim=1)
128 | prod = e @ e.t()
129 | res = (e_square.unsqueeze(1) + e_square.unsqueeze(0) - 2 * prod).clamp(min=eps)
130 |
131 | if not squared:
132 | res = res.sqrt()
133 |
134 | res = res.clone()
135 | res[range(len(e)), range(len(e))] = 0
136 | return res
137 |
138 | class PKT(nn.Module):
139 | """Probabilistic Knowledge Transfer for deep representation learning
140 | Code from author: https://github.com/passalis/probabilistic_kt"""
141 | def __init__(self):
142 | super(PKT, self).__init__()
143 |
144 | def forward(self, f_s, f_t):
145 | return self.cosine_similarity_loss(f_s, f_t)
146 |
147 | @staticmethod
148 | def cosine_similarity_loss(output_net, target_net, eps=0.0000001):
149 | # Normalize each vector by its norm
150 | output_net_norm = torch.sqrt(torch.sum(output_net ** 2, dim=1, keepdim=True))
151 | output_net = output_net / (output_net_norm + eps)
152 | output_net[output_net != output_net] = 0
153 |
154 | target_net_norm = torch.sqrt(torch.sum(target_net ** 2, dim=1, keepdim=True))
155 | target_net = target_net / (target_net_norm + eps)
156 | target_net[target_net != target_net] = 0
157 |
158 | # Calculate the cosine similarity
159 | model_similarity = torch.mm(output_net, output_net.transpose(0, 1))
160 | target_similarity = torch.mm(target_net, target_net.transpose(0, 1))
161 |
162 | # Scale cosine similarity to 0..1
163 | model_similarity = (model_similarity + 1.0) / 2.0
164 | target_similarity = (target_similarity + 1.0) / 2.0
165 |
166 | # Transform them into probabilities
167 | model_similarity = model_similarity / torch.sum(model_similarity, dim=1, keepdim=True)
168 | target_similarity = target_similarity / torch.sum(target_similarity, dim=1, keepdim=True)
169 |
170 | # Calculate the KL-divergence
171 | loss = torch.mean(target_similarity * torch.log((target_similarity + eps) / (model_similarity + eps)))
172 |
173 | return loss
174 |
175 | class VIDLoss(nn.Module):
176 | """Variational Information Distillation for Knowledge Transfer (CVPR 2019),
177 | code from author: https://github.com/ssahn0215/variational-information-distillation"""
178 | def __init__(self,
179 | num_input_channels,
180 | num_mid_channel,
181 | num_target_channels,
182 | init_pred_var=5.0,
183 | eps=1e-5):
184 | super(VIDLoss, self).__init__()
185 |
186 | def conv1x1(in_channels, out_channels, stride=1):
187 | return nn.Conv2d(
188 | in_channels, out_channels,
189 | kernel_size=1, padding=0,
190 | bias=False, stride=stride)
191 |
192 | self.regressor = nn.Sequential(
193 | conv1x1(num_input_channels, num_mid_channel),
194 | nn.ReLU(),
195 | conv1x1(num_mid_channel, num_mid_channel),
196 | nn.ReLU(),
197 | conv1x1(num_mid_channel, num_target_channels),
198 | )
199 | self.log_scale = torch.nn.Parameter(
200 | np.log(np.exp(init_pred_var-eps)-1.0) * torch.ones(num_target_channels)
201 | )
202 | self.eps = eps
203 |
204 | def forward(self, input, target):
205 | # pool for dimension match
206 |
207 | # s_H, t_H = input.shape[2], target.shape[2]
208 | # if s_H > t_H:
209 | # input = F.adaptive_avg_pool2d(input, (t_H, t_H))
210 | # elif s_H < t_H:
211 | # target = F.adaptive_avg_pool2d(target, (s_H, s_H))
212 | # else:
213 | # pass
214 | if input.dim() == 2:
215 | input = input.unsqueeze(2).unsqueeze(2)
216 | target = target.unsqueeze(2).unsqueeze(2)
217 |
218 | pred_mean = self.regressor(input)
219 | pred_var = torch.log(1.0+torch.exp(self.log_scale))+self.eps
220 | pred_var = pred_var.view(1, -1, 1, 1)
221 | neg_log_prob = 0.5*(
222 | (pred_mean-target)**2/pred_var+torch.log(pred_var)
223 | )
224 | loss = torch.mean(neg_log_prob)
225 |
226 | return loss
--------------------------------------------------------------------------------
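Note on /distiller_zoo.py: each module takes a student feature tensor and a teacher feature tensor and returns a scalar loss. A minimal usage sketch (shapes are illustrative, not taken from the training scripts):

    import torch
    from distiller_zoo import PKT, RKDLoss

    f_student = torch.randn(8, 128, requires_grad=True)   # student features
    f_teacher = torch.randn(8, 128)                        # teacher features (treated as constant)

    loss = PKT()(f_student, f_teacher) + RKDLoss(w_d=25, w_a=50)(f_student, f_teacher)
    loss.backward()                                        # gradients reach only the student branch
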
/environment.yml:
--------------------------------------------------------------------------------
1 | name: CMP
2 | channels:
3 | - pytorch
4 | - defaults
5 | dependencies:
6 | - _libgcc_mutex=0.1=main
7 | - _openmp_mutex=4.5=1_gnu
8 | - blas=1.0=mkl
9 | - ca-certificates=2021.7.5=h06a4308_1
10 | - certifi=2021.5.30=py38h06a4308_0
11 | - cudatoolkit=10.2.89=hfd86e86_1
12 | - freetype=2.10.4=h5ab3b9f_0
13 | - intel-openmp=2021.3.0=h06a4308_3350
14 | - jpeg=9b=h024ee3a_2
15 | - lcms2=2.12=h3be6417_0
16 | - ld_impl_linux-64=2.35.1=h7274673_9
17 | - libffi=3.3=he6710b0_2
18 | - libgcc-ng=9.3.0=h5101ec6_17
19 | - libgomp=9.3.0=h5101ec6_17
20 | - libpng=1.6.37=hbc83047_0
21 | - libstdcxx-ng=9.3.0=hd4cf53a_17
22 | - libtiff=4.2.0=h85742a9_0
23 | - libuv=1.40.0=h7b6447c_0
24 | - libwebp-base=1.2.0=h27cfd23_0
25 | - lz4-c=1.9.3=h295c915_1
26 | - mkl=2021.3.0=h06a4308_520
27 | - mkl-service=2.4.0=py38h7f8727e_0
28 | - mkl_fft=1.3.0=py38h42c9631_2
29 | - mkl_random=1.2.2=py38h51133e4_0
30 | - ncurses=6.2=he6710b0_1
31 | - ninja=1.10.2=hff7bd54_1
32 | - numpy=1.20.3=py38hf144106_0
33 | - numpy-base=1.20.3=py38h74d4b33_0
34 | - olefile=0.46=py_0
35 | - openjpeg=2.4.0=h3ad879b_0
36 | - openssl=1.1.1l=h7f8727e_0
37 | - pillow=8.3.1=py38h2c7a002_0
38 | - pip=21.0.1=py38h06a4308_0
39 | - python=3.8.11=h12debd9_0_cpython
40 | - pytorch=1.7.1=py3.8_cuda10.2.89_cudnn7.6.5_0
41 | - readline=8.1=h27cfd23_0
42 | - setuptools=52.0.0=py38h06a4308_0
43 | - six=1.16.0=pyhd3eb1b0_0
44 | - sqlite=3.36.0=hc218d9a_0
45 | - tk=8.6.10=hbc83047_0
46 | - torchaudio=0.7.2=py38
47 | - torchvision=0.8.2=py38_cu102
48 | - typing_extensions=3.10.0.0=pyh06a4308_0
49 | - wheel=0.37.0=pyhd3eb1b0_0
50 | - xz=5.2.5=h7b6447c_0
51 | - zlib=1.2.11=h7b6447c_3
52 | - zstd=1.4.9=haebb681_0
53 | - pip:
54 | - cython==0.29.24
55 | - opencv-python==4.5.3.56
56 | - scipy==1.7.1
57 | - pyaudio
58 |
--------------------------------------------------------------------------------
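Note on /environment.yml: the environment can be recreated with "conda env create -f environment.yml" and activated with "conda activate CMP"; pyaudio is installed through pip, so the system PortAudio development headers may be needed for it to build.
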
/eval_sup.py:
--------------------------------------------------------------------------------
1 | # This script is for batch processing testing.
2 |
3 | import os
4 | import glob
5 | import torch
6 | import torchvision.utils as vutils
7 | import webrtcvad
8 | import scipy.io as sio
9 | import csv
10 | import numpy as np
11 |
12 | from mfcc import MFCC
13 | from config import NETWORKS_PARAMETERS
14 | from network import get_network, Generator1D_directMLP
15 | from utils import write_obj_with_colors, voice2face_processed_MeshOut
16 |
17 | # initialization
18 | vad_obj = webrtcvad.Vad(2)
19 | mfc_obj = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025)
20 | e_net, _ = get_network('e', NETWORKS_PARAMETERS, train=False)
21 |
22 | g_net = Generator1D_directMLP().cuda().eval()
23 | g_net_ckpt = torch.load(NETWORKS_PARAMETERS['g']['model_path'])
24 | g_net.load_state_dict(g_net_ckpt)
25 |
26 | # test
27 | voice_list = sorted(glob.glob('data/fbank/*'))
28 | tri = sio.loadmat('./train.configs/tri.mat')['tri']
29 |
30 | id_name = {}
31 | csv_file = open('data/vox1_meta.csv')
32 | rows=csv.reader(csv_file, delimiter=' ')
33 | headers = next(rows)
34 | for row in rows:
35 | id_name.update({row[0]:row[1]})
36 | available_GT = list(map(lambda k: k.rsplit('/',1)[-1], sorted(glob.glob('data/A2E_val/*'))))
37 |
38 | # [TODO] Change this variable to your result output folder
39 | FOLDER_ROOT = 'supervised_output/'
40 |
41 | if not os.path.exists(FOLDER_ROOT):
42 | os.mkdir(FOLDER_ROOT)
43 | coll = []
44 | for folder in voice_list:
45 | index = folder.rsplit('/',1)[-1]
46 | print(index)
47 | if index > 'id10309': # The end of E is 10309
48 | break
49 | corr_name = id_name[index]
50 | if corr_name not in available_GT: # check if the fbank id is in the fitted model database
51 | continue
52 | count = 0
53 |
54 | if not os.path.exists(FOLDER_ROOT+corr_name):
55 | os.mkdir(FOLDER_ROOT + corr_name)
56 |
57 | all_sequences = sorted(glob.glob(folder+'/*'))
58 |
59 | for sequence in all_sequences:
60 | print(sequence)
61 | all_fbanks = sorted(glob.glob(sequence+'/*.npy'))
62 | sequence_name = sequence.rsplit('/',1)[-1]
63 |
64 | for fbank in all_fbanks:
65 | print(fbank)
66 | fbank_name = fbank.rsplit('/',1)[-1][:-4]
67 | prediction = voice2face_processed_MeshOut(e_net, g_net, fbank,NETWORKS_PARAMETERS['GPU']).squeeze().detach().cpu()
68 | save_name = FOLDER_ROOT+ corr_name + '/' + sequence_name + '_' + fbank_name
69 | write_obj_with_colors(save_name+'.obj', prediction, triangles=tri)
70 |
71 | count += 1
72 | # the first three in all the fbank sequences
73 | if count >= 3:
74 | break
75 |
76 | if count >= 3:
77 | break
78 |
79 |
--------------------------------------------------------------------------------
/face_types/.placeholder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/face_types/.placeholder
--------------------------------------------------------------------------------
/gan_train_cascade.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import torch
4 | import torch.nn.functional as F
5 | import torchvision.utils as vutils
6 | import scipy.io as sio
7 |
8 | from torch.utils.data import DataLoader
9 | from config import DATASET_PARAMETERS, NETWORKS_PARAMETERS
10 | from parse_dataset import get_dataset
11 | from network import get_network, SynergyNet
12 | from utils import Meter, cycle, cycle_4, save_model, read_xyz, voice2face_processed, write_obj_with_colors
13 | from distiller_zoo import PKT
14 |
15 | import torch.optim as optim
16 | import glob
17 | import numpy as np
18 | from statistics import mean
19 | import logging
20 | from datetime import datetime
21 |
22 | if not os.path.exists(NETWORKS_PARAMETERS['SAVE_DIR']):
23 | os.makedirs(NETWORKS_PARAMETERS['SAVE_DIR'])
24 | logging.basicConfig(
25 | format='[%(asctime)s] [p%(process)s] [%(pathname)s:%(lineno)d] [%(levelname)s] %(message)s',
26 | level=logging.INFO,
27 | handlers=[
28 | logging.FileHandler(NETWORKS_PARAMETERS['SAVE_DIR']+'/{:%Y-%m-%d-%H-%M-%S}.log'.format(datetime.now()), mode='w'),
29 | logging.StreamHandler()
30 | ]
31 | )
32 | logging.info(f'Save the pth at {NETWORKS_PARAMETERS["SAVE_DIR"]}')
33 |
34 | # dataset and dataloader
35 | print('Parsing your dataset...')
36 | voice_list, face_list, id_class_num = get_dataset(DATASET_PARAMETERS)
37 | NETWORKS_PARAMETERS['c']['output_channel'] = id_class_num
38 |
39 |
40 | print('Preparing the datasets...')
41 | voice_dataset = DATASET_PARAMETERS['voice_dataset'](voice_list,
42 | DATASET_PARAMETERS['nframe_range'])
43 | face_dataset = DATASET_PARAMETERS['face_dataset'](face_list)
44 |
45 | print('Preparing the dataloaders...')
46 | collate_fn = DATASET_PARAMETERS['collate_fn'](DATASET_PARAMETERS['nframe_range'])
47 | collate_fn_4 = DATASET_PARAMETERS['collate_fn_4'](DATASET_PARAMETERS['nframe_range'])
48 | voice_loader = DataLoader(voice_dataset, shuffle=True, drop_last=True,
49 | batch_size=DATASET_PARAMETERS['batch_size'],
50 | num_workers=DATASET_PARAMETERS['workers_num'],
51 | collate_fn=collate_fn_4)
52 | face_loader = DataLoader(face_dataset, shuffle=True, drop_last=True,
53 | batch_size=DATASET_PARAMETERS['batch_size'],
54 | num_workers=DATASET_PARAMETERS['workers_num'])
55 |
56 | voice_iterator = iter(cycle_4(voice_loader))
57 | face_iterator = iter(cycle(face_loader))
58 |
59 | # networks, Fe, Fg, Fd (f+d), Fc (f+c)
60 | print('Initializing networks...')
61 | e_net, e_optimizer = get_network('e', NETWORKS_PARAMETERS, train=False)
62 | g_net, g_optimizer = get_network('g', NETWORKS_PARAMETERS, train=True)
63 | f_net, f_optimizer = get_network('f', NETWORKS_PARAMETERS, train=True)
64 | d_net, d_optimizer = get_network('d', NETWORKS_PARAMETERS, train=True)
65 | c_net, c_optimizer = get_network('c', NETWORKS_PARAMETERS, train=True)
66 |
67 | # for image to 3D part
68 | image3D_pretrained = SynergyNet(pretrained=True).cuda().eval()
69 | image3D = SynergyNet().cuda()
70 | up_layer = torch.nn.Upsample((120,120), mode='bilinear', align_corners=True)
71 | dis_optimizer = optim.Adam(image3D.parameters(), lr=0.0002, betas=(0.5, 0.999))
72 | g_optimizer = optim.Adam(list(g_net.parameters())+list(image3D.parameters()), lr=0.0002, betas=(0.5, 0.999))
73 | voice_list = sorted(glob.glob('data/val_sub/*'))
74 | tri = sio.loadmat('./train.configs/tri.mat')['tri']
75 |
76 | # distiller zoo- we use PKT here; refer to the zoo for more options.
77 | distiller = PKT()
78 | tripLoss = torch.nn.TripletMarginLoss()
79 |
80 | # label for real/fake faces
81 | real_label = torch.full((DATASET_PARAMETERS['batch_size'], 1), 1).float()
82 | fake_label = torch.full((DATASET_PARAMETERS['batch_size'], 1), 0).float()
83 |
84 | # Meters for recording the training status
85 | iteration = Meter('Iter', 'sum', ':5d')
86 | data_time = Meter('Data', 'sum', ':4.2f')
87 | batch_time = Meter('Time', 'sum', ':4.2f')
88 | D_real = Meter('D_real', 'avg', ':3.2f')
89 | D_fake = Meter('D_fake', 'avg', ':3.2f')
90 | C_real = Meter('C_real', 'avg', ':3.2f')
91 | GD_fake = Meter('G_D_fake', 'avg', ':3.2f')
92 | GC_fake = Meter('G_C_fake', 'avg', ':3.2f')
93 | Distill = Meter('Distill', 'avg', ':3.2f')
94 | Trip = Meter('Triplet', 'avg', ':3.2f')
95 |
96 | # Validation point set
97 |
98 | print('Training models...')
99 | for it in range(50000):
100 | # data
101 | start_time = time.time()
102 |
103 | voice, voice_label, voice_p, voice_n = next(voice_iterator)
104 | face, face_label = next(face_iterator)
105 | noise = 0.05*torch.randn(DATASET_PARAMETERS['batch_size'], 64, 1, 1)
106 |
107 | # use GPU or not
108 | if NETWORKS_PARAMETERS['GPU']:
109 | voice, voice_label = voice.cuda(), voice_label.cuda()
110 | face, face_label = face.cuda(), face_label.cuda()
111 | real_label, fake_label = real_label.cuda(), fake_label.cuda()
112 | noise = noise.cuda()
113 | voice_p, voice_n = voice_p.cuda(), voice_n.cuda()
114 | data_time.update(time.time() - start_time)
115 |
116 | # get embeddings and generated faces
117 | embeddings = e_net(voice)
118 | embeddings = F.normalize(embeddings)
119 | # introduce small perturbations
120 | embeddings = embeddings + noise
121 | embeddings = F.normalize(embeddings)
122 | fake = g_net(embeddings)
123 |
124 | # get embeddings and generated faces
125 | embeddings_p = e_net(voice_p)
126 | embeddings_p = F.normalize(embeddings_p)
127 | # introduce small perturbations
128 | embeddings_p = embeddings_p + noise
129 | embeddings_p = F.normalize(embeddings_p)
130 | fake_p = g_net(embeddings_p)
131 |
132 | # get embeddings and generated faces
133 | embeddings_n = e_net(voice_n)
134 | embeddings_n = F.normalize(embeddings_n)
135 | # introduce small perturbations
136 | embeddings_n = embeddings_n + noise
137 | embeddings_n = F.normalize(embeddings_n)
138 | fake_n = g_net(embeddings_n)
139 |
140 | # Discriminator
141 | f_optimizer.zero_grad()
142 | d_optimizer.zero_grad()
143 | c_optimizer.zero_grad()
144 | real_score_out = d_net(f_net(face))
145 | fake_score_out = d_net(f_net(fake.detach()))
146 | real_label_out = c_net(f_net(face))
147 | D_real_loss = F.binary_cross_entropy(torch.sigmoid(real_score_out), real_label)
148 | D_fake_loss = F.binary_cross_entropy(torch.sigmoid(fake_score_out), fake_label)
149 | C_real_loss = F.nll_loss(F.log_softmax(real_label_out, 1), face_label)
150 | D_real.update(D_real_loss.item())
151 | D_fake.update(D_fake_loss.item())
152 | C_real.update(C_real_loss.item())
153 | (D_real_loss + D_fake_loss + C_real_loss).backward()
154 | f_optimizer.step()
155 | d_optimizer.step()
156 | c_optimizer.step()
157 |
158 |
159 | ## Joint training
160 | g_optimizer.zero_grad()
161 | fake_score_out = d_net(f_net(fake))
162 | fake_label_out = c_net(f_net(fake))
163 | face_image = up_layer(fake)
164 | face_image_p = up_layer(fake_p)
165 | face_image_n = up_layer(fake_n)
166 | prediction_pre, pool_pre, inter_pre = image3D_pretrained(face_image, return_interFeature=True)
167 | prediction, pool, inter = image3D(face_image, return_interFeature=True)
168 | prediction_p = image3D(face_image_p)
169 | prediction_n = image3D(face_image_n)
170 |
171 | GD_fake_loss = F.binary_cross_entropy(torch.sigmoid(fake_score_out), real_label)
172 | GC_fake_loss = F.nll_loss(F.log_softmax(fake_label_out, 1), voice_label)
173 | # distillation loss
174 | distill_loss = 0.5 * F.mse_loss(prediction_pre, prediction) + 10000*(distiller(pool_pre, pool) + distiller(inter_pre.view(inter_pre.shape[0],-1), inter.view(inter.shape[0],-1)))
175 | # triplet loss
176 | triplet_loss = 1.5 * tripLoss(prediction, prediction_p, prediction_n)
177 | (GD_fake_loss + GC_fake_loss + distill_loss + triplet_loss).backward()
178 | GD_fake.update(GD_fake_loss.item())
179 | GC_fake.update(GC_fake_loss.item())
180 | Distill.update(distill_loss.item())
181 | Trip.update(triplet_loss.item())
182 | g_optimizer.step()
183 |
184 | batch_time.update(time.time() - start_time)
185 |
186 | # print status
187 | if it % 2000 == 0:
188 | msg = str(iteration)+str(data_time)+str(batch_time)+str(D_real)+str(D_fake)+str(C_real)+str(GD_fake)+str(GC_fake)+str(Distill)+str(Trip)
189 |
190 | logging.info(msg)
191 |
192 | data_time.reset()
193 | batch_time.reset()
194 | D_real.reset()
195 | D_fake.reset()
196 | C_real.reset()
197 | GD_fake.reset()
198 | GC_fake.reset()
199 | Distill.reset()
200 | Trip.reset()
201 |
202 | e_net.eval()
203 | g_net.eval()
204 | image3D.eval()
205 | fore_err, cheek_err, ear_err, mid_err = [],[],[],[]
206 |
207 | for folder in voice_list:
208 | name = folder.rsplit('/',1)[-1]
209 | all_fbanks = glob.glob(folder+'/*.npy')
210 | target_pts = read_xyz(glob.glob('data/AtoE_sub/'+name+'/*.xyz')[0])
211 |
212 | target_OICD = np.linalg.norm(target_pts[2217]-target_pts[14607])
213 | target_foreD = np.linalg.norm(target_pts[1678]-target_pts[42117])
214 | target_cheekD = np.linalg.norm(target_pts[2294]-target_pts[13635])
215 | target_earD = np.linalg.norm(target_pts[20636]-target_pts[34153])
216 | target_midD = np.linalg.norm(target_pts[2130]-target_pts[15003])
217 |
218 | target_foreOICD = target_foreD/target_OICD
219 | target_cheekOICD = target_cheekD/target_OICD
220 | target_earOICD = target_earD/target_OICD
221 | target_midOICD = target_midD/target_OICD
222 |
223 | for fbank in all_fbanks:
224 | face_image = voice2face_processed(e_net, g_net, fbank,NETWORKS_PARAMETERS['GPU'])
225 | face_image = up_layer(face_image)
226 | pred_pts = image3D(face_image)[0].squeeze().transpose(1,0).detach().cpu()
227 |
228 | # simple validation
229 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607])
230 | pred_pts *= (target_OICD/pred_OICD)
231 | pred_OICD = np.linalg.norm(pred_pts[2217]-pred_pts[14607])
232 | pred_midD = np.linalg.norm(pred_pts[2130]-pred_pts[15003])
233 | pred_foreD = np.linalg.norm(pred_pts[1678]-pred_pts[42117])
234 | pred_cheekD = np.linalg.norm(pred_pts[2294]-pred_pts[13635])
235 | pred_earD = np.linalg.norm(pred_pts[20636]-pred_pts[34153])
236 |
237 | pred_midOICD = pred_midD/pred_OICD
238 | pred_foreOICD = pred_foreD/pred_OICD
239 | pred_cheekOICD = pred_cheekD/pred_OICD
240 | pred_earOICD = pred_earD/pred_OICD
241 |
242 | fore_err.append(abs(pred_foreOICD-target_foreOICD))
243 | cheek_err.append(abs(pred_cheekOICD-target_cheekOICD))
244 | ear_err.append(abs(pred_earOICD-target_earOICD))
245 | mid_err.append(abs(pred_midOICD-target_midOICD))
246 |
247 | fore_err_mean, cheek_err_mean, ear_err_mean, mid_err_mean = mean(fore_err), mean(cheek_err), mean(ear_err), mean(mid_err)
248 | val_msg = f'Val forehead: {fore_err_mean:.4f}, cheek: {cheek_err_mean:.4f}, ear: {ear_err_mean:.4f}, mid: {mid_err_mean:.4f}'
249 |
250 | logging.info(val_msg)
251 |
252 | # reset to train
253 | e_net.train()
254 | g_net.train()
255 | image3D.train()
256 |
257 | # snapshot
258 | save_model(g_net, NETWORKS_PARAMETERS['g']['model_path'][:-4]+'_'+str(it)+'.pth')
259 | save_model(image3D, NETWORKS_PARAMETERS['image3D']['model_path'][:-4]+'_'+str(it)+'.pth')
260 |
261 | iteration.update(1)
262 |
263 |
--------------------------------------------------------------------------------
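Note on /gan_train_cascade.py: as written above, the joint generator/3D-branch step minimizes

    L_G = L_GD + L_GC + 0.5 * MSE(prediction_pre, prediction)
          + 10^4 * ( PKT(pool_pre, pool) + PKT(inter_pre, inter) )
          + 1.5 * TripletMargin(prediction, prediction_p, prediction_n)

where L_GD is the adversarial term, L_GC the identity-classification term, the PKT/MSE terms distill from the pretrained SynergyNet, and the triplet term uses the positive/negative voice samples; the weights 0.5, 10^4, and 1.5 are hard-coded in the script.
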
/mfcc.py:
--------------------------------------------------------------------------------
1 | """ This code is from
2 | https://github.com/skerit/cmusphinx/blob/master/SphinxTrain/python/cmusphinx/mfcc.py
3 | We fix some bugs and modify the pre-emphasis
4 | """
5 |
6 | # Copyright (c) 2006 Carnegie Mellon University
7 | #
8 | # You may copy and modify this freely under the same terms as
9 | # Sphinx-III
10 |
11 | """Compute MFCC coefficients.
12 |
13 | This module provides functions for computing MFCC (mel-frequency
14 | cepstral coefficients) as used in the Sphinx speech recognition
15 | system.
16 | """
17 |
18 | __author__ = "David Huggins-Daines "
19 | __version__ = "$Revision$"
20 |
21 |
22 |
23 | import numpy, numpy.fft
24 |
25 | def mel(f):
26 | return 2595. * numpy.log10(1. + f / 700.)
27 |
28 | def melinv(m):
29 | return 700. * (numpy.power(10., m / 2595.) - 1.)
30 |
31 | class MFCC(object):
32 | def __init__(self, nfilt=40, ncep=13,
33 | lowerf=133.3333, upperf=6855.4976, alpha=0.97,
34 | samprate=16000, frate=100, wlen=0.0256,
35 | nfft=512):
36 | # Store parameters
37 | self.lowerf = lowerf
38 | self.upperf = upperf
39 | self.nfft = nfft
40 | self.ncep = ncep
41 | self.nfilt = nfilt
42 | self.frate = frate
43 | self.fshift = float(samprate) / frate
44 |
45 | # Build Hamming window
46 | self.wlen = int(wlen * samprate)
47 | self.win = numpy.hamming(self.wlen)
48 |
49 | # Prior sample for pre-emphasis
50 | self.prior = 0
51 | self.alpha = alpha
52 |
53 | # Build mel filter matrix
54 | self.filters = numpy.zeros((int(nfft/2)+1,nfilt), 'd')
55 | dfreq = float(samprate) / nfft
56 | if upperf > samprate/2:
57 | raise Exception(
58 | "Upper frequency %f exceeds Nyquist %f" % (upperf, samprate/2))
59 | melmax = mel(upperf)
60 | melmin = mel(lowerf)
61 | dmelbw = (melmax - melmin) / (nfilt + 1)
62 | # Filter edges, in Hz
63 | filt_edge = melinv(melmin + dmelbw * numpy.arange(nfilt + 2, dtype='d'))
64 |
65 | for whichfilt in range(0, nfilt):
66 | # Filter triangles, in DFT points
67 | leftfr = int(round(filt_edge[whichfilt] / dfreq))
68 | centerfr = int(round(filt_edge[whichfilt + 1] / dfreq))
69 | rightfr = int(round(filt_edge[whichfilt + 2] / dfreq))
70 | # For some reason this is calculated in Hz, though I think
71 | # it doesn't really matter
72 | fwidth = (rightfr - leftfr) * dfreq
73 | height = 2. / fwidth
74 |
75 | if centerfr != leftfr:
76 | leftslope = height / (centerfr - leftfr)
77 | else:
78 | leftslope = 0
79 | freq = leftfr + 1
80 | while freq < centerfr:
81 | self.filters[freq,whichfilt] = (freq - leftfr) * leftslope
82 | freq = freq + 1
83 | if freq == centerfr: # This is always true
84 | self.filters[freq,whichfilt] = height
85 | freq = freq + 1
86 | if centerfr != rightfr:
87 | rightslope = height / (centerfr - rightfr)
88 | while freq < rightfr:
89 | self.filters[freq,whichfilt] = (freq - rightfr) * rightslope
90 | freq = freq + 1
91 |
92 | # Build DCT matrix
93 | self.s2dct = s2dctmat(nfilt, ncep, 1./nfilt)
94 | self.dct = dctmat(nfilt, ncep, numpy.pi/nfilt)
95 |
96 | def sig2s2mfc(self, sig):
97 | nfr = int(len(sig) / self.fshift + 1)
98 | mfcc = numpy.zeros((nfr, self.ncep), 'd')
99 | fr = 0
100 | while fr < nfr:
101 | start = round(fr * self.fshift)
102 | end = min(len(sig), start + self.wlen)
103 | frame = sig[start:end]
104 | if len(frame) < self.wlen:
105 | frame = numpy.resize(frame,self.wlen)
106 | frame[self.wlen:] = 0
107 | mfcc[fr] = self.frame2s2mfc(frame)
108 | fr = fr + 1
109 | return mfcc
110 |
111 | def sig2logspec(self, sig):
112 | nfr = int(len(sig) / self.fshift + 1)
113 | mfcc = numpy.zeros((nfr, self.nfilt), 'd')
114 | fr = 0
115 | while fr < nfr:
116 | start = round(fr * self.fshift)
117 | end = min(len(sig), start + self.wlen)
118 | frame = sig[start:end]
119 | if len(frame) < self.wlen:
120 | frame = numpy.resize(frame,self.wlen)
121 | frame[self.wlen:] = 0
122 | mfcc[fr] = self.frame2logspec(frame)
123 | fr = fr + 1
124 | return mfcc
125 |
126 | def pre_emphasis(self, frame):
127 | '''
128 | # FIXME: Do this with matrix multiplication
129 | outfr = numpy.empty(len(frame), 'd')
130 | outfr[0] = frame[0] - self.alpha * self.prior
131 | for i in range(1,len(frame)):
132 | outfr[i] = frame[i] - self.alpha * frame[i-1]
133 | self.prior = frame[-1]
134 | '''
135 | # NOTE: slightly different pre-emphasis for speed up
136 | frame = numpy.insert(frame, 0, self.prior)
137 | self.prior = frame[-1]
138 | return frame[1:] - self.alpha * frame[:-1]
139 |
140 | def frame2logspec(self, frame):
141 | frame = self.pre_emphasis(frame) * self.win
142 | fft = numpy.fft.rfft(frame, self.nfft)
143 | # Square of absolute value
144 | power = fft.real * fft.real + fft.imag * fft.imag
145 | return numpy.log(numpy.dot(power, self.filters).clip(1e-5,numpy.inf))
146 |
147 | def frame2s2mfc(self, frame):
148 | logspec = self.frame2logspec(frame)
149 | return numpy.dot(logspec, self.s2dct.T) / self.nfilt
150 |
151 | def s2dctmat(nfilt,ncep,freqstep):
152 | """Return the 'legacy' not-quite-DCT matrix used by Sphinx"""
153 | melcos = numpy.empty((ncep, nfilt), 'double')
154 | for i in range(0,ncep):
155 | freq = numpy.pi * float(i) / nfilt
156 | melcos[i] = numpy.cos(freq * numpy.arange(0.5, float(nfilt)+0.5, 1.0, 'double'))
157 | melcos[:,0] = melcos[:,0] * 0.5
158 | return melcos
159 |
160 | def logspec2s2mfc(logspec, ncep=13):
161 | """Convert log-power-spectrum bins to MFCC using the 'legacy'
162 | Sphinx transform"""
163 | nframes, nfilt = logspec.shape
164 | melcos = s2dctmat(nfilt, ncep, 1./nfilt)
165 | return numpy.dot(logspec, melcos.T) / nfilt
166 |
167 | def dctmat(N,K,freqstep,orthogonalize=True):
168 | """Return the orthogonal DCT-II/DCT-III matrix of size NxK.
169 | For computing or inverting MFCCs, N is the number of
170 | log-power-spectrum bins while K is the number of cepstra."""
171 | cosmat = numpy.zeros((N, K), 'double')
172 | for n in range(0,N):
173 | for k in range(0, K):
174 | cosmat[n,k] = numpy.cos(freqstep * (n + 0.5) * k)
175 | if orthogonalize:
176 | cosmat[:,0] = cosmat[:,0] * 1./numpy.sqrt(2)
177 | return cosmat
178 |
179 | def dct(input, K=13):
180 | """Convert log-power-spectrum to MFCC using the orthogonal DCT-II"""
181 | nframes, N = input.shape
182 | freqstep = numpy.pi / N
183 | cosmat = dctmat(N,K,freqstep)
184 | return numpy.dot(input, cosmat) * numpy.sqrt(2.0 / N)
185 |
186 | def dct2(input, K=13):
187 | """Convert log-power-spectrum to MFCC using the normalized DCT-II"""
188 | nframes, N = input.shape
189 | freqstep = numpy.pi / N
190 | cosmat = dctmat(N,K,freqstep,False)
191 | return numpy.dot(input, cosmat) * (2.0 / N)
192 |
193 | def idct(input, K=40):
194 | """Convert MFCC to log-power-spectrum using the orthogonal DCT-III"""
195 | nframes, N = input.shape
196 | freqstep = numpy.pi / K
197 | cosmat = dctmat(K,N,freqstep).T
198 | return numpy.dot(input, cosmat) * numpy.sqrt(2.0 / K)
199 |
200 | def dct3(input, K=40):
201 | """Convert MFCC to log-power-spectrum using the unnormalized DCT-III"""
202 | nframes, N = input.shape
203 | freqstep = numpy.pi / K
204 | cosmat = dctmat(K,N,freqstep,False)
205 | cosmat[:,0] = cosmat[:,0] * 0.5
206 | return numpy.dot(input, cosmat.T)
207 |
--------------------------------------------------------------------------------
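Note on /mfcc.py: sig2logspec is the entry point used elsewhere in this repo to turn a waveform into the 64-bin log mel-spectrogram. A minimal sketch (assuming a 16 kHz mono audio.wav, with the same parameters as demo_mic.py):

    from scipy.io import wavfile
    from mfcc import MFCC

    rate, signal = wavfile.read('audio.wav')              # int16 samples at 16 kHz
    mfc = MFCC(nfilt=64, lowerf=20., upperf=7200., samprate=16000, nfft=1024, wlen=0.025)
    fbank = mfc.sig2logspec(signal).astype('float32')     # shape: (num_frames, 64)
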
/network.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 | import torch.nn.functional as F
5 | from backbone_nets import mobilenetv2_backbone
6 |
7 | class VoiceEmbedNet(nn.Module):
8 | def __init__(self, input_channel, channels, output_channel):
9 | super(VoiceEmbedNet, self).__init__()
10 | self.model = nn.Sequential(
11 | nn.Conv1d(input_channel, channels[0], 3, 2, 1, bias=False),
12 | nn.BatchNorm1d(channels[0], affine=True),
13 | nn.ReLU(inplace=True),
14 | nn.Conv1d(channels[0], channels[1], 3, 2, 1, bias=False),
15 | nn.BatchNorm1d(channels[1], affine=True),
16 | nn.ReLU(inplace=True),
17 | nn.Conv1d(channels[1], channels[2], 3, 2, 1, bias=False),
18 | nn.BatchNorm1d(channels[2], affine=True),
19 | nn.ReLU(inplace=True),
20 | nn.Conv1d(channels[2], channels[3], 3, 2, 1, bias=False),
21 | nn.BatchNorm1d(channels[3], affine=True),
22 | nn.ReLU(inplace=True),
23 | nn.Conv1d(channels[3], output_channel, 3, 2, 1, bias=True),
24 | )
25 |
26 | def forward(self, x):
27 | x = self.model(x)
28 | x = F.avg_pool1d(x, x.size()[2], stride=1)
29 | x = x.view(x.size()[0], -1, 1, 1)
30 | return x
31 |
32 | class Generator(nn.Module):
33 | def __init__(self, input_channel, channels, output_channel):
34 | super(Generator, self).__init__()
35 | self.model = nn.Sequential(
36 | nn.ConvTranspose2d(input_channel, channels[0], 4, 1, 0, bias=True),
37 | nn.ReLU(inplace=True),
38 | nn.ConvTranspose2d(channels[0], channels[1], 4, 2, 1, bias=True),
39 | nn.ReLU(inplace=True),
40 | nn.ConvTranspose2d(channels[1], channels[2], 4, 2, 1, bias=True),
41 | nn.ReLU(inplace=True),
42 | nn.ConvTranspose2d(channels[2], channels[3], 4, 2, 1, bias=True),
43 | nn.ReLU(inplace=True),
44 | nn.ConvTranspose2d(channels[3], channels[4], 4, 2, 1, bias=True),
45 | nn.ReLU(inplace=True),
46 | nn.ConvTranspose2d(channels[4], output_channel, 1, 1, 0, bias=True),
47 | )
48 | def forward(self, x):
49 | x = self.model(x)
50 | return x
51 |
52 | class FaceEmbedNet(nn.Module):
53 | def __init__(self, input_channel, channels, output_channel):
54 | super(FaceEmbedNet, self).__init__()
55 | self.model = nn.Sequential(
56 | nn.Conv2d(input_channel, channels[0], 1, 1, 0, bias=True),
57 | nn.LeakyReLU(0.2, inplace=True),
58 | nn.Conv2d(channels[0], channels[1], 4, 2, 1, bias=True),
59 | nn.LeakyReLU(0.2, inplace=True),
60 | nn.Conv2d(channels[1], channels[2], 4, 2, 1, bias=True),
61 | nn.LeakyReLU(0.2, inplace=True),
62 | nn.Conv2d(channels[2], channels[3], 4, 2, 1, bias=True),
63 | nn.LeakyReLU(0.2, inplace=True),
64 | nn.Conv2d(channels[3], channels[4], 4, 2, 1, bias=True),
65 | nn.LeakyReLU(0.2, inplace=True),
66 | nn.Conv2d(channels[4], output_channel, 4, 1, 0, bias=True),
67 | )
68 |
69 | def forward(self, x):
70 | x = self.model(x)
71 | return x
72 |
73 | class Classifier(nn.Module):
74 | def __init__(self, input_channel, channels, output_channel):
75 | super(Classifier, self).__init__()
76 | self.model = nn.Linear(input_channel, output_channel, bias=False)
77 |
78 | def forward(self, x):
79 | x = x.view(x.size()[0], -1)
80 | x = self.model(x)
81 | return x
82 |
83 | def get_network(net_type, params, train=True):
84 | net_params = params[net_type]
85 | net = net_params['network'](net_params['input_channel'],
86 | net_params['channels'],
87 | net_params['output_channel'])
88 | if params['GPU']:
89 | net.cuda()
90 |
91 | if train:
92 | net.train()
93 | optimizer = optim.Adam(net.parameters(),
94 | lr=params['lr'],
95 | betas=(params['beta1'], params['beta2']))
96 | else:
97 | net.eval()
98 | net.load_state_dict(torch.load(net_params['model_path']))
99 | optimizer = None
100 | return net, optimizer
101 |
102 | # SynergyNet module definition
103 | class SynergyNet(nn.Module):
104 | '''Definition of the 2D-to-3D part'''
105 | def __init__(self, pretrained=False, last_CN=None):
106 | super(SynergyNet, self).__init__()
107 | self.backbone = getattr(mobilenetv2_backbone, 'mobilenet_v2')(last_CN=last_CN)
108 |
109 | # load the pretrained model for 2D-to-3D
110 | ckpt = torch.load('pretrained_models/2D-to-3D-pretrained.tar')['state_dict']
111 | model_dict = self.backbone.state_dict()
112 | for k,v in ckpt.items():
113 | if 'IGM' in k:
114 | name_reduced = k.split('.',3)[-1]
115 | model_dict[name_reduced] = v
116 |
117 | if pretrained: # SynergyNet pretrain
118 | self.backbone.load_state_dict(model_dict)
119 |
120 | # 3DMM parameters and whitening parameters
121 | self.param_std = ckpt['module.param_std']
122 | self.param_mean = ckpt['module.param_mean']
123 | self.w_shp = ckpt['module.w_shp']
124 | self.w_exp = ckpt['module.w_exp']
125 | self.u = ckpt['module.u'].unsqueeze(0)
126 |
127 | def forward(self, input, return_onlypose=False, return_interFeature=False):
128 | _3D_attr, pool_x, inter = self.backbone(input)
129 | if return_onlypose:
130 | # only return pose
131 | return _3D_attr[:,:12] * self.param_std[:12] + self.param_mean[:12]
132 | else:
133 | # return dense mesh face
134 | _3D_face = self.reconstruct_vertex(_3D_attr, dense=True)
135 | if return_interFeature:
136 | return _3D_face, pool_x, inter
137 | return _3D_face
138 |
139 | def reconstruct_vertex(self, param, whitening=True, dense=False):
140 | '''
141 | Whitening param -> 3d vertex, based on the 3dmm param: u_base, w_shp, w_exp
142 | dense: if True, return dense vertex, else return 68 sparse landmarks.
143 | Working with batched tensors. Using Fortran-type reshape.
144 | '''
145 | # 12 transformation + 40 shape + 10 expr + 40 (discarded) texture
146 | if whitening:
147 | if param.shape[1] == 102:
148 | param_ = param * self.param_std + self.param_mean
149 | else:
150 | raise RuntimeError('length of params mismatch')
151 | p, _, alpha_shp, alpha_exp = self.parse_param_102(param_)
152 | _, s = self.p_to_Rs(p)
153 |
154 | # frontal mesh construction with 53215 vertices (BFM face)
155 | if dense:
156 | vertex = s.unsqueeze(1).unsqueeze(1)*(self.u + self.w_shp @ alpha_shp + self.w_exp @ alpha_exp).squeeze().contiguous().view(-1, 53215, 3).transpose(1,2)
157 | else:
158 | raise NotImplementedError("Only dense mesh reconstruction supported")
159 |
160 | return vertex
161 |
162 | def parse_param_102(self, param):
163 | ''' Parse param into 3DMM semantics'''
164 | p_ = param[:, :12].reshape(-1, 3, 4)
165 | p = p_[:, :, :3]
166 | offset = p_[:, :, -1].reshape(-1, 3, 1)
167 | alpha_shp = param[:, 12:52].reshape(-1, 40, 1)
168 | alpha_exp = param[:, 52:62].reshape(-1, 10, 1)
169 | return p, offset, alpha_shp, alpha_exp
170 |
171 | def parse_param_102_pose(self, param):
172 | ''' Parse only pose params'''
173 | p_ = param[:, :12].reshape(-1, 3, 4)
174 | p = p_[:, :, :3]
175 | R, s = self.p_to_Rs(p)
176 | offset = p_[:, :, -1].reshape(-1, 3, 1)
177 | return R, offset
178 |
179 | def p_to_Rs(self, R):
180 | '''Convert P to R and s as in 3DDFA-V2'''
181 | s = (R[:, 0, :3].norm(dim=1) + R[:, 1, :3].norm(dim=1))/2.0
182 | return F.normalize(R, p=2, dim=2), s
183 |
184 | class Generator1D_directMLP(nn.Module):
185 | def __init__(self):
186 | super(Generator1D_directMLP, self).__init__()
187 |
188 | # building classifier
189 | self.num_scale = 1
190 | self.num_shape = 40
191 | self.num_exp = 10
192 | self.last_channel = 64
193 |
194 | self.classifier_scale = nn.Sequential(
195 | nn.Dropout(0.2),
196 | nn.Linear(self.last_channel, self.num_scale),
197 | )
198 | self.classifier_shape = nn.Sequential(
199 | nn.Dropout(0.2),
200 | nn.Linear(self.last_channel, self.num_shape),
201 | )
202 | self.classifier_exp = nn.Sequential(
203 | nn.Dropout(0.2),
204 | nn.Linear(self.last_channel, self.num_exp),
205 | )
206 |
207 | ckpt = torch.load('pretrained_models/2D-to-3D-pretrained.tar')['state_dict']
208 | print('Loading whitening parameters from: pretrained_models/2D-to-3D-pretrained.tar')
209 | self.param_std = ckpt['module.param_std']
210 | self.param_mean = ckpt['module.param_mean']
211 | self.w_shp = ckpt['module.w_shp']
212 | self.w_exp = ckpt['module.w_exp']
213 | self.u = ckpt['module.u'].unsqueeze(0)
214 |
215 | def forward_test(self, x):
216 | """return mesh
217 | """
218 | x = x.reshape(x.shape[0], -1)
219 | x_scale = self.classifier_scale(x)
220 | x_shape = self.classifier_shape(x)
221 | x_exp = self.classifier_exp(x)
222 | _3D_attr = torch.cat((x_scale, x_shape, x_exp), dim=1)
223 | _3D_face = self.reconstruct_vertex_51_onlyDeform(_3D_attr, dense=True)
224 | return _3D_face
225 |
226 | def forward_test_param(self, x):
227 | """return 3dmm parameters
228 | """
229 | x = x.reshape(x.shape[0], -1)
230 | x_scale = self.classifier_scale(x)
231 | x_shape = self.classifier_shape(x)
232 | x_exp = self.classifier_exp(x)
233 | _3D_attr = torch.cat((x_scale, x_shape, x_exp), dim=1)
234 | return _3D_attr
235 |
236 | def reconstruct_vertex_51_onlyDeform(self, param, whitening=True, dense=False):
237 | """51 = 1 (scale) + 40 (shape) + 10 (expr)
238 | """
239 | if whitening:
240 | if param.shape[1] == 51: # manually mine out whitening params for scale
241 | s = (param[:, 0]*1.538597731841497e-05) + 0.0005920184194110334
242 | param_ = param[:, 1:] * self.param_std[12:62] + self.param_mean[12:62]
243 | else:
244 | raise RuntimeError('length of params mismatch')
245 | alpha_shp, alpha_exp = self.parse_param_50(param_)
246 | if dense:
247 | # since we are predicting 3D face from speech
248 | # only use scale, do not use rotation nor translation
249 | vertex = s.unsqueeze(1).unsqueeze(1)*(self.u + self.w_shp @ alpha_shp + self.w_exp @ alpha_exp).squeeze().contiguous().view(-1, 53215, 3).transpose(1,2)
250 | return vertex
251 |
252 | def parse_param_50(self, param):
253 | """Work for only tensor"""
254 | alpha_shp = param[:, :40].reshape(-1, 40, 1)
255 | alpha_exp = param[:, 40:50].reshape(-1, 10, 1)
256 | return alpha_shp, alpha_exp
--------------------------------------------------------------------------------
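Note on /network.py: the 102-dim 3DMM parameter vector used by SynergyNet decomposes into 12 pose + 40 shape + 10 expression + 40 (discarded) texture parameters. A small sketch of the parsing helper (illustrative only; it assumes pretrained_models/2D-to-3D-pretrained.tar is present and a CUDA device is available, as in the other scripts here):

    import torch
    from network import SynergyNet

    net = SynergyNet(pretrained=False).cuda().eval()
    param = torch.zeros(1, 102).cuda()                 # dummy whitened parameter vector
    p, offset, alpha_shp, alpha_exp = net.parse_param_102(param)
    # p: (1, 3, 3) rotation part, offset: (1, 3, 1), alpha_shp: (1, 40, 1), alpha_exp: (1, 10, 1)
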
/parse_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | def parse_metafile(meta_file):
4 | with open(meta_file, 'r') as f:
5 | lines = f.readlines()[1:]
6 | celeb_ids = {}
7 | for line in lines:
8 | ID, name, _, _, _ = line.rstrip().split('\t')
9 | celeb_ids[ID] = name
10 | return celeb_ids
11 |
12 | def get_labels(voice_list, face_list):
13 | voice_names = {item['name'] for item in voice_list}
14 | face_names = {item['name'] for item in face_list}
15 | names = voice_names & face_names
16 |
17 | voice_list = [item for item in voice_list if item['name'] in names]
18 | face_list = [item for item in face_list if item['name'] in names]
19 |
20 | names = sorted(list(names))
21 | label_dict = dict(zip(names, range(len(names))))
22 | for item in voice_list+face_list:
23 | item['label_id'] = label_dict[item['name']]
24 | return voice_list, face_list, len(names)
25 |
26 |
27 | def get_dataset_files(data_dir, data_ext, celeb_ids, split):
28 | data_list = []
29 | # read data directory
30 | for root, dirs, filenames in os.walk(data_dir):
31 | for filename in filenames:
32 | if filename.endswith(data_ext):
33 | filepath = os.path.join(root, filename)
34 | # so hacky, be careful!
35 | folder = filepath[len(data_dir):].split('/')[1]
36 | celeb_name = celeb_ids.get(folder, folder)
37 | if celeb_name.startswith(tuple(split)):
38 | data_list.append({'filepath': filepath, 'name': celeb_name})
39 | return data_list
40 |
41 | def get_dataset(data_params):
42 | celeb_ids = parse_metafile(data_params['meta_file'])
43 |
44 | voice_list = get_dataset_files(data_params['voice_dir'],
45 | data_params['voice_ext'],
46 | celeb_ids,
47 | data_params['split'])
48 | face_list = get_dataset_files(data_params['face_dir'],
49 | data_params['face_ext'],
50 | celeb_ids,
51 | data_params['split'])
52 | return get_labels(voice_list, face_list)
53 |
54 |
--------------------------------------------------------------------------------
/pyaudio_recording.py:
--------------------------------------------------------------------------------
1 | import wave
2 | from dataclasses import dataclass, asdict
3 |
4 | import pyaudio
5 |
6 |
7 | @dataclass
8 | class StreamParams:
9 | format: int = pyaudio.paInt16
10 | channels: int = 2
11 | rate: int = 44100
12 | frames_per_buffer: int = 1024
13 | input: bool = True
14 | output: bool = False
15 |
16 | def to_dict(self) -> dict:
17 | return asdict(self)
18 |
19 |
20 | class Recorder:
21 | """Recorder uses the blocking I/O facility from pyaudio to record sound
22 | from mic.
23 | Attributes:
24 | - stream_params: StreamParams object with values for pyaudio Stream
25 | object
26 | """
27 | def __init__(self, stream_params: StreamParams) -> None:
28 | self.stream_params = stream_params
29 | self._pyaudio = None
30 | self._stream = None
31 | self._wav_file = None
32 |
33 | def record(self, duration: int, save_path: str) -> None:
34 | """Record sound from mic for a given amount of seconds.
35 | :param duration: Number of seconds we want to record for
36 | :param save_path: Where to store recording
37 | """
38 | print(f"Start recording for {duration} seconds...")
39 | self._create_recording_resources(save_path)
40 | self._write_wav_file_reading_from_stream(duration)
41 | self._close_recording_resources()
42 | print("Stop recording")
43 |
44 | def _create_recording_resources(self, save_path: str) -> None:
45 | self._pyaudio = pyaudio.PyAudio()
46 | self._stream = self._pyaudio.open(**self.stream_params.to_dict())
47 | self._create_wav_file(save_path)
48 |
49 | def _create_wav_file(self, save_path: str):
50 | self._wav_file = wave.open(save_path, "wb")
51 | self._wav_file.setnchannels(self.stream_params.channels)
52 | self._wav_file.setsampwidth(self._pyaudio.get_sample_size(self.stream_params.format))
53 | self._wav_file.setframerate(self.stream_params.rate)
54 |
55 | def _write_wav_file_reading_from_stream(self, duration: int) -> None:
56 | for _ in range(int(self.stream_params.rate * duration / self.stream_params.frames_per_buffer)):
57 | audio_data = self._stream.read(self.stream_params.frames_per_buffer)
58 | self._wav_file.writeframes(audio_data)
59 |
60 | def _close_recording_resources(self) -> None:
61 | self._wav_file.close()
62 | self._stream.close()
63 | self._pyaudio.terminate()
64 |
65 |
66 | if __name__ == "__main__":
67 | stream_params = StreamParams()
68 | recorder = Recorder(stream_params)
69 | recorder.record(5, "audio.wav")
70 |
--------------------------------------------------------------------------------
/utilf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/choyingw/Cross-Modal-Perceptionist/5191c3766f2388ceb3300f86b1717aa1f7496554/utilf/__init__.py
--------------------------------------------------------------------------------
/utilf/render.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append('..')
4 |
5 | import cv2
6 | import numpy as np
7 | import scipy.io as sio
8 |
9 | from Sim3DR import RenderPipeline
10 |
11 | # ensure a C-contiguous array
12 | def _to_ctype(arr):
13 | if not arr.flags.c_contiguous:
14 | return arr.copy(order='C')
15 | return arr
16 |
17 | # load BFM connectivity of triangles
18 | tri = sio.loadmat('./train.configs/tri.mat')['tri'] - 1
19 | tri = _to_ctype(tri.T).astype(np.int32)
20 |
21 | # Sim3DR definition
22 | cfg = {
23 | 'intensity_ambient': 0.3,
24 | 'color_ambient': (1, 1, 1),
25 | 'intensity_directional': 0.6,
26 | 'color_directional': (1, 1, 1),
27 | 'intensity_specular': 0.1,
28 | 'specular_exp': 5,
29 | 'light_pos': (0, 0, 5),
30 | 'view_pos': (0, 0, 5)
31 | }
32 |
33 | render_app = RenderPipeline(**cfg)
34 |
35 | def render_vert(img, vert, alpha=1.0, wfp=None):
36 | print(f'Save visualization result to {wfp}')
37 | overlap = img.copy()
38 | vert = vert.astype(np.float32)
39 | ver = _to_ctype(vert.T) # transpose
40 | overlap = render_app(ver, tri, overlap)
41 | overlap = cv2.addWeighted(img, 1 - alpha, overlap, alpha, 0)
42 | cv2.imwrite(wfp[:-4]+'.png', overlap)
43 |
44 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import shutil
4 | import numpy as np
5 | import torch.nn.functional as F
6 | import pickle
7 | import os.path as osp
8 |
9 | from PIL import Image
10 | from scipy.io import wavfile
11 | from torch.utils.data.dataloader import default_collate
12 | from vad import read_wave, write_wave, frame_generator, vad_collector
13 |
14 | def make_abs_path(d):
15 | return osp.join(osp.dirname(osp.realpath(__file__)), d)
16 |
17 | def _get_suffix(filename):
18 | """a.jpg -> jpg"""
19 | pos = filename.rfind('.')
20 | if pos == -1:
21 | return ''
22 | return filename[pos + 1:]
23 |
24 | def _load(fp):
25 | suffix = _get_suffix(fp)
26 | if suffix == 'npy':
27 | return np.load(fp)
28 | elif suffix == 'pkl':
29 | return pickle.load(open(fp, 'rb'))
30 |
31 | def _load_tensor(fp, mode='cpu'):
32 | if mode.lower() == 'cpu':
33 | return torch.from_numpy(_load(fp))
34 | elif mode.lower() == 'gpu':
35 | return torch.from_numpy(_load(fp)).cuda()
36 |
37 | def parse_param_102(param):
38 | """Work for only tensor"""
39 | p_ = param[:, :12].reshape(-1, 3, 4)
40 | p = p_[:, :, :3]
41 | offset = p_[:, :, -1].reshape(-1, 3, 1)
42 | alpha_shp = param[:, 12:52].reshape(-1, 40, 1)
43 | alpha_exp = param[:, 52:62].reshape(-1, 10, 1)
44 | alpha_tex = param[:, 62:102].reshape(-1, 40, 1)
45 | return p, offset, alpha_shp, alpha_exp, alpha_tex
46 |
47 | def to_rotation_mat_renorm(R):
48 | s = (R[:, 0, :3].norm(dim=1) + R[:, 1, :3].norm(dim=1))/2.0
49 | return F.normalize(R, p=2, dim=2), s
50 |
51 | class ParamsPack():
52 | """3DMM configuration data loading from ./train.configs"""
53 | def __init__(self, version):
54 | data_ver = version
55 |
56 | d = make_abs_path('./train.configs')
57 |
58 | # PCA basis for shape, expression, texture
59 | self.w_shp = _load_tensor(osp.join(d, 'w_shp_{}.npy'.format(data_ver)), mode='gpu')
60 | self.w_exp = _load_tensor(osp.join(d, 'w_exp_{}.npy'.format(data_ver)), mode='gpu')
61 | #self.w_tex = torch.from_numpy(_load(osp.join(d, 'w_tex_sim.npy'))[:,:40]).cuda()
62 |
63 | # param_mean and param_std are used for re-whitening
64 | meta = _load(osp.join(d, 'param_whitening_{}.pkl'.format(data_ver)))
65 | self.param_mean = torch.from_numpy(meta.get('param_mean')).float().cuda()
66 | self.param_std = torch.from_numpy(meta.get('param_std')).float().cuda()
67 |
68 | # mean values
69 | self.u_shp = _load_tensor(osp.join(d, 'u_shp.npy'), mode='gpu')
70 | self.u_exp = _load_tensor(osp.join(d, 'u_exp.npy'), mode='gpu')
71 | #self.u_tex = _load_tensor(osp.join(d, 'u_tex.npy'), mode='gpu')
72 | self.u = self.u_shp + self.u_exp
73 | self.w = torch.cat((self.w_shp, self.w_exp), dim=1)
74 |
75 | # base vector for landmarks
76 | self.std_size = 120
77 | self.dim = self.w_shp.shape[0] // 3
78 |
79 | param_pack = ParamsPack('v201')
80 |
81 | class Meter(object):
82 | # Computes and stores the average and current value
83 | def __init__(self, name, display, fmt=':f'):
84 | self.name = name
85 | self.display = display
86 | self.fmt = fmt
87 | self.reset()
88 |
89 | def reset(self):
90 | self.val = 0
91 | self.avg = 0
92 | self.sum = 0
93 | self.count = 0
94 |
95 | def update(self, val, n=1):
96 | self.val = val
97 | self.sum += val * n
98 | self.count += n
99 | self.avg = self.sum / self.count
100 |
101 | def __str__(self):
102 | fmtstr = '{name}:{' + self.display + self.fmt + '},'
103 | return fmtstr.format(**self.__dict__)
104 |
105 | def get_collate_fn(nframe_range):
106 | def collate_fn(batch):
107 | min_nframe, max_nframe = nframe_range
108 | assert min_nframe <= max_nframe
109 | num_frame = np.random.randint(min_nframe, max_nframe+1)
110 | pt = np.random.randint(0, max_nframe-num_frame+1)
111 | batch = [(item[0][..., pt:pt+num_frame], item[1])
112 | for item in batch]
113 | return default_collate(batch)
114 | return collate_fn
115 |
116 | def get_collate_fn_4(nframe_range):
117 | def collate_fn(batch):
118 | min_nframe, max_nframe = nframe_range
119 | assert min_nframe <= max_nframe
120 | num_frame = np.random.randint(min_nframe, max_nframe+1)
121 | pt = np.random.randint(0, max_nframe-num_frame+1)
122 | batch = [(item[0][..., pt:pt+num_frame], item[1], item[2][..., pt:pt+num_frame], item[3][..., pt:pt+num_frame]) for item in batch]
123 | return default_collate(batch)
124 | return collate_fn
125 |
126 | def cycle(dataloader):
127 | while True:
128 | for data, label in dataloader:
129 | yield data, label
130 |
131 | def cycle_4(dataloader):
132 | while True:
133 | for data, label, data_p, data_n in dataloader:
134 | yield data, label, data_p, data_n
135 |
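A sketch of how the random-crop collate function and cycle() are usually wired into a DataLoader; `my_dataset`, the batch size and the (300, 800) frame range are placeholders, and each dataset item is assumed to be an (mfcc, label) pair whose last axis holds at least max_nframe frames:

    from torch.utils.data import DataLoader

    loader = DataLoader(my_dataset, batch_size=64, shuffle=True,
                        collate_fn=get_collate_fn((300, 800)))
    data_iter = cycle(loader)            # infinite iterator for step-based training loops
    for step in range(10000):
        mfcc, label = next(data_iter)    # mfcc: [64, n_mel, T] with a random 300 <= T <= 800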
136 | def save_model(net, model_path):
137 | model_dir = os.path.dirname(model_path)
138 | if not os.path.exists(model_dir):
139 | os.makedirs(model_dir)
140 | torch.save(net.state_dict(), model_path)
141 |
142 | def rm_sil(voice_file, vad_obj):
143 | """
144 | This snippet is adapted from the repository
145 | 'https://github.com/wiseman/py-webrtcvad'.
146 | 
147 | It removes silent segments from a speech recording.
148 | """
149 | audio, sample_rate = read_wave(voice_file)
150 | frames = frame_generator(20, audio, sample_rate)
151 | frames = list(frames)
152 | segments = vad_collector(sample_rate, 20, 50, vad_obj, frames)
153 |
154 | if os.path.exists('tmp/'):
155 | shutil.rmtree('tmp/')
156 | os.makedirs('tmp/')
157 |
158 | wave_data = []
159 | for i, segment in enumerate(segments):
160 | segment_file = 'tmp/' + str(i) + '.wav'
161 | write_wave(segment_file, segment, sample_rate)
162 | wave_data.append(wavfile.read(segment_file)[1])
163 | shutil.rmtree('tmp/')
164 |
165 | if wave_data:
166 | vad_voice = np.concatenate(wave_data).astype('int16')
167 | return vad_voice
168 |
169 | def get_fbank(voice, mfc_obj):
170 | # Extract the log mel-spectrogram
171 | fbank = mfc_obj.sig2logspec(voice).astype('float32')
172 |
173 | # print(fbank.shape)
174 | # m=fbank.mean(axis=0)
175 | # print(m.shape)
176 | # exit()
177 |
178 | # Mean and variance normalization of each mel-frequency bin
179 | fbank = fbank - fbank.mean(axis=0)
180 | fbank = fbank / (fbank.std(axis=0)+np.finfo(np.float32).eps)
181 |
182 | # If the duration of a voice recording is less than 10 seconds (1000 frames),
183 | # repeat the recording until it is longer than 10 seconds and crop.
184 | full_frame_number = 1000
185 | init_frame_number = fbank.shape[0]
186 | while fbank.shape[0] < full_frame_number:
187 | fbank = np.append(fbank, fbank[0:init_frame_number], axis=0)
188 | fbank = fbank[0:full_frame_number,:]
189 | return fbank
190 |
191 |
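The repeat-and-crop padding above can be illustrated in isolation; the 400-frame input and 64 mel bins are assumptions made only for this example:

    import numpy as np

    short = np.random.randn(400, 64).astype('float32')   # ~4 s of normalized fbank features
    padded = short.copy()
    while padded.shape[0] < 1000:
        padded = np.append(padded, short, axis=0)         # 400 -> 800 -> 1200 frames
    padded = padded[:1000, :]                             # crop to exactly 1000 frames (10 s)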
192 | def voice2face(e_net, g_net, voice_file, vad_obj, mfc_obj, GPU=True):
193 | vad_voice = rm_sil(voice_file, vad_obj)
194 | fbank = get_fbank(vad_voice, mfc_obj)
195 | fbank = fbank.T[np.newaxis, ...]
196 | fbank = torch.from_numpy(fbank.astype('float32'))
197 |
198 | if GPU:
199 | fbank = fbank.cuda()
200 | embedding = e_net(fbank)
201 | embedding = F.normalize(embedding)
202 | face = g_net(embedding)
203 | return face
204 |
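A hedged end-to-end sketch of calling voice2face: webrtcvad.Vad is the real py-webrtcvad API, while e_net, g_net and mfc_obj are placeholders for the embedding network, face generator and MFCC extractor constructed elsewhere in the repo (mfc_obj only needs to expose sig2logspec as used in get_fbank):

    import webrtcvad

    vad_obj = webrtcvad.Vad(2)    # aggressiveness 0-3
    with torch.no_grad():
        face = voice2face(e_net, g_net, 'sample.wav', vad_obj, mfc_obj, GPU=True)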
205 |
206 | def voice2face_processed(e_net, g_net, fbank_obj, GPU=True, return_embeddings=False):
207 | fbank = np.load(fbank_obj)
208 | fbank = fbank.T[np.newaxis, ...]
209 | fbank = torch.from_numpy(fbank.astype('float32'))
210 |
211 | if GPU:
212 | fbank = fbank.cuda()
213 | embedding = e_net(fbank)
214 | embedding = F.normalize(embedding)
215 |
216 | face = g_net(embedding)
217 |
218 | if return_embeddings:
219 | return face, embedding
220 |
221 | return face
222 |
223 | def voice2face_processed_ParamOut(e_net, g_net, fbank_obj, GPU=True):
224 | fbank = np.load(fbank_obj)
225 | fbank = fbank.T[np.newaxis, ...]
226 | fbank = torch.from_numpy(fbank.astype('float32'))
227 |
228 | if GPU:
229 | fbank = fbank.cuda()
230 | embedding = e_net(fbank)
231 | embedding = F.normalize(embedding)
232 | face = g_net.forward_test(embedding)
233 |
234 | return face
235 |
236 | def voice2face_processed_MeshOut(e_net, g_net, fbank_obj, GPU=True):
237 | fbank = np.load(fbank_obj)
238 | fbank = fbank.T[np.newaxis, ...]
239 | fbank = torch.from_numpy(fbank.astype('float32'))
240 |
241 | if GPU:
242 | fbank = fbank.cuda()
243 | embedding = e_net(fbank)
244 | embedding = F.normalize(embedding)
245 | face = g_net.forward_test(embedding)
246 |
247 | return face
248 |
249 | def write_obj_with_colors(obj_name, vertices, triangles):
250 | """
251 | Write vertices and triangle faces to a Wavefront .obj mesh file (despite the name, no colors are written).
252 | """
253 | if obj_name.split('.')[-1] != 'obj':
254 | obj_name = obj_name + '.obj'
255 |
256 | # write obj
257 | with open(obj_name, 'w') as f:
258 | # write vertices & colors
259 | for i in range(vertices.shape[1]):
260 | s = 'v {} {} {}\n'.format(vertices[0, i], vertices[1, i], vertices[2, i])
261 | f.write(s)
262 |
263 | # write f: ver ind/ uv ind
264 | for i in range(triangles.shape[1]):
265 | s = 'f {} {} {}\n'.format(triangles[0, i], triangles[1, i], triangles[2, i])
266 | f.write(s)
267 |
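Note that .obj face indices are 1-based, so callers should pass 1-based triangle indices. A tiny sketch with a single triangle (values are illustrative):

    import numpy as np

    vertices = np.array([[0.0, 1.0, 0.0],    # x coordinates (each column is one vertex)
                         [0.0, 0.0, 1.0],    # y coordinates
                         [0.0, 0.0, 0.0]])   # z coordinates
    triangles = np.array([[1], [2], [3]])    # one face, 1-based indices as .obj expects
    write_obj_with_colors('tri.obj', vertices, triangles)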
268 | def read_obj(filename):
269 | f = open(filename)
270 | lines = f.readlines()
271 | coll = []
272 | for l in lines:
273 | if l[0] != 'v':
274 | break
275 | comp = l.split()[1:]
276 | comp = list(map(float, comp))
277 | coll.append(comp)
278 |
279 | a = np.asarray(coll)
280 | return a
281 |
282 | def read_xyz(filename):
283 | f = open(filename)
284 | lines = f.readlines()
285 | coll = []
286 | for l in lines:
287 | comp = l.split()
288 | comp = list(map(float, comp))
289 | coll.append(comp)
290 | a = np.asarray(coll)
291 | return a
292 |
--------------------------------------------------------------------------------
/vad.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import contextlib
3 | import sys
4 | import wave
5 |
6 | def read_wave(path):
7 | with contextlib.closing(wave.open(path, 'rb')) as wf:
8 | num_channels = wf.getnchannels()
9 | assert num_channels == 1
10 | sample_width = wf.getsampwidth()
11 | assert sample_width == 2
12 | sample_rate = wf.getframerate()
13 | assert sample_rate in (8000, 16000, 32000)
14 | pcm_data = wf.readframes(wf.getnframes())
15 | return pcm_data, sample_rate
16 |
17 |
18 | def write_wave(path, audio, sample_rate):
19 | with contextlib.closing(wave.open(path, 'wb')) as wf:
20 | wf.setnchannels(1)
21 | wf.setsampwidth(2)
22 | wf.setframerate(sample_rate)
23 | wf.writeframes(audio)
24 |
25 |
26 | class Frame(object):
27 | def __init__(self, bytes, timestamp, duration):
28 | self.bytes = bytes
29 | self.timestamp = timestamp
30 | self.duration = duration
31 |
32 |
33 | def frame_generator(frame_duration_ms, audio, sample_rate):
34 | n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
35 | offset = 0
36 | timestamp = 0.0
37 | duration = (float(n) / sample_rate) / 2.0
38 | while offset + n < len(audio):
39 | yield Frame(audio[offset:offset + n], timestamp, duration)
40 | timestamp += duration
41 | offset += n
42 |
43 |
44 | def vad_collector(sample_rate, frame_duration_ms,
45 | padding_duration_ms, vad, frames):
46 | num_padding_frames = int(padding_duration_ms / frame_duration_ms)
47 | ring_buffer = collections.deque(maxlen=num_padding_frames)
48 | triggered = False
49 | voiced_frames = []
50 | for frame in frames:
51 | #sys.stdout.write(
52 | # '1' if vad.is_speech(frame.bytes, sample_rate) else '0')
53 | if not triggered:
54 | ring_buffer.append(frame)
55 | num_voiced = len([f for f in ring_buffer
56 | if vad.is_speech(f.bytes, sample_rate)])
57 | if num_voiced > 0.9 * ring_buffer.maxlen:
58 | # sys.stdout.write('+(%s)' % (ring_buffer[0].timestamp,))
59 | triggered = True
60 | voiced_frames.extend(ring_buffer)
61 | ring_buffer.clear()
62 | else:
63 | voiced_frames.append(frame)
64 | ring_buffer.append(frame)
65 | num_unvoiced = len([f for f in ring_buffer
66 | if not vad.is_speech(f.bytes, sample_rate)])
67 | if num_unvoiced > 0.9 * ring_buffer.maxlen:
68 | #sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
69 | triggered = False
70 | yield b''.join([f.bytes for f in voiced_frames])
71 | ring_buffer.clear()
72 | voiced_frames = []
73 | if voiced_frames:
74 | yield b''.join([f.bytes for f in voiced_frames])
75 |
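Used on its own, vad.py follows the standard py-webrtcvad example flow; a minimal sketch, assuming a mono 16-bit WAV at one of the supported sample rates (file names are illustrative):

    import webrtcvad

    audio, sample_rate = read_wave('input.wav')
    vad = webrtcvad.Vad(3)                                    # most aggressive filtering
    frames = list(frame_generator(30, audio, sample_rate))    # 30 ms frames
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    for i, segment in enumerate(segments):
        write_wave('chunk-%02d.wav' % i, segment, sample_rate)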
--------------------------------------------------------------------------------