├── .gitignore ├── LICENSE ├── README.md ├── WORKSPACE ├── background ├── 1.jpg ├── 10.jpg ├── 100.jpg ├── 11.jpg ├── 12.jpg ├── 13.jpg ├── 14.jpg ├── 15.jpg ├── 16.jpg ├── 17.jpg ├── 18.jpg ├── 19.jpg ├── 2.jpg ├── 20.jpg ├── 21.jpg ├── 22.jpg ├── 23.jpg ├── 24.jpg ├── 25.jpg ├── 26.jpg ├── 27.jpg ├── 28.jpg ├── 29.jpg ├── 3.jpg ├── 30.jpg ├── 31.jpg ├── 32.jpg ├── 33.jpg ├── 34.jpg ├── 35.jpg ├── 36.jpg ├── 37.jpg ├── 38.jpg ├── 39.jpg ├── 4.jpg ├── 40.jpg ├── 41.jpg ├── 42.jpg ├── 43.jpg ├── 44.jpg ├── 45.jpg ├── 46.jpg ├── 47.jpg ├── 48.jpg ├── 49.jpg ├── 5.jpg ├── 50.jpg ├── 51.jpg ├── 52.jpg ├── 53.jpg ├── 54.jpg ├── 55.jpg ├── 56.jpg ├── 57.jpg ├── 58.jpg ├── 59.jpg ├── 6.jpg ├── 60.jpg ├── 61.jpg ├── 62.jpg ├── 63.jpg ├── 64.jpg ├── 65.jpg ├── 66.jpg ├── 67.jpg ├── 68.jpg ├── 69.jpg ├── 7.jpg ├── 70.jpg ├── 71.jpg ├── 72.jpg ├── 73.jpg ├── 74.jpg ├── 75.jpg ├── 76.jpg ├── 77.jpg ├── 78.jpg ├── 79.jpg ├── 8.jpg ├── 80.jpg ├── 81.jpg ├── 82.jpg ├── 83.jpg ├── 84.jpg ├── 85.jpg ├── 86.jpg ├── 87.jpg ├── 88.jpg ├── 89.jpg ├── 9.jpg ├── 90.jpg ├── 91.jpg ├── 92.jpg ├── 93.jpg ├── 94.jpg ├── 95.jpg ├── 96.jpg ├── 97.jpg ├── 98.jpg └── 99.jpg ├── config ├── BUILD ├── configure.py └── params.yml ├── datasets ├── BUILD ├── make_data_from_GRID.py ├── makelist_bfm.py ├── makelist_pixrefer.py └── models.py ├── generator ├── BUILD ├── generator.py ├── loader.py └── test_generator.py ├── res ├── 1.png ├── 2.png └── 3.jpg ├── sample ├── 22.jpg └── test.aac ├── utils ├── BUILD ├── bfm_load_data.py ├── bfm_visual.py ├── cython │ ├── mesh_core.cpp │ ├── mesh_core.h │ ├── mesh_core_cython.pyx │ └── setup.py ├── reconstruct_mesh.py └── utils.py └── voicepuppet ├── BUILD ├── atvgnet ├── BUILD ├── __init__.py ├── atnet.py ├── backbone.py ├── infer.py ├── plot.py ├── test_atnet.py ├── test_vgnet.py ├── tinynet.py ├── train_atnet.py ├── train_vgnet.py └── vgnet.py ├── bfmnet ├── BUILD ├── bfmnet.py ├── infer_bfmnet.py ├── tinynet.py └── train_bfmnet.py ├── builder.py ├── pixflow ├── BUILD ├── infer_bfm_pixflow.py ├── infer_pixflow.py ├── pixflow.py └── train_pixflow.py └── pixrefer ├── BUILD ├── infer_bfmvid.py ├── infer_pixrefer.py ├── pixrefer.py ├── train_pixrefer.py └── vgg_simple.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 DongLu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # VoicePuppet # 4 | - This repository provided a common pipeline to generate speaking actor by voice input automatically. 
5 | - For a quick impression, there is a [short video](https://youtu.be/h24MGPTTw5M) that demonstrates it. 6 | 7 | ## The architecture of the network ## 8 | - The pipeline is composed of two parts. The first, BFMNet (Basel Face Model network), predicts the 3D face coefficients of each video frame from the stride window of waveform aligned to that frame. The second, PixReferNet, redraws the real face foreground from the rasterized face rendered with the 3D face coefficients produced in the previous step. The audio-to-frame alignment is illustrated in the sketch below. 9 |
17 | (figure) BFMNet component 24 | (figure) PixReferNet component
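To make the audio-to-frame alignment concrete, here is a minimal, self-contained sketch that only restates the default values from `config/params.yml` and the consistency check from `generator/test_generator.py`. It is an illustration of the numbers involved, not the repository's API.

```python
# Alignment between mel-spectrogram frames and video frames (illustrative only).
sample_rate = 16000    # config/params.yml -> mel.sample_rate
hop_step = 128         # config/params.yml -> mel.hop_step
frame_rate = 25        # config/params.yml -> frame_rate (video fps)

samples_per_frame = sample_rate // frame_rate            # 640 waveform samples per video frame
frame_mfcc_scale = sample_rate / frame_rate / hop_step   # 5.0 mel frames per video frame

# The same constraint is asserted in generator/test_generator.py:
assert frame_mfcc_scale == int(frame_mfcc_scale), \
    "sample_rate / hop_step must be divisible by frame_rate"

print(samples_per_frame, int(frame_mfcc_scale))  # 640 5
```

So with the default settings, the data generator pairs each video frame with exactly 5 mel frames (640 audio samples).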
29 | ## Run the prediction pipeline ## 30 | ------------------------ 31 | 32 | 1. Download the pretrained checkpoints and the required models.<br>
33 | Baidu Disk: [[ckpt.zip](https://pan.baidu.com/s/1cVIVFhSsEA1MbgqL7H7mMw), code: a6pn], [[allmodels.zip](https://pan.baidu.com/s/11FKHjGjnPtD2c7Ttg-mXng), code: brfh]
34 | or Google Drive: [[ckpt.zip](https://drive.google.com/file/d/1RgMSQUL2pzvwCWGgnkvwxHxHeEnZ7FlN/view?usp=sharing)], [[allmodels.zip](https://drive.google.com/file/d/1Z1Pm39sp977nED_HHZtvn5glRrmiThwB/view?usp=sharing)]
35 | Extract `ckpt.zip` into `ckpt_bfmnet` and `ckpt_pixrefer`, and extract `allmodels.zip` into the repository root dir 36 | 2. `cd utils/cython` && `python3 setup.py install` 37 | 3. Install the ffmpeg tool if you want to merge the png sequence and the audio file into a video container such as mp4. 38 | 4. `python3 voicepuppet/pixrefer/infer_bfmvid.py --config_path config/params.yml sample/22.jpg sample/test.aac` 39 | 40 | ## Run the training pipeline ## 41 | ------------------------ 42 | 43 | #### Requirements #### 44 | 45 | - tensorflow>=1.14.0 46 | - pytorch>=1.4.0, only for data preparation (face foreground segmentation and matting) 47 | - mxnet>=1.5.1, only for data preparation (face alignment) 48 | Tip: you can use other models, such as dlib, to produce the same labels instead 49 | 50 | #### Data preparation #### 51 | 52 | 1. Check your `config/params.yml` to make sure the dataset folder follows the structure below (the same as the [GRID dataset](http://spandh.dcs.shef.ac.uk/gridcorpus/)); you can extend the dataset with any common video files arranged in the same folder structure 53 | ``` 54 | |- srcdir/ 55 | | |- s10/ 56 | | |- video/ 57 | | |- mpg_6000/ 58 | | |- bbab8n.mpg 59 | | |- bbab9s.mpg 60 | | |- bbac1a.mpg 61 | | |- ... 62 | | |- s8/ 63 | | |- video/ 64 | | |- mpg_6000/ 65 | | |- bbae5n.mpg 66 | | |- bbae6s.mpg 67 | | |- bbae7p.mpg 68 | | |- ... 69 | ``` 70 | 2. Extract the audio stream from each mpg video file; `todir` is the output folder where the labels will be stored.<br>
71 | `python3 datasets/make_data_from_GRID.py --gpu 0 --step 2 srcdir todir` 72 | 73 | 3. Face detection and alignment
74 | `python3 datasets/make_data_from_GRID.py --gpu 0 --step 3 srcdir todir ./allmodels` 75 | 76 | 4. 3D face reconstruction
77 | `python3 datasets/make_data_from_GRID.py --gpu 0 --step 4 todir ./allmodels` 78 | 79 | 5. The above steps take several hours to finish; afterwards you'll find `*.jpg`, `landmark.txt`, `audio.wav` and `bfmcoeff.txt` in each output subfolder. The labels (`audio.wav`, `bfmcoeff.txt`) are used for BFMNet training; the other files are only temporary (the text label format is sketched after this section). 80 | ``` 81 | |- todir/ 82 | | |- s10/ 83 | | |- bbab8n/ 84 | | |- landmark.txt 85 | | |- audio.wav 86 | | |- bfmcoeff.txt 87 | | |- 0.jpg 88 | | |- 1.jpg 89 | | |- ... 90 | | |- bbab9s/ 91 | | |- ... 92 | | |- s8/ 93 | | |- bbae5n/ 94 | | |- landmark.txt 95 | | |- audio.wav 96 | | |- bfmcoeff.txt 97 | | |- 0.jpg 98 | | |- 1.jpg 99 | | |- ... 100 | | |- bbae6s/ 101 | | |- ... 102 | ``` 103 | 6. Face (human foreground) segmentation and matting for PixReferNet training. Before invoking the Python script, make sure the width and height of the video are equal (1:1 aspect ratio). In general, 3-5 minutes of video is enough to train the PixReferNet network, and the trained model will only work for this specific person.<br>
104 | `python3 datasets/make_data_from_GRID.py --gpu 0 --step 6 src_dir to_dvp_dir ./allmodels`
105 | `src_dir` has the same folder structure as [tip1 in Data preparation]; when the above step finishes, you will find `*.jpg` files in the subfolders, like this 106 |
107 | (figure: example of the generated `*.jpg` frames) 108 |
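For reference, the text labels mentioned in step 5 (`landmark.txt`, `bfmcoeff.txt`) are plain text files with one comma-separated row of float values per video frame, the format read by `generator/loader.py`. Below is a minimal parsing sketch under that assumption; the sample paths are hypothetical.

```python
import numpy as np

def load_text_label(path):
    """One row per video frame; each row is a list of comma-separated floats
    (e.g. 68 x/y landmark pairs -> 136 values per row in landmark.txt)."""
    rows = []
    with open(path) as f:
        for line in f:
            parts = line.strip().split(',')
            if parts and parts[0]:
                rows.append(np.array(parts, dtype=np.float32))
    return np.array(rows)

# Hypothetical sample folder produced by the steps above:
# landmarks = load_text_label('todir/s10/bbab8n/landmark.txt')   # [n_frames, 136]
# bfmcoeffs = load_text_label('todir/s10/bbab8n/bfmcoeff.txt')   # one coefficient row per frame
```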
109 | 110 | #### Train BFMNet #### 111 | 112 | 1. Prepare the train and eval list files; check that the `root_path` parameter in `config/params.yml` points to the output folder of [tip1 in Data preparation]<br>
113 | `python3 datasets/makelist_bfm.py --config_path config/params.yml` 114 | 2. Train the model<br> 115 | `python3 voicepuppet/bfmnet/train_bfmnet.py --config_path config/params.yml` 116 | 3. Watch the evaluation images written every 1000 steps to `log/eval_bfmnet`; the upper row is the target sequence and the lower row is the predicted sequence. 117 |
118 | (figure: example evaluation image from `log/eval_bfmnet`) 119 |
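The optimization hyperparameters come from the `training` block of `config/params.yml` (`learning_rate: 0.001`, `decay_steps: 1000`, `decay_rate: 0.95`, `max_grad_norm: 50`). The sketch below shows the learning-rate curve those numbers imply under a standard exponential-decay schedule; whether `train_bfmnet.py` applies exactly this schedule is an assumption, since the trainer itself is not reproduced in this excerpt.

```python
# Exponential decay implied by config/params.yml (assumed schedule, illustrative only).
learning_rate = 0.001
decay_steps = 1000
decay_rate = 0.95

def decayed_lr(global_step):
    return learning_rate * decay_rate ** (global_step / decay_steps)

for step in (0, 10000, 50000, 100000):
    print(step, "%.2e" % decayed_lr(step))
# 0 1.00e-03, 10000 5.99e-04, 50000 7.69e-05, 100000 5.92e-06
```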
120 | 121 | #### Train PixReferNet #### 122 | 123 | 1. Prepare the train and eval list files; check that the `root_path` parameter in `config/params.yml` points to the output folder of [tip6 in Data preparation]<br> 124 | `python3 datasets/makelist_pixrefer.py --config_path config/params.yml` 125 | 2. Train the model<br> 126 | `python3 voicepuppet/pixrefer/train_pixrefer.py --config_path config/params.yml` 127 | 3. Use TensorBoard to monitor the training process<br>
128 | `tensorboard --logdir=log/summary_pixrefer` 129 | 130 | ## Acknowledgement ## 131 | 1. The face alignment model is based on [Deepinx's work](https://github.com/deepinx/deep-face-alignment); it is more stable than Dlib. 132 | 2. The 3D face reconstruction model is based on [microsoft's work](https://github.com/microsoft/Deep3DFaceReconstruction) 133 | 3. The image segmentation model is based on [gasparian's work](https://github.com/gasparian/PicsArtHack-binary-segmentation) 134 | 4. The image matting model is based on [foamliu's work](https://github.com/foamliu/Deep-Image-Matting) 135 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- 1 | workspace(name = "voicepuppet") 2 | -------------------------------------------------------------------------------- /background/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/1.jpg -------------------------------------------------------------------------------- /background/10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/10.jpg -------------------------------------------------------------------------------- /background/100.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/100.jpg -------------------------------------------------------------------------------- /background/11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/11.jpg -------------------------------------------------------------------------------- /background/12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/12.jpg -------------------------------------------------------------------------------- /background/13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/13.jpg -------------------------------------------------------------------------------- /background/14.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/14.jpg -------------------------------------------------------------------------------- /background/15.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/15.jpg -------------------------------------------------------------------------------- /background/16.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/16.jpg --------------------------------------------------------------------------------
/background/17.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/17.jpg -------------------------------------------------------------------------------- /background/18.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/18.jpg -------------------------------------------------------------------------------- /background/19.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/19.jpg -------------------------------------------------------------------------------- /background/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/2.jpg -------------------------------------------------------------------------------- /background/20.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/20.jpg -------------------------------------------------------------------------------- /background/21.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/21.jpg -------------------------------------------------------------------------------- /background/22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/22.jpg -------------------------------------------------------------------------------- /background/23.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/23.jpg -------------------------------------------------------------------------------- /background/24.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/24.jpg -------------------------------------------------------------------------------- /background/25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/25.jpg -------------------------------------------------------------------------------- /background/26.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/26.jpg -------------------------------------------------------------------------------- /background/27.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/27.jpg -------------------------------------------------------------------------------- 
/background/28.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/28.jpg -------------------------------------------------------------------------------- /background/29.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/29.jpg -------------------------------------------------------------------------------- /background/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/3.jpg -------------------------------------------------------------------------------- /background/30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/30.jpg -------------------------------------------------------------------------------- /background/31.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/31.jpg -------------------------------------------------------------------------------- /background/32.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/32.jpg -------------------------------------------------------------------------------- /background/33.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/33.jpg -------------------------------------------------------------------------------- /background/34.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/34.jpg -------------------------------------------------------------------------------- /background/35.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/35.jpg -------------------------------------------------------------------------------- /background/36.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/36.jpg -------------------------------------------------------------------------------- /background/37.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/37.jpg -------------------------------------------------------------------------------- /background/38.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/38.jpg -------------------------------------------------------------------------------- 
/background/39.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/39.jpg -------------------------------------------------------------------------------- /background/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/4.jpg -------------------------------------------------------------------------------- /background/40.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/40.jpg -------------------------------------------------------------------------------- /background/41.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/41.jpg -------------------------------------------------------------------------------- /background/42.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/42.jpg -------------------------------------------------------------------------------- /background/43.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/43.jpg -------------------------------------------------------------------------------- /background/44.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/44.jpg -------------------------------------------------------------------------------- /background/45.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/45.jpg -------------------------------------------------------------------------------- /background/46.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/46.jpg -------------------------------------------------------------------------------- /background/47.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/47.jpg -------------------------------------------------------------------------------- /background/48.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/48.jpg -------------------------------------------------------------------------------- /background/49.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/49.jpg -------------------------------------------------------------------------------- 
/background/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/5.jpg -------------------------------------------------------------------------------- /background/50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/50.jpg -------------------------------------------------------------------------------- /background/51.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/51.jpg -------------------------------------------------------------------------------- /background/52.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/52.jpg -------------------------------------------------------------------------------- /background/53.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/53.jpg -------------------------------------------------------------------------------- /background/54.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/54.jpg -------------------------------------------------------------------------------- /background/55.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/55.jpg -------------------------------------------------------------------------------- /background/56.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/56.jpg -------------------------------------------------------------------------------- /background/57.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/57.jpg -------------------------------------------------------------------------------- /background/58.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/58.jpg -------------------------------------------------------------------------------- /background/59.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/59.jpg -------------------------------------------------------------------------------- /background/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/6.jpg -------------------------------------------------------------------------------- 
/background/60.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/60.jpg -------------------------------------------------------------------------------- /background/61.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/61.jpg -------------------------------------------------------------------------------- /background/62.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/62.jpg -------------------------------------------------------------------------------- /background/63.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/63.jpg -------------------------------------------------------------------------------- /background/64.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/64.jpg -------------------------------------------------------------------------------- /background/65.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/65.jpg -------------------------------------------------------------------------------- /background/66.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/66.jpg -------------------------------------------------------------------------------- /background/67.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/67.jpg -------------------------------------------------------------------------------- /background/68.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/68.jpg -------------------------------------------------------------------------------- /background/69.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/69.jpg -------------------------------------------------------------------------------- /background/7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/7.jpg -------------------------------------------------------------------------------- /background/70.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/70.jpg -------------------------------------------------------------------------------- 
/background/71.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/71.jpg -------------------------------------------------------------------------------- /background/72.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/72.jpg -------------------------------------------------------------------------------- /background/73.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/73.jpg -------------------------------------------------------------------------------- /background/74.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/74.jpg -------------------------------------------------------------------------------- /background/75.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/75.jpg -------------------------------------------------------------------------------- /background/76.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/76.jpg -------------------------------------------------------------------------------- /background/77.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/77.jpg -------------------------------------------------------------------------------- /background/78.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/78.jpg -------------------------------------------------------------------------------- /background/79.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/79.jpg -------------------------------------------------------------------------------- /background/8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/8.jpg -------------------------------------------------------------------------------- /background/80.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/80.jpg -------------------------------------------------------------------------------- /background/81.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/81.jpg -------------------------------------------------------------------------------- 
/background/82.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/82.jpg -------------------------------------------------------------------------------- /background/83.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/83.jpg -------------------------------------------------------------------------------- /background/84.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/84.jpg -------------------------------------------------------------------------------- /background/85.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/85.jpg -------------------------------------------------------------------------------- /background/86.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/86.jpg -------------------------------------------------------------------------------- /background/87.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/87.jpg -------------------------------------------------------------------------------- /background/88.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/88.jpg -------------------------------------------------------------------------------- /background/89.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/89.jpg -------------------------------------------------------------------------------- /background/9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/9.jpg -------------------------------------------------------------------------------- /background/90.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/90.jpg -------------------------------------------------------------------------------- /background/91.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/91.jpg -------------------------------------------------------------------------------- /background/92.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/92.jpg -------------------------------------------------------------------------------- 
/background/93.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/93.jpg -------------------------------------------------------------------------------- /background/94.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/94.jpg -------------------------------------------------------------------------------- /background/95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/95.jpg -------------------------------------------------------------------------------- /background/96.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/96.jpg -------------------------------------------------------------------------------- /background/97.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/97.jpg -------------------------------------------------------------------------------- /background/98.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/98.jpg -------------------------------------------------------------------------------- /background/99.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/background/99.jpg -------------------------------------------------------------------------------- /config/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = [ 3 | "//visibility:public", 4 | ], 5 | features = ["-layering_check"], 6 | ) 7 | 8 | py_library( 9 | name = "configure", 10 | srcs = ["configure.py"], 11 | deps = [ 12 | ], 13 | ) 14 | -------------------------------------------------------------------------------- /config/configure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import yaml 4 | from tensorflow.contrib.training import HParams 5 | 6 | 7 | class YParams(HParams): 8 | def __init__(self, yaml_fn, config_name): 9 | HParams.__init__(self) 10 | with open(yaml_fn) as fp: 11 | for k, v in yaml.load(fp, Loader=yaml.FullLoader)[config_name].items(): 12 | self.add_hparam(k, v) 13 | -------------------------------------------------------------------------------- /config/params.yml: -------------------------------------------------------------------------------- 1 | default: &DEFAULT 2 | train_dataset_path: config/train.txt 3 | eval_dataset_path: config/eval.txt 4 | 5 | root_path: /media/dong/DiskData/gridcorpus/todir # used by makelist_* 6 | # root_path: /media/dong/DiskData/gridcorpus/todir_vid2vid 7 | train_by_eval: 9 # train/eval 8 | 9 | sample_file: # used by generator 10 | landmark_name: landmark.txt 11 | wav_name: audio.wav 12 | bfmcoeff_name: bfmcoeff.txt 13 | 14 | 
model_dir: ./allmodels 15 | 16 | mel: 17 | sample_rate: 16000 18 | num_mel_bins: 80 19 | win_length: 512 20 | fft_length: 512 21 | hop_step: 128 22 | 23 | frame_rate: 25 24 | 25 | training: 26 | epochs: 100000 27 | drop_rate: 0.25 28 | learning_rate: 0.001 29 | max_grad_norm: 50 30 | decay_steps: 1000 31 | decay_rate: 0.95 32 | -------------------------------------------------------------------------------- /datasets/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = [ 3 | "//visibility:public", 4 | ], 5 | features = ["-layering_check"], 6 | ) 7 | 8 | py_library( 9 | name = "models", 10 | srcs = ["models.py"], 11 | deps = [ 12 | ], 13 | ) 14 | 15 | py_binary( 16 | name = "makelist_bfm", 17 | srcs = ["makelist_bfm.py"], 18 | deps = [ 19 | "//config:configure" 20 | ], 21 | ) 22 | 23 | py_binary( 24 | name = "makelist_pixrefer", 25 | srcs = ["makelist_pixrefer.py"], 26 | deps = [ 27 | "//config:configure" 28 | ], 29 | ) 30 | 31 | py_binary( 32 | name = "make_data_from_GRID", 33 | srcs = ["make_data_from_GRID.py"], 34 | deps = [ 35 | "//utils:bfm_load_data", 36 | "//utils:reconstruct_mesh", 37 | "//utils:utils", 38 | ":models" 39 | ], 40 | ) -------------------------------------------------------------------------------- /datasets/makelist_bfm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from optparse import OptionParser 4 | import json 5 | import logging 6 | import sys 7 | 8 | sys.path.append(os.getcwd()) 9 | from config.configure import YParams 10 | 11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def write_dataset(params): 16 | train_dataset_path = params.train_dataset_path 17 | eval_dataset_path = params.eval_dataset_path 18 | root_path = params.root_path 19 | train_by_eval = params.train_by_eval 20 | 21 | landmark_name = params.sample_file['landmark_name'] 22 | wav_name = params.sample_file['wav_name'] 23 | bfmcoeff_name = params.sample_file['bfmcoeff_name'] 24 | 25 | sample_index = 0 26 | 27 | with open(train_dataset_path, "w") as train_file: 28 | with open(eval_dataset_path, "w") as eval_file: 29 | for root, subdirs, files in os.walk(root_path): 30 | if not subdirs: 31 | if (os.path.exists(os.path.join(root, landmark_name)) and 32 | os.path.exists(os.path.join(root, wav_name)) and 33 | os.path.exists(os.path.join(root, bfmcoeff_name))): 34 | 35 | logger.info('Processing {}'.format(root)) 36 | count = 0 37 | for file in files: 38 | if (file.endswith('.jpg')): 39 | count += 1 40 | 41 | sample_index += 1 42 | if (sample_index % (train_by_eval + 1) == 0): 43 | eval_file.write("{}|{}\n".format(root, count)) 44 | else: 45 | train_file.write("{}|{}\n".format(root, count)) 46 | 47 | 48 | if (__name__ == '__main__'): 49 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 50 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 51 | help='the config json file') 52 | 53 | opts, argv = cmd_parser.parse_args() 54 | 55 | if (not opts.config_path is None): 56 | config_path = opts.config_path 57 | 58 | if (not os.path.exists(config_path)): 59 | logger.error('config_path not exists') 60 | exit(0) 61 | 62 | params = YParams(config_path, 'default') 63 | write_dataset(params) 64 | else: 65 | print('Please check your parameters.') 66 | 
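# Illustrative sketch (not part of makelist_bfm.py above): write_dataset() emits one
# "<sample_dir>|<frame_count>" record per sample folder into config/train.txt and
# config/eval.txt, keeping about train_by_eval (9) training records per eval record.
# A record can be parsed back as shown here; the concrete path and count are hypothetical.
line = "/media/dong/DiskData/gridcorpus/todir/s10/bbab8n|75"
sample_dir, frame_count = line.split('|')
frame_count = int(frame_count)  # number of *.jpg frames counted in that folder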
-------------------------------------------------------------------------------- /datasets/makelist_pixrefer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from optparse import OptionParser 4 | import json 5 | import logging 6 | import sys 7 | 8 | sys.path.append(os.getcwd()) 9 | from config.configure import YParams 10 | 11 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def write_dataset(params): 16 | train_dataset_path = params.train_dataset_path 17 | eval_dataset_path = params.eval_dataset_path 18 | root_path = params.root_path 19 | train_by_eval = params.train_by_eval 20 | 21 | sample_index = 0 22 | 23 | with open(train_dataset_path, "w") as train_file: 24 | with open(eval_dataset_path, "w") as eval_file: 25 | for root, subdirs, files in os.walk(root_path): 26 | if not subdirs: 27 | logger.info('Processing {}'.format(root)) 28 | count = 0 29 | for file in files: 30 | if (file.endswith('.jpg')): 31 | count += 1 32 | 33 | sample_index += 1 34 | if (sample_index % (train_by_eval + 1) == 0): 35 | eval_file.write("{}|{}\n".format(root, count)) 36 | else: 37 | train_file.write("{}|{}\n".format(root, count)) 38 | 39 | 40 | if (__name__ == '__main__'): 41 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 42 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 43 | help='the config json file') 44 | 45 | opts, argv = cmd_parser.parse_args() 46 | 47 | if (not opts.config_path is None): 48 | config_path = opts.config_path 49 | 50 | if (not os.path.exists(config_path)): 51 | logger.error('config_path not exists') 52 | exit(0) 53 | 54 | params = YParams(config_path, 'default') 55 | write_dataset(params) 56 | else: 57 | print('Please check your parameters.') 58 | -------------------------------------------------------------------------------- /generator/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = [ 3 | "//visibility:public", 4 | ], 5 | features = ["-layering_check"], 6 | ) 7 | 8 | py_library( 9 | name = "loader", 10 | srcs = ["loader.py"], 11 | deps = [ 12 | ], 13 | ) 14 | 15 | py_library( 16 | name = "generator", 17 | srcs = ["generator.py"], 18 | deps = [ 19 | ":loader", 20 | "//config:configure" 21 | ], 22 | ) 23 | 24 | py_library( 25 | name = "test_generator", 26 | srcs = ["test_generator.py"], 27 | deps = [ 28 | ":generator" 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /generator/loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import librosa 4 | import cv2 5 | from scipy.io import wavfile 6 | import resampy 7 | 8 | 9 | class Loader: 10 | ### root_path: None if the file_path is full path 11 | def __init__(self, root_path=None): 12 | self.root_path = root_path 13 | 14 | ### load txt data, each line split by comma, default float format 15 | ### file_path: file name in root_path, or full path. 
16 | ### return: numpy array(float32) 17 | def get_text_data(self, file_path): 18 | if (self.root_path): 19 | file_path = os.path.join(self.root_path, file_path) 20 | 21 | with open(file_path) as f: 22 | lines = f.readlines() 23 | data_list = [] 24 | for line in lines: 25 | pts = line.strip().split(',') 26 | if (len(pts) != 0): 27 | pts = list(map(lambda x: np.float32(x), pts)) 28 | data_list.append(np.array(pts)) 29 | 30 | return np.array(data_list) 31 | 32 | ### load binary data of pickle format. 33 | ### file_path: file name in root_path, or full path. 34 | ### return: numpy array(float32) 35 | def get_bin_data(self, file_path): 36 | if (self.root_path): 37 | file_path = os.path.join(self.root_path, file_path) 38 | 39 | if (file_path.endswith('.npy') or file_path.endswith('.npz')): 40 | data = np.load(file_path) 41 | return data 42 | 43 | 44 | class EarLoader(Loader): 45 | 46 | def get_data(self, file_path): 47 | data = self.get_text_data(file_path) 48 | return data 49 | 50 | 51 | class PoseLoader(Loader): 52 | 53 | def get_data(self, file_path): 54 | data = self.get_text_data(file_path) 55 | return data 56 | 57 | 58 | class LandmarkLoader(Loader): 59 | def __init__(self, root_path=None, norm_size=128): 60 | Loader.__init__(self, root_path) 61 | self.norm_size = norm_size 62 | 63 | def get_data(self, file_path): 64 | data = self.get_text_data(file_path).astype(np.float32) 65 | data /= self.norm_size 66 | return data 67 | 68 | 69 | class BFMCoeffLoader(Loader): 70 | 71 | def get_data(self, file_path): 72 | data = self.get_text_data(file_path) 73 | return data 74 | 75 | 76 | class ImageLoader(Loader): 77 | def __init__(self, root_path=None, resize=None): 78 | Loader.__init__(self, root_path) 79 | self.resize = resize 80 | 81 | def get_data(self, file_path): 82 | if (self.root_path): 83 | file_path = os.path.join(self.root_path, file_path) 84 | 85 | data = cv2.imread(file_path).astype(np.float32) 86 | if (self.resize is not None): 87 | data = cv2.resize(data, (self.resize[0], self.resize[1])) 88 | data /= 255.0 89 | return data 90 | 91 | 92 | class WavLoader(Loader): 93 | def __init__(self, root_path=None, sr=16000): 94 | self.sr = sr 95 | Loader.__init__(self, root_path) 96 | 97 | def get_data(self, file_path): 98 | if (self.root_path): 99 | file_path = os.path.join(self.root_path, file_path) 100 | 101 | data, _ = librosa.load(file_path, sr=self.sr) 102 | return data 103 | 104 | 105 | class AudioLoader(Loader): 106 | def __init__(self, root_path=None, sr=16000): 107 | self.sr = sr 108 | Loader.__init__(self, root_path) 109 | 110 | def get_data(self, file_path): 111 | if (self.root_path): 112 | file_path = os.path.join(self.root_path, file_path) 113 | 114 | rate, data = wavfile.read(file_path) 115 | if data.ndim != 1: 116 | data = data[:,0] 117 | 118 | data = resampy.resample(data.astype(np.float32), rate, self.sr) 119 | return data 120 | -------------------------------------------------------------------------------- /generator/test_generator.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import numpy as np 4 | from generator import ATNetDataGenerator 5 | from generator import VGNetDataGenerator 6 | 7 | 8 | class GeneratorTest(tf.test.TestCase): 9 | 10 | def testATNetGenerator(self): 11 | config_path = 'config/params.yml' 12 | batch_size = 2 13 | landmark_size = 136 14 | ### Generator for training setting 15 | generator = ATNetDataGenerator(config_path) 16 | params = generator.params 17 | 
params.dataset_path = params.train_dataset_path 18 | params.batch_size = batch_size 19 | generator.set_params(params) 20 | dataset = generator.get_dataset() 21 | 22 | sess = tf.Session() 23 | tf.train.start_queue_runners(sess=sess) 24 | 25 | iterator = dataset.make_one_shot_iterator() 26 | landmark, ears, poses, mfccs, example_landmark, seq_len = sess.run(iterator.get_next()) 27 | 28 | frame_mfcc_scale = params.mel['sample_rate'] / params.frame_rate / params.mel['hop_step'] 29 | 30 | assert (frame_mfcc_scale - int(frame_mfcc_scale) == 0), "sample_rate/hop_step must divided by frame_rate." 31 | 32 | ## Test seq_len value range 33 | self.assertAllGreaterEqual(seq_len, params.min_squence_len) 34 | self.assertAllLessEqual(seq_len, params.max_squence_len) 35 | 36 | max_seq_len = np.max(seq_len) 37 | 38 | ## Test seq_len shape, [batch_size] 39 | self.assertAllEqual(seq_len.shape, [params.batch_size]) 40 | ## Test landmark shape, [batch_size, padding_time, landmark_size] 41 | self.assertAllEqual(landmark.shape, [params.batch_size, max_seq_len, landmark_size]) 42 | ## Test ears shape, [batch_size, padding_time, 1] 43 | self.assertAllEqual(ears.shape, [params.batch_size, max_seq_len, 1]) 44 | ## Test poses shape, [batch_size, padding_time, 3] 45 | self.assertAllEqual(poses.shape, [params.batch_size, max_seq_len, 3]) 46 | ## Test mfccs shape, [batch_size, padding_time, num_mel_bins] 47 | self.assertAllEqual(mfccs.shape, [params.batch_size, max_seq_len * frame_mfcc_scale, params.mel['num_mel_bins']]) 48 | ## Test example_landmark shape, [batch_size, landmark_size] 49 | self.assertAllEqual(example_landmark.shape, [params.batch_size, landmark_size]) 50 | 51 | ## Test the range of value, landmark [-1, 1] 52 | self.assertAllGreaterEqual(landmark, -1) 53 | self.assertAllLessEqual(landmark, 1) 54 | self.assertAllGreaterEqual(example_landmark, -1) 55 | self.assertAllLessEqual(example_landmark, 1) 56 | 57 | ## Test the range of value, ears [0, 1] 58 | self.assertAllGreaterEqual(ears, 0) 59 | self.assertAllLessEqual(ears, 1) 60 | 61 | ## Test the range of value, poses [-1, 1] 62 | self.assertAllGreaterEqual(poses, -1) 63 | self.assertAllLessEqual(poses, 1) 64 | 65 | def testVGNetGenerator(self): 66 | config_path = 'config/params.yml' 67 | batch_size = 2 68 | landmark_size = 136 69 | ### Generator for training setting 70 | generator = VGNetDataGenerator(config_path) 71 | params = generator.params 72 | params.dataset_path = params.train_dataset_path 73 | params.batch_size = batch_size 74 | generator.set_params(params) 75 | dataset = generator.get_dataset() 76 | 77 | sess = tf.Session() 78 | tf.train.start_queue_runners(sess=sess) 79 | 80 | iterator = dataset.make_one_shot_iterator() 81 | real_landmark_seq, real_mask_seq, real_img_seq, example_landmark, example_img, seq_len = sess.run( 82 | iterator.get_next()) 83 | 84 | ## Test seq_len value range 85 | self.assertAllGreaterEqual(seq_len, params.min_squence_len) 86 | self.assertAllLessEqual(seq_len, params.max_squence_len) 87 | 88 | max_seq_len = np.max(seq_len) 89 | 90 | ## Test seq_len shape, [batch_size] 91 | self.assertAllEqual(seq_len.shape, [params.batch_size]) 92 | ## Test real_landmark_seq shape, [batch_size, padding_time, landmark_size] 93 | self.assertAllEqual(real_landmark_seq.shape, [params.batch_size, max_seq_len, landmark_size]) 94 | ## Test real_mask_seq shape, [batch_size, padding_time, img_height, img_width, 1] 95 | self.assertAllEqual(real_mask_seq.shape, [params.batch_size, max_seq_len, params.img_size, params.img_size, 1]) 96 | ## Test 
real_img_seq shape, [batch_size, padding_time, img_height, img_width, 3] 97 | self.assertAllEqual(real_img_seq.shape, [params.batch_size, max_seq_len, params.img_size, params.img_size, 3]) 98 | ## Test example_landmark shape, [batch_size, 136] 99 | self.assertAllEqual(example_landmark.shape, [params.batch_size, landmark_size]) 100 | ## Test example_img shape, [batch_size, img_height, img_width, 3] 101 | self.assertAllEqual(example_img.shape, [params.batch_size, params.img_size, params.img_size, 3]) 102 | 103 | ## Test the range of value, real_landmark_seq [-1, 1] 104 | self.assertAllGreaterEqual(real_landmark_seq, -1) 105 | self.assertAllLessEqual(real_landmark_seq, 1) 106 | self.assertAllGreaterEqual(example_landmark, -1) 107 | self.assertAllLessEqual(example_landmark, 1) 108 | 109 | ## Test the range of value, real_mask_seq [0, 1] 110 | self.assertAllGreaterEqual(real_mask_seq, 0) 111 | self.assertAllLessEqual(real_mask_seq, 1) 112 | 113 | ## Test the range of value, real_img_seq [-1, 1] 114 | self.assertAllGreaterEqual(real_img_seq, -1) 115 | self.assertAllLessEqual(real_img_seq, 1) 116 | self.assertAllGreaterEqual(example_img, -1) 117 | self.assertAllLessEqual(example_img, 1) 118 | 119 | 120 | if (__name__ == '__main__'): 121 | tf.test.main() 122 | -------------------------------------------------------------------------------- /res/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/res/1.png -------------------------------------------------------------------------------- /res/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/res/2.png -------------------------------------------------------------------------------- /res/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/res/3.jpg -------------------------------------------------------------------------------- /sample/22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/sample/22.jpg -------------------------------------------------------------------------------- /sample/test.aac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taylorlu/voicepuppet/a0d3ca3296aca15abbfe75663a1bf682fb491efa/sample/test.aac -------------------------------------------------------------------------------- /utils/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = [ 3 | "//visibility:public", 4 | ], 5 | features = ["-layering_check"], 6 | ) 7 | 8 | py_library( 9 | name = "bfm_load_data", 10 | srcs = ["bfm_load_data.py"], 11 | deps = [ 12 | ], 13 | ) 14 | 15 | py_library( 16 | name = "reconstruct_mesh", 17 | srcs = ["reconstruct_mesh.py"], 18 | deps = [ 19 | ], 20 | ) 21 | 22 | py_library( 23 | name = "bfm_visual", 24 | srcs = ["bfm_visual.py"], 25 | deps = [ 26 | ], 27 | ) 28 | 29 | py_library( 30 | name = "utils", 31 | srcs = ["utils.py"], 32 | deps = [ 33 | ], 34 | ) -------------------------------------------------------------------------------- /utils/bfm_load_data.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | from scipy.io import loadmat, savemat 4 | from array import array 5 | import os 6 | 7 | 8 | # define facemodel for reconstruction 9 | class BFM(): 10 | def __init__(self, model_dir='BFM'): 11 | model_path = os.path.join(model_dir, 'BFM_model_front.mat') 12 | model = loadmat(model_path) 13 | self.meanshape = model['meanshape'] # mean face shape 14 | self.idBase = model['idBase'] # identity basis 15 | self.exBase = model['exBase'] # expression basis 16 | self.meantex = model['meantex'] # mean face texture 17 | self.texBase = model['texBase'] # texture basis 18 | self.point_buf = model[ 19 | 'point_buf'] # adjacent face index for each vertex, starts from 1 (only used for calculating face normal) 20 | self.tri = model['tri'] # vertex index for each triangle face, starts from 1 21 | self.keypoints = np.squeeze(model['keypoints']).astype(np.int32) - 1 # 68 face landmark index, starts from 0 22 | 23 | 24 | # load expression basis 25 | def LoadExpBasis(model_dir='BFM'): 26 | n_vertex = 53215 27 | Expbin = open(os.path.join(model_dir, 'Exp_Pca.bin'), 'rb') 28 | exp_dim = array('i') 29 | exp_dim.fromfile(Expbin, 1) 30 | expMU = array('f') 31 | expPC = array('f') 32 | expMU.fromfile(Expbin, 3 * n_vertex) 33 | expPC.fromfile(Expbin, 3 * exp_dim[0] * n_vertex) 34 | 35 | expPC = np.array(expPC) 36 | expPC = np.reshape(expPC, [exp_dim[0], -1]) 37 | expPC = np.transpose(expPC) 38 | 39 | expEV = np.loadtxt(os.path.join(model_dir, 'std_exp.txt')) 40 | 41 | return expPC, expEV 42 | 43 | 44 | # transfer original BFM09 to our face model 45 | def transferBFM09(model_dir='BFM'): 46 | original_BFM = loadmat(os.path.join(model_dir, '01_MorphableModel.mat')) 47 | shapePC = original_BFM['shapePC'] # shape basis 48 | shapeEV = original_BFM['shapeEV'] # corresponding eigen value 49 | shapeMU = original_BFM['shapeMU'] # mean face 50 | texPC = original_BFM['texPC'] # texture basis 51 | texEV = original_BFM['texEV'] # eigen value 52 | texMU = original_BFM['texMU'] # mean texture 53 | 54 | expPC, expEV = LoadExpBasis() 55 | 56 | # transfer BFM09 to our face model 57 | 58 | idBase = shapePC * np.reshape(shapeEV, [-1, 199]) 59 | idBase = idBase / 1e5 # unify the scale to decimeter 60 | idBase = idBase[:, :80] # use only first 80 basis 61 | 62 | exBase = expPC * np.reshape(expEV, [-1, 79]) 63 | exBase = exBase / 1e5 # unify the scale to decimeter 64 | exBase = exBase[:, :64] # use only first 64 basis 65 | 66 | texBase = texPC * np.reshape(texEV, [-1, 199]) 67 | texBase = texBase[:, :80] # use only first 80 basis 68 | 69 | # our face model is cropped align face landmarks which contains only 35709 vertex. 70 | # original BFM09 contains 53490 vertex, and expression basis provided by JuYong contains 53215 vertex. 71 | # thus we select corresponding vertex to get our face model. 
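# A toy illustration of the reshape -> index -> reshape pattern used below: a
# per-vertex basis stored as [3*N, K] is viewed as [N, 3, K], the rows of the
# selected vertices are gathered with an index array, and the result is
# flattened back to [3*M, K]. (Shapes and values here are made up for
# illustration only.)
#   basis = np.arange(12 * 2, dtype=np.float32).reshape([-1, 2])  # [3*4, 2]
#   keep = np.array([0, 2])                                       # keep vertices 0 and 2
#   basis = basis.reshape([-1, 3, 2])[keep].reshape([-1, 2])      # [3*2, 2]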
72 | 73 | index_exp = loadmat(os.path.join(model_dir, 'BFM_front_idx.mat')) 74 | index_exp = index_exp['idx'].astype(np.int32) - 1 # starts from 0 (to 53215) 75 | 76 | index_shape = loadmat(os.path.join(model_dir, 'BFM_exp_idx.mat')) 77 | index_shape = index_shape['trimIndex'].astype(np.int32) - 1 # starts from 0 (to 53490) 78 | index_shape = index_shape[index_exp] 79 | 80 | idBase = np.reshape(idBase, [-1, 3, 80]) 81 | idBase = idBase[index_shape, :, :] 82 | idBase = np.reshape(idBase, [-1, 80]) 83 | 84 | texBase = np.reshape(texBase, [-1, 3, 80]) 85 | texBase = texBase[index_shape, :, :] 86 | texBase = np.reshape(texBase, [-1, 80]) 87 | 88 | exBase = np.reshape(exBase, [-1, 3, 64]) 89 | exBase = exBase[index_exp, :, :] 90 | exBase = np.reshape(exBase, [-1, 64]) 91 | 92 | meanshape = np.reshape(shapeMU, [-1, 3]) / 1e5 93 | meanshape = meanshape[index_shape, :] 94 | meanshape = np.reshape(meanshape, [1, -1]) 95 | 96 | meantex = np.reshape(texMU, [-1, 3]) 97 | meantex = meantex[index_shape, :] 98 | meantex = np.reshape(meantex, [1, -1]) 99 | 100 | # other info contains triangles, region used for computing photometric loss, 101 | # region used for skin texture regularization, and 68 landmarks index etc. 102 | other_info = loadmat(os.path.join(model_dir, 'facemodel_info.mat')) 103 | frontmask2_idx = other_info['frontmask2_idx'] 104 | skinmask = other_info['skinmask'] 105 | keypoints = other_info['keypoints'] 106 | point_buf = other_info['point_buf'] 107 | tri = other_info['tri'] 108 | tri_mask2 = other_info['tri_mask2'] 109 | 110 | # save our face model 111 | savemat(os.path.join(model_dir, 'BFM_model_front.mat'), 112 | {'meanshape': meanshape, 'meantex': meantex, 'idBase': idBase, 'exBase': exBase, 'texBase': texBase, 113 | 'tri': tri, 'point_buf': point_buf, 'tri_mask2': tri_mask2 \ 114 | , 'keypoints': keypoints, 'frontmask2_idx': frontmask2_idx, 'skinmask': skinmask}) 115 | 116 | 117 | # load landmarks for standard face, which is used for image preprocessing 118 | def load_lm3d(model_dir='BFM'): 119 | Lm3D = loadmat(os.path.join(model_dir, 'similarity_Lm3D_all.mat')) 120 | Lm3D = Lm3D['lm'] 121 | 122 | # calculate 5 facial landmarks using 68 landmarks 123 | lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1 124 | Lm3D = np.stack( 125 | [Lm3D[lm_idx[0], :], np.mean(Lm3D[lm_idx[[1, 2]], :], 0), np.mean(Lm3D[lm_idx[[3, 4]], :], 0), Lm3D[lm_idx[5], :], 126 | Lm3D[lm_idx[6], :]], axis=0) 127 | Lm3D = Lm3D[[1, 2, 0, 3, 4], :] 128 | 129 | return Lm3D 130 | 131 | 132 | # save 3D face to obj file 133 | def save_obj(path, v, f, c): 134 | with open(path, 'w') as file: 135 | for i in range(len(v)): 136 | file.write('v %f %f %f %f %f %f\n' % (v[i, 0], v[i, 1], v[i, 2], c[i, 0], c[i, 1], c[i, 2])) 137 | # file.write('v %f %f %f\n'%(v[i,0],v[i,1],v[i,2])) 138 | 139 | file.write('\n') 140 | 141 | for i in range(len(f)): 142 | file.write('f %d %d %d\n' % (f[i, 0], f[i, 1], f[i, 2])) 143 | 144 | file.close() 145 | 146 | 147 | # calculating least sqaures problem 148 | def POS(xp, x): 149 | npts = xp.shape[1] 150 | 151 | A = np.zeros([2 * npts, 8]) 152 | 153 | A[0:2 * npts - 1:2, 0:3] = x.transpose() 154 | A[0:2 * npts - 1:2, 3] = 1 155 | 156 | A[1:2 * npts:2, 4:7] = x.transpose() 157 | A[1:2 * npts:2, 7] = 1; 158 | 159 | b = np.reshape(xp.transpose(), [2 * npts, 1]) 160 | 161 | k, _, _, _ = np.linalg.lstsq(A, b) 162 | 163 | R1 = k[0:3] 164 | R2 = k[4:7] 165 | sTx = k[3] 166 | sTy = k[7] 167 | s = (np.linalg.norm(R1) + np.linalg.norm(R2)) / 2 168 | t = np.stack([sTx, sTy], axis=0) 169 | 170 | return t, s 
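# A minimal usage sketch of POS(): given the 5 detected 2D landmarks as a 2x5
# array of image coordinates and the 5 standard 3D landmarks from load_lm3d()
# (defined below) as a 3x5 array, it solves a linear least-squares system for
# the translation t and scale s that Preprocess() uses to align the face. The
# landmark coordinates below are made-up placeholders, not real detections.
#   lm2d = np.array([[70., 110., 90., 75., 105.],
#                    [120., 120., 140., 160., 160.]])  # 2 x 5, (x, y) rows
#   lm3d = load_lm3d('BFM')                            # [5, 3] standard landmarks
#   t, s = POS(lm2d, lm3d.transpose())                 # t: [2, 1], s: scalar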
171 | 172 | 173 | def process_img(img, lm, t, s): 174 | w0, h0 = img.size 175 | img = img.transform(img.size, Image.AFFINE, (1, 0, t[0] - w0 / 2, 0, 1, h0 / 2 - t[1])) 176 | w = (w0 / s * 102).astype(np.int32) 177 | h = (h0 / s * 102).astype(np.int32) 178 | img = img.resize((w, h), resample=Image.BILINEAR) 179 | lm = np.stack([lm[:, 0] - t[0] + w0 / 2, lm[:, 1] - t[1] + h0 / 2], axis=1) / s * 102 180 | 181 | # crop the image to 224*224 from image center 182 | left = (w / 2 - 112).astype(np.int32) 183 | right = left + 224 184 | up = (h / 2 - 112).astype(np.int32) 185 | below = up + 224 186 | 187 | img = img.crop((left, up, right, below)) 188 | img = np.array(img) 189 | img = img[:, :, ::-1] 190 | img = np.expand_dims(img, 0) 191 | lm = lm - np.reshape(np.array([(w / 2 - 112), (h / 2 - 112)]), [1, 2]) 192 | 193 | return img, lm, t[0] - w0 / 2, h0 / 2 - t[1] 194 | 195 | 196 | # resize and crop input images before sending to the R-Net 197 | def Preprocess(img, lm, lm3D): 198 | w0, h0 = img.size 199 | 200 | # change from image plane coordinates to 3D sapce coordinates(X-Y plane) 201 | lm = np.stack([lm[:, 0], h0 - 1 - lm[:, 1]], axis=1) 202 | 203 | # calculate translation and scale factors using 5 facial landmarks and standard landmarks 204 | t, s = POS(lm.transpose(), lm3D.transpose()) 205 | # print('t = {}, s = {}'.format(t,s)) 206 | 207 | # processing the image 208 | img_new, lm_new, t0, t1 = process_img(img, lm, t, s) 209 | lm_new = np.stack([lm_new[:, 0], 223 - lm_new[:, 1]], axis=1) 210 | trans_params = np.array([w0, h0, 102.0 / s, t0, t1]) 211 | 212 | return img_new, lm_new, trans_params 213 | -------------------------------------------------------------------------------- /utils/bfm_visual.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from PIL import Image 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.join(os.getcwd(), 'utils')) 8 | from bfm_load_data import * 9 | from reconstruct_mesh import * 10 | import mesh_core_cython 11 | 12 | 13 | def isPointInTri(point, tri_points): 14 | ''' Judge whether the point is in the triangle 15 | Method: 16 | http://blackpawn.com/texts/pointinpoly/ 17 | Args: 18 | point: [u, v] or [x, y] 19 | tri_points: three vertices(2d points) of a triangle. 2 coords x 3 vertices 20 | Returns: 21 | bool: true for in triangle 22 | ''' 23 | tp = tri_points 24 | 25 | # vectors 26 | v0 = tp[:, 2] - tp[:, 0] 27 | v1 = tp[:, 1] - tp[:, 0] 28 | v2 = point - tp[:, 0] 29 | 30 | # dot products 31 | dot00 = np.dot(v0.T, v0) 32 | dot01 = np.dot(v0.T, v1) 33 | dot02 = np.dot(v0.T, v2) 34 | dot11 = np.dot(v1.T, v1) 35 | dot12 = np.dot(v1.T, v2) 36 | 37 | # barycentric coordinates 38 | if dot00 * dot11 - dot01 * dot01 == 0: 39 | inverDeno = 0 40 | else: 41 | inverDeno = 1 / (dot00 * dot11 - dot01 * dot01) 42 | 43 | u = (dot11 * dot02 - dot01 * dot12) * inverDeno 44 | v = (dot00 * dot12 - dot01 * dot02) * inverDeno 45 | 46 | # check if point in triangle 47 | return (u >= 0) & (v >= 0) & (u + v < 1) 48 | 49 | 50 | def render_texture(vertices, colors, triangles, h, w, c=3): 51 | ''' render mesh by z buffer 52 | Args: 53 | vertices: 3 x nver 54 | colors: 3 x nver 55 | triangles: 3 x ntri 56 | h: height 57 | w: width 58 | ''' 59 | # initial 60 | image = np.zeros((h, w, c), dtype=np.uint8) 61 | 62 | depth_buffer = np.zeros([h, w]) - 999999. 
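# depth_buffer starts far behind every face so the first triangle covering a
# pixel always wins; a later triangle overwrites that pixel only when its
# average depth value is larger, i.e. when it passes the z-buffer test below.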
63 | # triangle depth: approximate the depth to the average value of z in each vertex(v0, v1, v2), since the vertices are closed to each other 64 | tri_depth = (vertices[2, triangles[0, :]] + vertices[2, triangles[1, :]] + vertices[2, triangles[2, :]]) / 3. 65 | tri_tex = (colors[:, triangles[0, :]] + colors[:, triangles[1, :]] + colors[:, triangles[2, :]]) / 3. 66 | 67 | for i in range(triangles.shape[1]): 68 | tri = triangles[:, i] # 3 vertex indices 69 | 70 | # the inner bounding box 71 | umin = max(int(np.ceil(np.min(vertices[0, tri]))), 0) 72 | umax = min(int(np.floor(np.max(vertices[0, tri]))), w - 1) 73 | 74 | vmin = max(int(np.ceil(np.min(vertices[1, tri]))), 0) 75 | vmax = min(int(np.floor(np.max(vertices[1, tri]))), h - 1) 76 | 77 | if umax < umin or vmax < vmin: 78 | continue 79 | 80 | for u in range(umin, umax + 1): 81 | for v in range(vmin, vmax + 1): 82 | if tri_depth[i] > depth_buffer[v, u] and isPointInTri([u, v], vertices[:2, tri]): 83 | depth_buffer[v, u] = tri_depth[i] 84 | image[v, u, :] = tri_tex[:, i] 85 | return image 86 | 87 | 88 | def plot_bfm_coeff_seq(save_dir, facemodel, step, seq_len, real_bfm_coeff_seq, bfm_coeff_seq, id_coeff=None, texture_coeff=None): 89 | ## 9*10 block 90 | block_x = 10 91 | block_y = 9 92 | img_size = 224 93 | 94 | def merge_seq(bfm_coeff_seq, big_img, time, h_index): 95 | 96 | for i in range(time): 97 | face_shape, face_texture, face_color, face_projection, z_buffer, landmarks_2d, translation = Reconstruction( 98 | bfm_coeff_seq[0, i:i + 1, ...], facemodel) 99 | 100 | face_projection2 = np.concatenate([face_projection, z_buffer], axis=2) 101 | face_projection = np.squeeze(face_projection2, (0)) 102 | 103 | shape = np.squeeze(face_projection2, (0)) 104 | color = np.squeeze(face_color, (0)) 105 | color = np.clip(color, 0, 255).astype(np.int32) 106 | 107 | new_image = np.zeros((224 * 224 * 3), dtype=np.uint8) 108 | face_mask = np.zeros((224 * 224), dtype=np.uint8) 109 | 110 | vertices = shape.reshape(-1).astype(np.float32).copy() 111 | triangles = (facemodel.tri - 1).reshape(-1).astype(np.int32).copy() 112 | colors = color.reshape(-1).astype(np.float32).copy() 113 | depth_buffer = (np.zeros((224 * 224)) - 99999.0).astype(np.float32) 114 | mesh_core_cython.render_colors_core(new_image, face_mask, vertices, triangles, colors, depth_buffer, 115 | facemodel.tri.shape[0], 224, 224, 3) 116 | new_image = new_image.reshape([224, 224, 3]) 117 | 118 | 119 | # shape = np.squeeze(face_shape, (0)) 120 | # color = np.squeeze(face_color, (0)) 121 | # color = np.clip(color, 0, 255).astype(np.int32) 122 | # shape[:, :2] = 112 - shape[:, :2] * 112 123 | 124 | # new_image = render_texture(shape.T, color.T, (facemodel.tri - 1).astype(int).T, 224, 224, c=3) 125 | new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB) 126 | 127 | big_img[(i // block_x + h_index) * img_size: (i // block_x + h_index + 1) * img_size, 128 | (i % block_x) * img_size: (i % block_x + 1) * img_size] = new_image 129 | 130 | return big_img 131 | 132 | ### We only pick the first sequence of the batch, trim length of 30. 133 | if (seq_len[0] > 30): 134 | time = 30 135 | else: 136 | time = seq_len[0] 137 | 138 | ### We only pick the first sequence of the batch, trim length of 30. 
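### Layout of the 9x10 grid image written below (10 columns, 9 rows):
### merge_seq() renders at most 30 frames per sequence, 10 per row, so the
### ground-truth coefficients fill rows 0-2 (h_index 0) and the predicted
### coefficients fill rows 3-5 (h_index 3).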
139 | if (seq_len[0] > 30): 140 | time = 30 141 | else: 142 | time = seq_len[0] 143 | 144 | big_img = np.zeros((img_size * block_y, img_size * block_x, 3), dtype=np.uint8) 145 | big_img = merge_seq(real_bfm_coeff_seq, big_img, time, 0) 146 | 147 | if(id_coeff is None or texture_coeff is None): 148 | bfm_coeff_seq = np.concatenate([real_bfm_coeff_seq[:, :, :80], bfm_coeff_seq[:, :, :], real_bfm_coeff_seq[:, :, 144:]], axis=2) 149 | else: 150 | bfm_coeff_seq = np.concatenate([np.tile(id_coeff, (1, real_bfm_coeff_seq.shape[1], 1)), bfm_coeff_seq[:, :, :], np.tile(texture_coeff, (1, real_bfm_coeff_seq.shape[1], 1)), real_bfm_coeff_seq[:, :, 224:]], axis=2) 151 | 152 | big_img = merge_seq(bfm_coeff_seq, big_img, time, 3) 153 | 154 | cv2.imwrite('{}/bfmnet_{}.jpg'.format(save_dir, step), big_img) 155 | 156 | -------------------------------------------------------------------------------- /utils/cython/mesh_core.h: -------------------------------------------------------------------------------- 1 | #ifndef MESH_CORE_HPP_ 2 | #define MESH_CORE_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | 13 | class point 14 | { 15 | public: 16 | float x; 17 | float y; 18 | 19 | float dot(point p) 20 | { 21 | return this->x * p.x + this->y * p.y; 22 | } 23 | 24 | point operator-(const point& p) 25 | { 26 | point np; 27 | np.x = this->x - p.x; 28 | np.y = this->y - p.y; 29 | return np; 30 | } 31 | 32 | point operator+(const point& p) 33 | { 34 | point np; 35 | np.x = this->x + p.x; 36 | np.y = this->y + p.y; 37 | return np; 38 | } 39 | 40 | point operator*(float s) 41 | { 42 | point np; 43 | np.x = s * this->x; 44 | np.y = s * this->y; 45 | return np; 46 | } 47 | }; 48 | 49 | 50 | bool isPointInTri(point p, point p0, point p1, point p2, int h, int w); 51 | void get_point_weight(float* weight, point p, point p0, point p1, point p2); 52 | 53 | void _get_normal_core( 54 | float* normal, float* tri_normal, int* triangles, 55 | int ntri); 56 | 57 | void _rasterize_triangles_core( 58 | float* vertices, int* triangles, 59 | float* depth_buffer, int* triangle_buffer, float* barycentric_weight, 60 | int nver, int ntri, 61 | int h, int w); 62 | 63 | void _render_colors_core( 64 | unsigned char* image, unsigned char *face_mask, float* vertices, int* triangles, 65 | float* colors, 66 | float* depth_buffer, 67 | int ntri, 68 | int h, int w, int c); 69 | 70 | void _render_texture_core( 71 | float* image, float* vertices, int* triangles, 72 | float* texture, float* tex_coords, int* tex_triangles, 73 | float* depth_buffer, 74 | int nver, int tex_nver, int ntri, 75 | int h, int w, int c, 76 | int tex_h, int tex_w, int tex_c, 77 | int mapping_type); 78 | 79 | #endif -------------------------------------------------------------------------------- /utils/cython/mesh_core_cython.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | from libcpp.string cimport string 4 | 5 | # use the Numpy-C-API from Cython 6 | np.import_array() 7 | 8 | # cdefine the signature of our c function 9 | cdef extern from "mesh_core.h": 10 | void _rasterize_triangles_core( 11 | float* vertices, int* triangles, 12 | float* depth_buffer, int* triangle_buffer, float* barycentric_weight, 13 | int nver, int ntri, 14 | int h, int w) 15 | 16 | void _render_colors_core( 17 | unsigned char* image, unsigned char *face_mask, float* vertices, int* triangles, 18 | float* colors, 19 | float* depth_buffer, 20 | int 
ntri, 21 | int h, int w, int c) 22 | 23 | void _render_texture_core( 24 | float* image, float* vertices, int* triangles, 25 | float* texture, float* tex_coords, int* tex_triangles, 26 | float* depth_buffer, 27 | int nver, int tex_nver, int ntri, 28 | int h, int w, int c, 29 | int tex_h, int tex_w, int tex_c, 30 | int mapping_type) 31 | 32 | void _get_normal_core( 33 | float* normal, float* tri_normal, int* triangles, 34 | int ntri) 35 | 36 | void _write_obj_with_colors_texture(string filename, string mtl_name, 37 | float* vertices, int* triangles, float* colors, float* uv_coords, 38 | int nver, int ntri, int ntexver) 39 | 40 | def get_normal_core(np.ndarray[float, ndim=2, mode = "c"] normal not None, 41 | np.ndarray[float, ndim=2, mode = "c"] tri_normal not None, 42 | np.ndarray[int, ndim=2, mode="c"] triangles not None, 43 | int ntri 44 | ): 45 | _get_normal_core( 46 | np.PyArray_DATA(normal), np.PyArray_DATA(tri_normal), np.PyArray_DATA(triangles), 47 | ntri) 48 | 49 | def rasterize_triangles_core( 50 | np.ndarray[float, ndim=2, mode = "c"] vertices not None, 51 | np.ndarray[int, ndim=2, mode="c"] triangles not None, 52 | np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None, 53 | np.ndarray[int, ndim=2, mode = "c"] triangle_buffer not None, 54 | np.ndarray[float, ndim=2, mode = "c"] barycentric_weight not None, 55 | int nver, int ntri, 56 | int h, int w 57 | ): 58 | _rasterize_triangles_core( 59 | np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), 60 | np.PyArray_DATA(depth_buffer), np.PyArray_DATA(triangle_buffer), np.PyArray_DATA(barycentric_weight), 61 | nver, ntri, 62 | h, w) 63 | 64 | def render_colors_core(np.ndarray[unsigned char, ndim=1, mode = "c"] image not None, 65 | np.ndarray[unsigned char, ndim=1, mode = "c"] face_mask not None, 66 | np.ndarray[float, ndim=1, mode = "c"] vertices not None, 67 | np.ndarray[int, ndim=1, mode="c"] triangles not None, 68 | np.ndarray[float, ndim=1, mode = "c"] colors not None, 69 | np.ndarray[float, ndim=1, mode = "c"] depth_buffer not None, 70 | int ntri, 71 | int h, int w, int c 72 | ): 73 | _render_colors_core( 74 | np.PyArray_DATA(image), np.PyArray_DATA(face_mask), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), 75 | np.PyArray_DATA(colors), 76 | np.PyArray_DATA(depth_buffer), 77 | ntri, 78 | h, w, c) 79 | 80 | def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None, 81 | np.ndarray[float, ndim=2, mode = "c"] vertices not None, 82 | np.ndarray[int, ndim=2, mode="c"] triangles not None, 83 | np.ndarray[float, ndim=3, mode = "c"] texture not None, 84 | np.ndarray[float, ndim=2, mode = "c"] tex_coords not None, 85 | np.ndarray[int, ndim=2, mode="c"] tex_triangles not None, 86 | np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None, 87 | int nver, int tex_nver, int ntri, 88 | int h, int w, int c, 89 | int tex_h, int tex_w, int tex_c, 90 | int mapping_type 91 | ): 92 | _render_texture_core( 93 | np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), 94 | np.PyArray_DATA(texture), np.PyArray_DATA(tex_coords), np.PyArray_DATA(tex_triangles), 95 | np.PyArray_DATA(depth_buffer), 96 | nver, tex_nver, ntri, 97 | h, w, c, 98 | tex_h, tex_w, tex_c, 99 | mapping_type) 100 | -------------------------------------------------------------------------------- /utils/cython/setup.py: -------------------------------------------------------------------------------- 1 | ''' 2 | python setup.py build_ext -i 3 | to compile 4 | ''' 5 | 6 | # setup.py 7 | from distutils.core import setup, Extension 
8 | from Cython.Build import cythonize 9 | from Cython.Distutils import build_ext 10 | from distutils.sysconfig import get_python_lib 11 | import numpy 12 | 13 | setup( 14 | name='mesh_core_cython', 15 | cmdclass={'build_ext': build_ext}, 16 | ext_modules=[Extension("mesh_core_cython", 17 | sources=["mesh_core_cython.pyx", "mesh_core.cpp"], 18 | language='c++', 19 | include_dirs=[get_python_lib(), numpy.get_include()])], 20 | ) 21 | -------------------------------------------------------------------------------- /utils/reconstruct_mesh.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # input: coeff with shape [1,257] 5 | def Split_coeff(coeff): 6 | id_coeff = coeff[:, :80] # identity(shape) coeff of dim 80 7 | ex_coeff = coeff[:, 80:144] # expression coeff of dim 64 8 | tex_coeff = coeff[:, 144:224] # texture(albedo) coeff of dim 80 9 | angles = coeff[:, 224:227] # ruler angles(x,y,z) for rotation of dim 3 10 | gamma = coeff[:, 227:254] # lighting coeff for 3 channel SH function of dim 27 11 | translation = coeff[:, 254:] # translation coeff of dim 3 12 | 13 | return id_coeff, ex_coeff, tex_coeff, angles, gamma, translation 14 | 15 | 16 | # compute face shape with identity and expression coeff, based on BFM model 17 | # input: id_coeff with shape [1,80] 18 | # ex_coeff with shape [1,64] 19 | # output: face_shape with shape [1,N,3], N is number of vertices 20 | def Shape_formation(id_coeff, ex_coeff, facemodel): 21 | face_shape = np.einsum('ij,aj->ai', facemodel.idBase, id_coeff) + \ 22 | np.einsum('ij,aj->ai', facemodel.exBase, ex_coeff) + \ 23 | facemodel.meanshape 24 | 25 | face_shape = np.reshape(face_shape, [1, -1, 3]) 26 | # re-center face shape 27 | face_shape = face_shape - np.mean(np.reshape(facemodel.meanshape, [1, -1, 3]), axis=1, keepdims=True) 28 | 29 | return face_shape 30 | 31 | 32 | # compute vertex normal using one-ring neighborhood 33 | # input: face_shape with shape [1,N,3] 34 | # output: v_norm with shape [1,N,3] 35 | def Compute_norm(face_shape, facemodel): 36 | face_id = facemodel.tri # vertex index for each triangle face, with shape [F,3], F is number of faces 37 | point_id = facemodel.point_buf # adjacent face index for each vertex, with shape [N,8], N is number of vertex 38 | shape = face_shape 39 | face_id = (face_id - 1).astype(np.int32) 40 | point_id = (point_id - 1).astype(np.int32) 41 | v1 = shape[:, face_id[:, 0], :] 42 | v2 = shape[:, face_id[:, 1], :] 43 | v3 = shape[:, face_id[:, 2], :] 44 | e1 = v1 - v2 45 | e2 = v2 - v3 46 | face_norm = np.cross(e1, e2) # compute normal for each face 47 | face_norm = np.concatenate([face_norm, np.zeros([1, 1, 3])], 48 | axis=1) # concat face_normal with a zero vector at the end 49 | v_norm = np.sum(face_norm[:, point_id, :], axis=2) # compute vertex normal using one-ring neighborhood 50 | v_norm = v_norm / np.expand_dims(np.linalg.norm(v_norm, axis=2), 2) # normalize normal vectors 51 | 52 | return v_norm 53 | 54 | 55 | # compute vertex texture(albedo) with tex_coeff 56 | # input: tex_coeff with shape [1,N,3] 57 | # output: face_texture with shape [1,N,3], RGB order, range from 0-255 58 | def Texture_formation(tex_coeff, facemodel): 59 | face_texture = np.einsum('ij,aj->ai', facemodel.texBase, tex_coeff) + facemodel.meantex 60 | face_texture = np.reshape(face_texture, [1, -1, 3]) 61 | 62 | return face_texture 63 | 64 | 65 | # compute rotation matrix based on 3 ruler angles 66 | # input: angles with shape [1,3] 67 | # output: rotation matrix with shape 
[1,3,3] 68 | def Compute_rotation_matrix(angles): 69 | angle_x = angles[:, 0][0] 70 | angle_y = angles[:, 1][0] 71 | angle_z = angles[:, 2][0] 72 | 73 | # compute rotation matrix for X,Y,Z axis respectively 74 | rotation_X = np.array([1.0, 0, 0, \ 75 | 0, np.cos(angle_x), -np.sin(angle_x), \ 76 | 0, np.sin(angle_x), np.cos(angle_x)]) 77 | rotation_Y = np.array([np.cos(angle_y), 0, np.sin(angle_y), \ 78 | 0, 1, 0, \ 79 | -np.sin(angle_y), 0, np.cos(angle_y)]) 80 | rotation_Z = np.array([np.cos(angle_z), -np.sin(angle_z), 0, \ 81 | np.sin(angle_z), np.cos(angle_z), 0, \ 82 | 0, 0, 1]) 83 | 84 | rotation_X = np.reshape(rotation_X, [1, 3, 3]) 85 | rotation_Y = np.reshape(rotation_Y, [1, 3, 3]) 86 | rotation_Z = np.reshape(rotation_Z, [1, 3, 3]) 87 | 88 | rotation = np.matmul(np.matmul(rotation_Z, rotation_Y), rotation_X) 89 | rotation = np.transpose(rotation, axes=[0, 2, 1]) # transpose row and column (dimension 1 and 2) 90 | 91 | return rotation 92 | 93 | 94 | # project 3D face onto image plane 95 | # input: face_shape with shape [1,N,3] 96 | # rotation with shape [1,3,3] 97 | # translation with shape [1,3] 98 | # output: face_projection with shape [1,N,2] 99 | # z_buffer with shape [1,N,1] 100 | def Projection_layer(face_shape, rotation, translation, focal=1015.0, 101 | center=112.0): # we choose the focal length and camera position empirically 102 | 103 | camera_pos = np.reshape(np.array([0.0, 0.0, 10.0]), [1, 1, 3]) # camera position 104 | reverse_z = np.reshape(np.array([1.0, 0, 0, 0, 1, 0, 0, 0, -1.0]), [1, 3, 3]) 105 | 106 | p_matrix = np.concatenate([[focal], [0.0], [center], [0.0], [focal], [center], [0.0], [0.0], [1.0]], 107 | axis=0) # projection matrix 108 | p_matrix = np.reshape(p_matrix, [1, 3, 3]) 109 | 110 | # calculate face position in camera space 111 | face_shape_r = np.matmul(face_shape, rotation) 112 | face_shape_t = face_shape_r + np.reshape(translation, [1, 1, 3]) 113 | face_shape_t = np.matmul(face_shape_t, reverse_z) + camera_pos 114 | 115 | # calculate projection of face vertex using perspective projection 116 | aug_projection = np.matmul(face_shape_t, np.transpose(p_matrix, [0, 2, 1])) 117 | face_projection = aug_projection[:, :, 0:2] / np.reshape(aug_projection[:, :, 2], [1, np.shape(aug_projection)[1], 1]) 118 | z_buffer = -np.reshape(aug_projection[:, :, 2], [1, -1, 1]) 119 | 120 | return face_projection, z_buffer 121 | 122 | 123 | # compute vertex color using face_texture and SH function lighting approximation 124 | # input: face_texture with shape [1,N,3] 125 | # norm with shape [1,N,3] 126 | # gamma with shape [1,27] 127 | # output: face_color with shape [1,N,3], RGB order, range from 0-255 128 | # lighting with shape [1,N,3], color under uniform texture 129 | def Illumination_layer(face_texture, norm, gamma): 130 | # gamma = np.zeros(gamma.shape, dtype=gamma.dtype) 131 | num_vertex = np.shape(face_texture)[1] 132 | 133 | init_lit = np.array([0.8, 0, 0, 0, 0, 0, 0, 0, 0]) 134 | gamma = np.reshape(gamma, [-1, 3, 9]) 135 | gamma = gamma + np.reshape(init_lit, [1, 1, 9]) 136 | 137 | # parameter of 9 SH function 138 | a0 = np.pi 139 | a1 = 2 * np.pi / np.sqrt(3.0) 140 | a2 = 2 * np.pi / np.sqrt(8.0) 141 | c0 = 1 / np.sqrt(4 * np.pi) 142 | c1 = np.sqrt(3.0) / np.sqrt(4 * np.pi) 143 | c2 = 3 * np.sqrt(5.0) / np.sqrt(12 * np.pi) 144 | 145 | Y0 = np.tile(np.reshape(a0 * c0, [1, 1, 1]), [1, num_vertex, 1]) 146 | Y1 = np.reshape(-a1 * c1 * norm[:, :, 1], [1, num_vertex, 1]) 147 | Y2 = np.reshape(a1 * c1 * norm[:, :, 2], [1, num_vertex, 1]) 148 | Y3 = np.reshape(-a1 * c1 
* norm[:, :, 0], [1, num_vertex, 1]) 149 | Y4 = np.reshape(a2 * c2 * norm[:, :, 0] * norm[:, :, 1], [1, num_vertex, 1]) 150 | Y5 = np.reshape(-a2 * c2 * norm[:, :, 1] * norm[:, :, 2], [1, num_vertex, 1]) 151 | Y6 = np.reshape(a2 * c2 * 0.5 / np.sqrt(3.0) * (3 * np.square(norm[:, :, 2]) - 1), [1, num_vertex, 1]) 152 | Y7 = np.reshape(-a2 * c2 * norm[:, :, 0] * norm[:, :, 2], [1, num_vertex, 1]) 153 | Y8 = np.reshape(a2 * c2 * 0.5 * (np.square(norm[:, :, 0]) - np.square(norm[:, :, 1])), [1, num_vertex, 1]) 154 | 155 | Y = np.concatenate([Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8], axis=2) 156 | 157 | # Y shape:[batch,N,9]. 158 | 159 | lit_r = np.squeeze(np.matmul(Y, np.expand_dims(gamma[:, 0, :], 2)), 2) # [batch,N,9] * [batch,9,1] = [batch,N] 160 | lit_g = np.squeeze(np.matmul(Y, np.expand_dims(gamma[:, 1, :], 2)), 2) 161 | lit_b = np.squeeze(np.matmul(Y, np.expand_dims(gamma[:, 2, :], 2)), 2) 162 | 163 | # shape:[batch,N,3] 164 | face_color = np.stack([lit_r * face_texture[:, :, 0], lit_g * face_texture[:, :, 1], lit_b * face_texture[:, :, 2]], 165 | axis=2) 166 | lighting = np.stack([lit_r, lit_g, lit_b], axis=2) * 128 167 | 168 | return face_color, lighting 169 | 170 | 171 | # face reconstruction with coeff and BFM model 172 | def Reconstruction(coeff, facemodel): 173 | id_coeff, ex_coeff, tex_coeff, angles, gamma, translation = Split_coeff(coeff) 174 | # compute face shape 175 | face_shape = Shape_formation(id_coeff, ex_coeff, facemodel) 176 | # compute vertex texture(albedo) 177 | face_texture = Texture_formation(tex_coeff, facemodel) 178 | # vertex normal 179 | face_norm = Compute_norm(face_shape, facemodel) 180 | # rotation matrix 181 | rotation = Compute_rotation_matrix(angles) 182 | face_norm_r = np.matmul(face_norm, rotation) 183 | 184 | # compute vertex projection on image plane (with image sized 224*224) 185 | face_projection, z_buffer = Projection_layer(face_shape, rotation, translation) 186 | face_projection = np.stack([face_projection[:, :, 0], 224 - face_projection[:, :, 1]], axis=2) 187 | 188 | # compute 68 landmark on image plane 189 | landmarks_2d = face_projection[:, facemodel.keypoints, :] 190 | 191 | # compute vertex color using SH function lighting approximation 192 | face_color, lighting = Illumination_layer(face_texture, face_norm_r, gamma) 193 | 194 | return face_shape, face_texture, face_color, face_projection, z_buffer, landmarks_2d, translation 195 | 196 | 197 | # face reconstruction with coeff and BFM model 198 | def Reconstruction_rotation(coeff, facemodel, angles): 199 | id_coeff, ex_coeff, tex_coeff, _, gamma, translation = Split_coeff(coeff) 200 | # compute face shape 201 | face_shape = Shape_formation(id_coeff, ex_coeff, facemodel) 202 | # compute vertex texture(albedo) 203 | face_texture = Texture_formation(tex_coeff, facemodel) 204 | # vertex normal 205 | face_norm = Compute_norm(face_shape, facemodel) 206 | # rotation matrix 207 | rotation = Compute_rotation_matrix(angles) 208 | face_norm_r = np.matmul(face_norm, rotation) 209 | 210 | # rotation matrix 211 | face_shape = np.matmul(face_shape, rotation) 212 | 213 | # compute vertex projection on image plane (with image sized 224*224) 214 | face_projection, z_buffer = Projection_layer(face_shape, rotation, translation) 215 | face_projection = np.stack([face_projection[:, :, 0], 224 - face_projection[:, :, 1]], axis=2) 216 | 217 | # compute 68 landmark on image plane 218 | landmarks_2d = face_projection[:, facemodel.keypoints, :] 219 | 220 | # compute vertex color using SH function lighting approximation 221 | 
face_color, lighting = Illumination_layer(face_texture, face_norm_r, gamma) 222 | 223 | return face_shape, face_texture, face_color, face_projection, z_buffer, landmarks_2d 224 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import cv2 3 | import numpy as np 4 | import os 5 | import math 6 | 7 | alignment_handler = None 8 | dlib_detector = None 9 | 10 | def mkdir(dirname): 11 | if(not os.path.isdir(dirname)): 12 | os.makedirs(dirname) 13 | 14 | class MXDetectorHandler: 15 | ''' 16 | face 2D landmark alignment by mxnet, refer to https://github.com/deepinx/deep-face-alignment 17 | ''' 18 | def __init__(self, prefix, epoch, mx, name='model'): 19 | ctx_id = int(os.environ["CUDA_VISIBLE_DEVICES"]) 20 | if (ctx_id >= 0): 21 | ctx = mx.gpu(ctx_id) 22 | else: 23 | ctx = mx.cpu() 24 | 25 | sym, arg_params, aux_params = mx.model.load_checkpoint(os.path.join(prefix, name), epoch) 26 | all_layers = sym.get_internals() 27 | sym = all_layers['heatmap_output'] 28 | image_size = (128, 128) 29 | self.image_size = image_size 30 | model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) 31 | model.bind(for_training=False, data_shapes=[('data', (1, 3, image_size[0], image_size[1]))]) 32 | model.set_params(arg_params, aux_params) 33 | self.model = model 34 | 35 | 36 | def get_mxnet_sat_alignment(model_dir, image): 37 | ''' 38 | Arguments: 39 | model_dir: The folder contains mxnet pretrained model. 40 | image: The image contains at least 1 face inside, we only detect the first face. 41 | Returns: 42 | image: The image input. 43 | img_landmarks: The 68 landmarks' coordinates in image. 44 | img: The face area expand by sat alignment, resize to out_img_size=224. 45 | lmk_cropped: The 68 landmarks' coordinates in img. 46 | center_x: the x position of the face center in image. 47 | center_y: the y position of the face center in image. 48 | ratio: The return image size / original face area size(before resize). 
49 | ''' 50 | global alignment_handler, dlib_detector 51 | 52 | if (alignment_handler is None): 53 | alignment_handler = MXDetectorHandler(prefix=model_dir, epoch=0, mx=mx, name='model-sat') 54 | 55 | import dlib 56 | if (dlib_detector is None): 57 | dlib_detector = dlib.get_frontal_face_detector() 58 | 59 | def crop_expand_dlib(image, rect, ratio=1.5): 60 | ## rect: [left, right, top, bottom] 61 | mean = [(rect[2] + rect[3]) / 2.0, (rect[0] + rect[1]) / 2.0] 62 | ## mean: [y, x] 63 | width = rect[1] - rect[0] 64 | height = rect[3] - rect[2] 65 | 66 | max_ratio = min([(image.shape[0] - mean[0])/(height/2), (image.shape[1] - mean[1])/(width/2), mean[0]/(height/2), mean[1]/(width/2)]) 67 | if(max_ratio=0: 47 | ctx = mx.gpu(ctx_id) 48 | else: 49 | ctx = mx.cpu() 50 | sym, arg_params, aux_params = mx.model.load_checkpoint(os.path.join(prefix, "model"), epoch) 51 | all_layers = sym.get_internals() 52 | sym = all_layers['heatmap_output'] 53 | image_size = (128, 128) 54 | self.image_size = image_size 55 | model = mx.mod.Module(symbol=sym, context=ctx, label_names = None) 56 | model.bind(for_training=False, data_shapes=[('data', (1, 3, image_size[0], image_size[1]))]) 57 | model.set_params(arg_params, aux_params) 58 | self.model = model 59 | 60 | 61 | def face_alignment(image): 62 | import mxnet as mx 63 | global alignment_handler 64 | global MXDetectorHandler_prefix 65 | if(alignment_handler is None): 66 | alignment_handler = MXDetectorHandler(prefix=MXDetectorHandler_prefix, epoch=0, ctx_id=-1, mx=mx) 67 | 68 | import dlib 69 | dlib_detector = dlib.get_frontal_face_detector() 70 | 71 | def crop_expand_dlib(image, rect, ratio=1.5): 72 | ## rect: [left, right, top, bottom] 73 | mean = [(rect[2] + rect[3]) // 2, (rect[0] + rect[1]) // 2] 74 | ## mean: [y, x] 75 | half_crop_size = int((rect[1] + rect[3] - rect[0] - rect[2]) * ratio // 4) 76 | 77 | # padding if the crop area outside of image. 
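# Note: cv2.copyMakeBorder takes its border widths in the order (top, bottom,
# left, right); mean is [y, x] here, so each branch below pads one side of the
# frame with black (value 0) pixels when the expanded square crop would reach
# outside the image.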
78 | if (mean[0] - half_crop_size < 0): 79 | image = cv2.copyMakeBorder(image, 0, 0, half_crop_size - mean[0], 0, cv2.BORDER_CONSTANT, 0) 80 | if (mean[0] + half_crop_size > image.shape[1]): 81 | image = cv2.copyMakeBorder(image, 0, 0, 0, mean[0] + half_crop_size - image.shape[1], cv2.BORDER_CONSTANT, 0) 82 | if (mean[1] - half_crop_size < 0): 83 | image = cv2.copyMakeBorder(image, half_crop_size - mean[1], 0, 0, 0, cv2.BORDER_CONSTANT, 0) 84 | if (mean[1] + half_crop_size > image.shape[0]): 85 | image = cv2.copyMakeBorder(image, 0, mean[1] + half_crop_size - image.shape[0], 0, 0, cv2.BORDER_CONSTANT, 0) 86 | 87 | left = mean[1] - half_crop_size 88 | right = mean[1] + half_crop_size 89 | top = mean[0] - half_crop_size 90 | buttom = mean[0] + half_crop_size 91 | 92 | if (left < 0): 93 | left = 0 94 | if (top < 0): 95 | top = 0 96 | 97 | return image, [left, right, top, buttom] 98 | 99 | def crop_expand_alignment(img, xys, out_img_size=224, ratio=1.3): 100 | xys = np.array(map(lambda x: int(x), xys)) 101 | max_x = max(xys[::2]) 102 | max_y = max(xys[1::2]) 103 | min_x = min(xys[::2]) 104 | min_y = min(xys[1::2]) 105 | width = int((max_x - min_x) * ratio) 106 | height = int((max_y - min_y) * ratio) 107 | height = width 108 | 109 | center_x = (max_x + min_x) // 2 110 | center_y = (max_y + min_y) // 2 111 | 112 | left = center_x - width / 2 113 | top = center_y - height / 2 114 | img = img[top:top + height, left:left + width] 115 | 116 | xys[::2] -= left 117 | xys[1::2] -= top 118 | xys[::2] = xys[::2] * out_img_size / width 119 | xys[1::2] = xys[1::2] * out_img_size / height 120 | 121 | img = cv2.resize(img, (out_img_size, out_img_size)) 122 | xys = np.array(list(map(lambda x: float(x)/out_img_size, xys))) 123 | 124 | return img, xys 125 | 126 | img_gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 127 | rects = dlib_detector(img_gray, 0) 128 | if (len(rects) != 1): 129 | return None 130 | 131 | rect = [rects[0].left(), rects[0].right(), rects[0].top(), rects[0].bottom()] 132 | image, rect = crop_expand_dlib(image, rect) # dlib region is too small 133 | ## rect: [left, right, top, bottom] 134 | 135 | img = cv2.cvtColor(image[rect[2]:rect[3], rect[0]:rect[1]], cv2.COLOR_BGR2RGB) 136 | crop_width = img.shape[1] 137 | crop_height = img.shape[0] 138 | 139 | img = cv2.resize(img, (128, 128)) 140 | img = np.transpose(img, (2, 0, 1)) # 3*128*128, RGB 141 | input_blob = np.zeros((1, 3, 128, 128), dtype=np.uint8) 142 | input_blob[0] = img 143 | data = mx.nd.array(input_blob) 144 | db = mx.io.DataBatch(data=(data,)) 145 | alignment_handler.model.forward(db, is_train=False) 146 | alabel = alignment_handler.model.get_outputs()[-1].asnumpy()[0] 147 | 148 | img_landmarks = [] 149 | for j in xrange(alabel.shape[0]): 150 | a = cv2.resize(alabel[j], (128, 128)) 151 | ind = np.unravel_index(np.argmax(a, axis=None), a.shape) 152 | ## ind: [y, x] 153 | 154 | origin_x = rect[0] + ind[1] * crop_width / 128 155 | origin_y = rect[2] + ind[0] * crop_height / 128 156 | 157 | img_landmarks.append(str(origin_x)) 158 | img_landmarks.append(str(origin_y)) 159 | 160 | image, img_landmarks = crop_expand_alignment(image, img_landmarks) 161 | return image, img_landmarks 162 | 163 | def test_atnet(config_path): 164 | global wav_file 165 | global img_path 166 | img = cv2.imread(img_path) 167 | example_img, example_lmk = face_alignment(img) 168 | 169 | params = YParams(config_path, 'default') 170 | sample_rate = params.mel['sample_rate'] 171 | hop_step = params.mel['hop_step'] 172 | win_length = params.mel['win_length'] 173 | 
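# The audio/video alignment below relies on two ratios:
#   frame_wav_scale  = sample_rate / frame_rate    (PCM samples per video frame)
#   frame_mfcc_scale = frame_wav_scale / hop_step  (MFCC frames per video frame)
# For example, with sample_rate=16000, frame_rate=25 and hop_step=160 (values
# assumed for illustration, not read from config/params.yml), there are 640
# samples and 4 MFCC frames per video frame; the assert below only requires
# frame_mfcc_scale to be an integer.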
frame_rate = params.frame_rate 174 | mean = np.load(params.mean_file) 175 | component = np.load(params.components_file) 176 | 177 | example_lmk = np.dot((example_lmk - mean), component[:,:20]) 178 | example_lmk *= np.array([1.5, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0,2.0,1.0,1.0, 1,1,1,1,1, 1,1,1,1,1]) 179 | example_lmk = np.dot(example_lmk, component[:,:20].T) 180 | 181 | wav_loader = WavLoader(sr=sample_rate) 182 | 183 | pose = np.ones([1000,3], dtype=np.float32)*0.0 184 | ear = np.ones([1000,1], dtype=np.float32)*0.6 185 | ear[40:75,:] = np.ones([35,1], dtype=np.float32)*0.2 186 | 187 | pcm = wav_loader.get_data(wav_file) 188 | 189 | frame_wav_scale = sample_rate / frame_rate 190 | frame_mfcc_scale = frame_wav_scale / hop_step 191 | 192 | assert (frame_mfcc_scale - int(frame_mfcc_scale) == 0), "sample_rate/hop_step must divided by frame_rate." 193 | 194 | frame_mfcc_scale = int(frame_mfcc_scale) 195 | min_len = min(ear.shape[0], pose.shape[0], pcm.shape[0]//frame_wav_scale) 196 | 197 | g1 = tf.Graph() 198 | with g1.as_default(): 199 | 200 | ear = tf.convert_to_tensor(ear[np.newaxis, :min_len, :], dtype=tf.float32) 201 | pose = tf.convert_to_tensor(pose[np.newaxis, :min_len, :], dtype=tf.float32) 202 | seq_len = tf.convert_to_tensor(np.array([min_len]), dtype=tf.int32) 203 | example_landmark = tf.convert_to_tensor(example_lmk[np.newaxis, :], dtype=tf.float32) 204 | 205 | pcm_length = hop_step * (min_len * frame_mfcc_scale - 1) + win_length 206 | if (pcm.shape[0] < pcm_length): 207 | pcm = np.pad(pcm, (0, pcm_length - pcm.shape[0]), 'constant', constant_values=(0)) 208 | elif(pcm.shape[0] > pcm_length): 209 | pcm = pcm[:pcm_length] 210 | mfcc = extract_mfcc(pcm[np.newaxis, :], params) 211 | 212 | atnet = ATNet(config_path) 213 | params = atnet.params 214 | params.batch_size = 1 215 | atnet.set_params(params) 216 | 217 | infer_nodes = atnet.build_inference_op(ear, pose, mfcc, example_landmark, seq_len) 218 | 219 | sess = tf.Session() 220 | sess.run(tf.global_variables_initializer()) 221 | tf.train.Saver().restore(sess, 'ckpt_atnet/atnet-80000') 222 | lmk_seq = sess.run(infer_nodes['LandmarkDecoder']) 223 | save_lmkseq_video(lmk_seq, mean, "atnet.avi", wav_file) 224 | 225 | return example_img, example_lmk, lmk_seq 226 | 227 | def test_vgnet(config_path, example_img, example_landmark, lmk_seq): 228 | example_img = cv2.resize(example_img, (128, 128)).astype(np.float32)[np.newaxis, ...] 
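# The two steps below map the uint8 image range [0, 255] to roughly [-1, 1]
# (divide by 256, then shift and scale by 0.5), the same value range that the
# VGNet tests expect for example_img.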
229 | example_img /= 256.0 230 | example_img = (example_img - 0.5) / 0.5 231 | 232 | params = YParams(config_path, 'default') 233 | 234 | g2 = tf.Graph() 235 | with g2.as_default(): 236 | example_landmark = tf.convert_to_tensor(example_landmark[np.newaxis, :], dtype=tf.float32) 237 | example_img = tf.convert_to_tensor(example_img, dtype=tf.float32) 238 | seq_len = tf.convert_to_tensor(np.array([lmk_seq.shape[1]]), dtype=tf.int32) 239 | lmk_seq = tf.convert_to_tensor((lmk_seq), dtype=tf.float32) 240 | 241 | vgnet = VGNet(config_path) 242 | params = vgnet.params 243 | params.batch_size = 1 244 | vgnet.set_params(params) 245 | 246 | infer_nodes = vgnet.build_inference_op(lmk_seq, example_landmark, example_img, seq_len) 247 | 248 | sess = tf.Session(graph=g2) 249 | sess.run(tf.global_variables_initializer()) 250 | tf.train.Saver().restore(sess, 'ckpt_vgnet/vgnet-70000') 251 | img_seq = sess.run(infer_nodes['Fake_img_seq']) 252 | 253 | save_imgseq_video(img_seq, "vgnet.mp4", wav_file) 254 | 255 | 256 | if (__name__ == '__main__'): 257 | 258 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 259 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 260 | help='the config yaml file') 261 | 262 | opts, argv = cmd_parser.parse_args() 263 | 264 | if (opts.config_path is None): 265 | logger.error('Please check your parameters.') 266 | exit(0) 267 | 268 | config_path = opts.config_path 269 | 270 | if (not os.path.exists(config_path)): 271 | logger.error('config_path not exists') 272 | exit(0) 273 | 274 | example_img, example_landmark, lmk_seq = test_atnet(config_path) 275 | test_vgnet(config_path, example_img, example_landmark, lmk_seq) 276 | 277 | 278 | # lmk_seq = [] 279 | # example_image = None 280 | # example_landmark = None 281 | # params = YParams(config_path, 'default') 282 | # mean = np.load(params.mean_file) 283 | # component = np.load(params.components_file) 284 | 285 | # wav_file = '/Users/donglu/Downloads/cctv_cut.wav' 286 | # cap = cv2.VideoCapture('/Users/donglu/Downloads/cctv_cut.mp4') 287 | # if (cap.isOpened()): 288 | # success, image = cap.read() 289 | # idx = 0 290 | # while (success): 291 | # idx += 1 292 | # if(idx==100): 293 | # break 294 | # [h, w, c] = image.shape 295 | # if c > 3: 296 | # image = image[:, :, :3] 297 | # example_img, example_lmk = face_alignment(image) 298 | # example_lmk = np.dot((example_lmk - mean), component[:,:20]) 299 | # example_lmk *= np.array([1.5, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0,2.0,1.0,1.0, 1,1,1,1,1, 1,1,1,1,1]) 300 | # example_lmk = np.dot(example_lmk, component[:,:20].T) 301 | # if(example_image is None): 302 | # example_image = example_img 303 | # if(example_landmark is None): 304 | # example_landmark = example_lmk 305 | # lmk_seq.append(example_lmk) 306 | 307 | # success, image = cap.read() 308 | # cap.release() 309 | # lmk_seq = np.array(lmk_seq)[np.newaxis,...] 310 | # save_lmkseq_video(lmk_seq, mean, "atnet.avi", wav_file) 311 | 312 | # test_vgnet(config_path, example_image, example_landmark, lmk_seq) 313 | 314 | 315 | -------------------------------------------------------------------------------- /voicepuppet/atvgnet/plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import numpy as np 4 | import os 5 | import cv2 6 | import subprocess 7 | 8 | 9 | def strokeline_lookup(): 10 | ''' 11 | the strokeline index of 68 points. 
12 | ''' 13 | Mouth = [[48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54], [54, 55], [55, 56], [56, 57], \ 14 | [57, 58], [58, 59], [59, 48], [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66], \ 15 | [66, 67], [67, 60]] 16 | 17 | Nose = [[27, 28], [28, 29], [29, 30], [30, 31], [30, 35], [31, 32], [32, 33], \ 18 | [33, 34], [34, 35], [27, 31], [27, 35]] 19 | 20 | leftBrow = [[17, 18], [18, 19], [19, 20], [20, 21]] 21 | rightBrow = [[22, 23], [23, 24], [24, 25], [25, 26]] 22 | 23 | leftEye = [[36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [36, 41]] 24 | rightEye = [[42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [42, 47]] 25 | 26 | other = [[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6], \ 27 | [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], \ 28 | [12, 13], [13, 14], [14, 15], [15, 16]] 29 | 30 | faceLmarkLookups = [] 31 | faceLmarkLookups.append(Mouth) 32 | faceLmarkLookups.append(Nose) 33 | faceLmarkLookups.append(leftBrow) 34 | faceLmarkLookups.append(rightBrow) 35 | faceLmarkLookups.append(leftEye) 36 | faceLmarkLookups.append(rightEye) 37 | faceLmarkLookups.append(other) 38 | return faceLmarkLookups 39 | 40 | 41 | def plot_lmk_seq(save_dir, step, mean, seq_len, real_lmk_seq, lmk_seq): 42 | ''' 43 | merge 128x128 images to a large 9*10 grid picture. 44 | ''' 45 | 46 | ## 9*10 block 47 | block_x = 10 48 | block_y = 9 49 | img_size = 128 50 | 51 | faceLmarkLookups = strokeline_lookup() 52 | 53 | def merge_seq(lmk_seq, big_img, time, h_index): 54 | 55 | for i in range(time): 56 | back_img = np.ones((img_size, img_size), dtype=np.uint8) * 255 57 | lmk = (((lmk_seq[0, i, ...] + mean)/2+0.5) * img_size).astype(np.int32) 58 | for k in range(68): 59 | cv2.circle(back_img, (int(lmk[k * 2]), int(lmk[k * 2 + 1])), 1, [0], -1) 60 | 61 | for part in faceLmarkLookups: 62 | for idx in part: 63 | cv2.line(back_img, (int(lmk[idx[0] * 2]), int(lmk[idx[0] * 2 + 1])), 64 | (int(lmk[idx[1] * 2]), int(lmk[idx[1] * 2 + 1])), (0), 1) 65 | 66 | big_img[(i // block_x + h_index) * img_size: (i // block_x + h_index + 1) * img_size, 67 | (i % block_x) * img_size: (i % block_x + 1) * img_size] = back_img 68 | 69 | return big_img 70 | 71 | ### We only pick the first sequence of the batch, trim length of 30. 72 | if (seq_len[0] > 30): 73 | time = 30 74 | else: 75 | time = seq_len[0] 76 | 77 | big_img = np.zeros((img_size * block_y, img_size * block_x), dtype=np.uint8) 78 | big_img = merge_seq(real_lmk_seq, big_img, time, 0) 79 | big_img = merge_seq(lmk_seq, big_img, time, 3) 80 | 81 | cv2.imwrite('{}/atnet_{}.jpg'.format(save_dir, step), big_img) 82 | 83 | 84 | def plot_image_seq(save_dir, step, mean, seq_len, real_lmk_seq, real_mask_seq, real_img_seq, fake_img_seq, 85 | attention_seq): 86 | ''' 87 | merge 2 sequence of image and attention map to a large image (9*10 grid picture). 88 | ''' 89 | 90 | ## 9*10 block 91 | block_x = 10 92 | block_y = 9 93 | img_size = real_img_seq.shape[2] 94 | 95 | ### We only pick the first sequence of the batch, trim length of 30. 96 | if (seq_len[0] > 30): 97 | time = 30 98 | else: 99 | time = seq_len[0] 100 | 101 | big_img = 255 * np.ones((img_size * block_y, img_size * block_x, 4), dtype=np.uint8) 102 | 103 | for i in range(time): 104 | real_img = (((real_img_seq[0, i, ...] * 0.5) + 0.5) * 256).astype(np.uint8) 105 | fake_img = (((fake_img_seq[0, i, ...] * 0.5) + 0.5) * 256).astype(np.uint8) 106 | real_mask = (((real_mask_seq[0, i, ...] + 1) / 2) * 255).astype(np.uint8) 107 | attention_img = (attention_seq[0, i, ...] 
* 256).astype(np.uint8) 108 | 109 | lmk = (((real_lmk_seq[0, i, ...] + mean)/2+0.5) * img_size).astype(np.int32) 110 | for k in range(68): 111 | cv2.circle(real_img, (int(lmk[k * 2]), int(lmk[k * 2 + 1])), 1, [255, 255, 0], 1) 112 | 113 | real_img = np.concatenate([real_img, real_mask], axis=-1) 114 | 115 | big_img[i // block_x * img_size: (i // block_x + 1) * img_size, 116 | (i % block_x) * img_size: (i % block_x + 1) * img_size, 117 | :] = real_img 118 | 119 | big_img[(i // block_x + 3) * img_size: (i // block_x + 1 + 3) * img_size, 120 | (i % block_x) * img_size: (i % block_x + 1) * img_size, 121 | :-1] = fake_img 122 | 123 | big_img[(i // block_x + 6) * img_size: (i // block_x + 1 + 6) * img_size, 124 | (i % block_x) * img_size: (i % block_x + 1) * img_size, 125 | :] = cv2.merge((attention_img, attention_img, attention_img, attention_img)) 126 | 127 | cv2.imwrite('{}/vgnet_{}.png'.format(save_dir, step), big_img) 128 | 129 | 130 | def save_lmkseq_video(lmk_seq, mean, output_file, wav_file=None): 131 | img_size = 480 132 | seq_len = lmk_seq.shape[1] 133 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 134 | output_movie = cv2.VideoWriter('temp.avi', fourcc, 25, (img_size, img_size), isColor=False) 135 | faceLmarkLookups = strokeline_lookup() 136 | 137 | for i in range(seq_len): 138 | back_img = np.ones((img_size, img_size), dtype=np.uint8) * 255 139 | lmk = (((lmk_seq[0, i, ...] + mean)/2+0.5) * img_size).astype(np.int32) 140 | for k in range(68): 141 | cv2.circle(back_img, (int(lmk[k * 2]), int(lmk[k * 2 + 1])), 1, [0], -1) 142 | 143 | for part in faceLmarkLookups: 144 | for idx in part: 145 | cv2.line(back_img, (int(lmk[idx[0] * 2]), int(lmk[idx[0] * 2 + 1])), 146 | (int(lmk[idx[1] * 2]), int(lmk[idx[1] * 2 + 1])), (0), 1) 147 | 148 | output_movie.write(back_img) 149 | 150 | if (wav_file is not None): 151 | cmd = 'ffmpeg -y -i temp.avi -i ' + wav_file + ' -c:v copy -c:a aac -strict experimental ' + output_file 152 | subprocess.call(cmd, shell=True) 153 | os.remove('temp.avi') 154 | 155 | 156 | def save_imgseq_video(img_seq, output_file, wav_file=None): 157 | def mkdir(dirname): 158 | if not os.path.isdir(dirname): 159 | os.makedirs(dirname) 160 | 161 | img_size = 128 162 | seq_len = img_seq.shape[1] 163 | mkdir('temp') 164 | 165 | for i in range(seq_len): 166 | real_img = (((img_seq[0, i, ...] 
* 0.5) + 0.5) * 256).astype(np.uint8) 167 | cv2.imwrite('temp/{}.jpg'.format(i), real_img, [int(cv2.IMWRITE_JPEG_QUALITY), 100]) 168 | 169 | if (wav_file is not None): 170 | cmd = 'ffmpeg -i temp/%d.jpg -i ' + wav_file + ' -c:v libx264 -c:a aac -strict experimental -y -vf format=yuv420p ' + output_file 171 | subprocess.call(cmd, shell=True) 172 | cmd = 'rm -rf temp temp.avi' 173 | subprocess.call(cmd, shell=True) 174 | -------------------------------------------------------------------------------- /voicepuppet/atvgnet/test_atnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """Test for ATNet architectures.""" 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | from optparse import OptionParser 9 | import tensorflow as tf 10 | import numpy as np 11 | import os 12 | from atnet import ATNet 13 | from tinynet import MfccNet 14 | 15 | 16 | class ArchitectureTest(tf.test.TestCase): 17 | 18 | def testATNet(self): 19 | config_path = 'config/params.yml' 20 | with tf.Graph().as_default(): 21 | time = 100 22 | 23 | ### ATNet setting 24 | atnet = ATNet(config_path) 25 | params = atnet.params 26 | params.batch_size = 2 27 | atnet.set_params(params) 28 | 29 | seq_len = np.random.uniform(1, 100, params.batch_size).astype(np.int32) 30 | time = max(seq_len) 31 | 32 | ## landmark: [batch_size, time, 68*2] 33 | landmark = tf.random.uniform([params.batch_size, time, params.landmark_size], minval=-1, maxval=1, 34 | dtype=tf.float32) 35 | ## ears: [batch_size, 1] 36 | ears = tf.random.uniform([params.batch_size, time, 1], minval=0, maxval=1, dtype=tf.float32) 37 | ## poses: [batch_size, 3] 38 | poses = tf.random.uniform([params.batch_size, time, 3], minval=-1, maxval=1, dtype=tf.float32) 39 | ## mfccs: [batch_size, time*frame_mfcc_scale, num_mel_bins] 40 | mfccs = tf.random.uniform([params.batch_size, time * 5, 80], dtype=tf.float32) 41 | ## example_landmark: [batch_size, 68*2] 42 | example_landmark = tf.random.uniform([params.batch_size, params.landmark_size], minval=-1, maxval=1, 43 | dtype=tf.float32) 44 | ## seq_len: [batch_size], in rational size 45 | seq_len = tf.convert_to_tensor(seq_len, dtype=tf.int32) 46 | 47 | def check_nodes(nodes): 48 | ## Test input tensor 49 | self.assertAllEqual(nodes['Landmark'].shape, landmark.shape.as_list()) 50 | self.assertAllEqual(nodes['Ears'].shape, ears.shape.as_list()) 51 | self.assertAllEqual(nodes['Poses'].shape, poses.shape.as_list()) 52 | self.assertAllEqual(nodes['Mfccs'].shape, mfccs.shape.as_list()) 53 | self.assertAllEqual(nodes['Example_landmark'].shape, example_landmark.shape.as_list()) 54 | self.assertAllEqual(nodes['Seq_len'].shape, seq_len.shape.as_list()) 55 | 56 | ## Test MfccEncoder output tensor 57 | self.assertAllEqual(nodes['MfccEncoder'].shape, [params.batch_size, time, params.encode_embedding_size]) 58 | ## Test LandmarkEncoder output tensor 59 | self.assertAllEqual(nodes['LandmarkEncoder'].shape, [params.batch_size, time, params.encode_embedding_size]) 60 | ## Test PoseEncoder output tensor 61 | self.assertAllEqual(nodes['PoseEncoder'].shape, [params.batch_size, time, params.encode_embedding_size]) 62 | ## Test RNNModule output tensor 63 | self.assertAllEqual(nodes['RNNModule'].shape, [params.batch_size, time, params.rnn_hidden_size]) 64 | ## Test LandmarkDecoder output tensor 65 | self.assertAllEqual(nodes['LandmarkDecoder'].shape, [params.batch_size, time, 
params.landmark_size]) 66 | 67 | ## Test LandmarkDecoder output value range 68 | self.assertAllGreaterEqual(nodes['LandmarkDecoder'], -2) 69 | self.assertAllLessEqual(nodes['LandmarkDecoder'], 2) 70 | 71 | ################## 1. Test train stage ################## 72 | nodes = atnet.build_train_op(landmark, ears, poses, mfccs, example_landmark, seq_len) 73 | with self.session() as sess: 74 | sess.run(tf.global_variables_initializer()) 75 | result = sess.run([nodes['Landmark'], nodes['Ears'], nodes['Poses'], nodes['Mfccs'], nodes['Example_landmark'], 76 | nodes['Seq_len'], nodes['MfccEncoder'], nodes['LandmarkEncoder'], nodes['PoseEncoder'], 77 | nodes['RNNModule'], nodes['LandmarkDecoder']]) 78 | 79 | nodes = {} 80 | nodes.update({'Landmark': result[0]}) 81 | nodes.update({'Ears': result[1]}) 82 | nodes.update({'Poses': result[2]}) 83 | nodes.update({'Mfccs': result[3]}) 84 | nodes.update({'Example_landmark': result[4]}) 85 | nodes.update({'Seq_len': result[5]}) 86 | nodes.update({'MfccEncoder': result[6]}) 87 | nodes.update({'LandmarkEncoder': result[7]}) 88 | nodes.update({'PoseEncoder': result[8]}) 89 | nodes.update({'RNNModule': result[9]}) 90 | nodes.update({'LandmarkDecoder': result[10]}) 91 | check_nodes(nodes) 92 | 93 | ################## 2. Test evaluate stage ################## 94 | nodes = atnet.build_eval_op(landmark, ears, poses, mfccs, example_landmark, seq_len) 95 | with self.session() as sess: 96 | sess.run(tf.global_variables_initializer()) 97 | result = sess.run([nodes['Landmark'], nodes['Ears'], nodes['Poses'], nodes['Mfccs'], nodes['Example_landmark'], 98 | nodes['Seq_len'], nodes['MfccEncoder'], nodes['LandmarkEncoder'], nodes['PoseEncoder'], 99 | nodes['RNNModule'], nodes['LandmarkDecoder']]) 100 | 101 | nodes = {} 102 | nodes.update({'Landmark': result[0]}) 103 | nodes.update({'Ears': result[1]}) 104 | nodes.update({'Poses': result[2]}) 105 | nodes.update({'Mfccs': result[3]}) 106 | nodes.update({'Example_landmark': result[4]}) 107 | nodes.update({'Seq_len': result[5]}) 108 | nodes.update({'MfccEncoder': result[6]}) 109 | nodes.update({'LandmarkEncoder': result[7]}) 110 | nodes.update({'PoseEncoder': result[8]}) 111 | nodes.update({'RNNModule': result[9]}) 112 | nodes.update({'LandmarkDecoder': result[10]}) 113 | check_nodes(nodes) 114 | 115 | 116 | if (__name__ == '__main__'): 117 | tf.test.main() 118 | -------------------------------------------------------------------------------- /voicepuppet/atvgnet/test_vgnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """Test for ATNet architectures.""" 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import os 11 | import random 12 | from vgnet import VGNet 13 | 14 | 15 | class ArchitectureTest(tf.test.TestCase): 16 | 17 | def testVGNet(self): 18 | config_path = 'config/params.yml' 19 | with tf.Graph().as_default(): 20 | img_size = 128 21 | 22 | ### VGNet setting 23 | vgnet = VGNet(config_path) 24 | params = vgnet.params 25 | params.batch_size = 2 26 | vgnet.set_params(params) 27 | 28 | seq_len = np.random.uniform(1, 100, params.batch_size).astype(np.int32) 29 | time = max(seq_len) 30 | 31 | ## real_landmark_seq: [batch_size, time, 68*2] 32 | real_landmark_seq = tf.random.uniform([params.batch_size, time, params.landmark_size], minval=-1, maxval=1, 33 | dtype=tf.float32) 34 | ## real_mask_seq: 
[batch_size, time, img_size, img_size, 1] 35 | real_mask_seq = tf.random.uniform([params.batch_size, time, img_size, img_size, 1], minval=0, maxval=1, 36 | dtype=tf.float32) 37 | ## real_img_seq: [batch_size, time, img_size, img_size, 3] 38 | real_img_seq = tf.random.uniform([params.batch_size, time, img_size, img_size, 3], minval=-1, maxval=1, 39 | dtype=tf.float32) 40 | ## example_landmark: [batch_size, 68*2] 41 | example_landmark = tf.random.uniform([params.batch_size, params.landmark_size], minval=-1, maxval=1, 42 | dtype=tf.float32) 43 | ## example_img: [batch_size, img_size, img_size, 3] 44 | example_img = tf.random.uniform([params.batch_size, img_size, img_size, 3], minval=-1, maxval=1, dtype=tf.float32) 45 | ## seq_len: [batch_size], in rational size 46 | seq_len = tf.convert_to_tensor(seq_len, dtype=tf.int32) 47 | 48 | def check_nodes(nodes): 49 | ## Test input tensors' shape 50 | self.assertAllEqual(nodes['Real_landmark_seq'].shape, real_landmark_seq.shape.as_list()) 51 | self.assertAllEqual(nodes['Real_mask_seq'].shape, real_mask_seq.shape.as_list()) 52 | self.assertAllEqual(nodes['Real_img_seq'].shape, real_img_seq.shape.as_list()) 53 | self.assertAllEqual(nodes['Example_landmark'].shape, example_landmark.shape.as_list()) 54 | self.assertAllEqual(nodes['Example_img'].shape, example_img.shape.as_list()) 55 | self.assertAllEqual(nodes['Seq_len'].shape, seq_len.shape.as_list()) 56 | 57 | ## Test Discriminator tensors' shape 58 | self.assertAllEqual(nodes['Discriminator']['Real_node']['Discriminator']['Decision'].shape, [params.batch_size]) 59 | self.assertAllEqual(nodes['Discriminator']['Real_node']['Discriminator']['LandmarkSeq'].shape, 60 | [params.batch_size, time, params.landmark_size]) 61 | self.assertAllEqual(nodes['Discriminator']['Fake_node']['Discriminator']['Decision'].shape, 62 | [params.batch_size]) 63 | self.assertAllEqual(nodes['Discriminator']['Fake_node']['Discriminator']['LandmarkSeq'].shape, 64 | [params.batch_size, time, params.landmark_size]) 65 | self.assertAllEqual(nodes['Discriminator']['Generator_node']['Generator']['Color'].shape, 66 | [params.batch_size, time, img_size, img_size, 3]) 67 | self.assertAllEqual(nodes['Discriminator']['Generator_node']['Generator']['Attention'].shape, 68 | [params.batch_size, time, img_size, img_size, 1]) 69 | self.assertAllEqual(nodes['Discriminator']['Generator_node']['Generator']['Feature'].shape, 70 | [params.batch_size, time, img_size, img_size, 3]) 71 | 72 | ## Test Generator tensors' shape 73 | self.assertAllEqual(nodes['Generator']['Discriminator_node']['Discriminator']['Decision'].shape, 74 | [params.batch_size]) 75 | self.assertAllEqual(nodes['Generator']['Discriminator_node']['Discriminator']['LandmarkSeq'].shape, 76 | [params.batch_size, time, params.landmark_size]) 77 | self.assertAllEqual(nodes['Generator']['Generator_node']['Generator']['Color'].shape, 78 | [params.batch_size, time, img_size, img_size, 3]) 79 | self.assertAllEqual(nodes['Generator']['Generator_node']['Generator']['Attention'].shape, 80 | [params.batch_size, time, img_size, img_size, 1]) 81 | self.assertAllEqual(nodes['Generator']['Generator_node']['Generator']['Feature'].shape, 82 | [params.batch_size, time, img_size, img_size, 3]) 83 | 84 | ## Test input tensors' value range 85 | self.assertAllGreaterEqual(nodes['Real_landmark_seq'], -1) 86 | self.assertAllLessEqual(nodes['Real_landmark_seq'], 1) 87 | self.assertAllGreaterEqual(nodes['Real_mask_seq'], 0) 88 | self.assertAllLessEqual(nodes['Real_mask_seq'], 1) 89 | 
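# The input-range assertions here and just below simply re-check the tf.random.uniform
# bounds used to build the test tensors; the substantive checks are the output ranges
# further down: Decision/Attention in [0, 1] and Color/Feature in [-1, 1], presumably
# sigmoid- and tanh-activated heads (an inference from the asserted ranges, not stated in this file).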
self.assertAllGreaterEqual(nodes['Real_img_seq'], -1) 90 | self.assertAllLessEqual(nodes['Real_img_seq'], 1) 91 | self.assertAllGreaterEqual(nodes['Example_landmark'], -1) 92 | self.assertAllLessEqual(nodes['Example_landmark'], 1) 93 | self.assertAllGreaterEqual(nodes['Example_img'], -1) 94 | self.assertAllLessEqual(nodes['Example_img'], 1) 95 | self.assertAllGreaterEqual(nodes['Seq_len'], 1) 96 | self.assertAllLessEqual(nodes['Seq_len'], time) 97 | 98 | ## Test Discriminator tensors' value range 99 | self.assertAllGreaterEqual(nodes['Discriminator']['Real_node']['Discriminator']['Decision'], 0) 100 | self.assertAllLessEqual(nodes['Discriminator']['Real_node']['Discriminator']['Decision'], 1) 101 | self.assertAllGreaterEqual(nodes['Discriminator']['Real_node']['Discriminator']['LandmarkSeq'], -2) 102 | self.assertAllLessEqual(nodes['Discriminator']['Real_node']['Discriminator']['LandmarkSeq'], 2) 103 | self.assertAllGreaterEqual(nodes['Discriminator']['Fake_node']['Discriminator']['Decision'], 0) 104 | self.assertAllLessEqual(nodes['Discriminator']['Fake_node']['Discriminator']['Decision'], 1) 105 | self.assertAllGreaterEqual(nodes['Discriminator']['Fake_node']['Discriminator']['LandmarkSeq'], -2) 106 | self.assertAllLessEqual(nodes['Discriminator']['Fake_node']['Discriminator']['LandmarkSeq'], 2) 107 | self.assertAllGreaterEqual(nodes['Discriminator']['Generator_node']['Generator']['Color'], -1) 108 | self.assertAllLessEqual(nodes['Discriminator']['Generator_node']['Generator']['Color'], 1) 109 | self.assertAllGreaterEqual(nodes['Discriminator']['Generator_node']['Generator']['Attention'], 0) 110 | self.assertAllLessEqual(nodes['Discriminator']['Generator_node']['Generator']['Attention'], 1) 111 | self.assertAllGreaterEqual(nodes['Discriminator']['Generator_node']['Generator']['Feature'], -1) 112 | self.assertAllLessEqual(nodes['Discriminator']['Generator_node']['Generator']['Feature'], 1) 113 | 114 | ## Test Generator tensors' value range 115 | self.assertAllGreaterEqual(nodes['Generator']['Discriminator_node']['Discriminator']['Decision'], 0) 116 | self.assertAllLessEqual(nodes['Generator']['Discriminator_node']['Discriminator']['Decision'], 1) 117 | self.assertAllGreaterEqual(nodes['Generator']['Discriminator_node']['Discriminator']['LandmarkSeq'], -2) 118 | self.assertAllLessEqual(nodes['Generator']['Discriminator_node']['Discriminator']['LandmarkSeq'], 2) 119 | self.assertAllGreaterEqual(nodes['Generator']['Generator_node']['Generator']['Color'], -1) 120 | self.assertAllLessEqual(nodes['Generator']['Generator_node']['Generator']['Color'], 1) 121 | self.assertAllGreaterEqual(nodes['Generator']['Generator_node']['Generator']['Attention'], 0) 122 | self.assertAllLessEqual(nodes['Generator']['Generator_node']['Generator']['Attention'], 1) 123 | self.assertAllGreaterEqual(nodes['Generator']['Generator_node']['Generator']['Feature'], -1) 124 | self.assertAllLessEqual(nodes['Generator']['Generator_node']['Generator']['Feature'], 1) 125 | 126 | def walkDict(aDict, key_list, value_list, path=()): 127 | ## visit the nodes dict into key and value list, while keep the hierarchy 128 | for k in aDict: 129 | if type(aDict[k]) != dict: 130 | if ('_grads' in k or '_tvars' in k): 131 | continue 132 | key_list.append(path + (k,)) 133 | value_list.append(aDict[k]) 134 | else: 135 | walkDict(aDict[k], key_list, value_list, path + (k,)) 136 | 137 | ################## 1. 
Test train stage ################## 138 | nodes = vgnet.build_train_op(real_landmark_seq, real_mask_seq, real_img_seq, example_landmark, example_img, 139 | seq_len) 140 | 141 | with self.session() as sess: 142 | sess.run(tf.global_variables_initializer()) 143 | ## visit the nodes dict into key and value list, while keep the hierarchy 144 | key_list = [] 145 | value_list = [] 146 | walkDict(nodes, key_list, value_list) 147 | 148 | result = sess.run(value_list) 149 | 150 | ## replace the tensor in nodes by numpy matrix after sess.run 151 | for i, tensor in enumerate(result): 152 | node = nodes 153 | for key in key_list[i]: 154 | node = node[key] 155 | node = tensor 156 | 157 | ## test the nodes' shapes and values 158 | check_nodes(nodes) 159 | 160 | ################## 2. Test evaluate stage ################## 161 | nodes = vgnet.build_eval_op(real_landmark_seq, real_mask_seq, real_img_seq, example_landmark, example_img, 162 | seq_len) 163 | with self.session() as sess: 164 | sess.run(tf.global_variables_initializer()) 165 | ## visit the nodes dict into key and value list, while keep the hierarchy 166 | key_list = [] 167 | value_list = [] 168 | walkDict(nodes, key_list, value_list) 169 | 170 | result = sess.run(value_list) 171 | 172 | ## replace the tensor in nodes by numpy matrix after sess.run 173 | for i, tensor in enumerate(result): 174 | node = nodes 175 | for key in key_list[i]: 176 | node = node[key] 177 | node = tensor 178 | 179 | ## test the nodes' shapes and values 180 | check_nodes(nodes) 181 | 182 | 183 | if (__name__ == '__main__'): 184 | tf.test.main() 185 | -------------------------------------------------------------------------------- /voicepuppet/atvgnet/train_atnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | from atnet import ATNet 9 | from dataset.generator import ATNetDataGenerator 10 | from plot import * 11 | 12 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def mkdir(dirname): 17 | if not os.path.isdir(dirname): 18 | os.makedirs(dirname) 19 | 20 | 21 | if (__name__ == '__main__'): 22 | 23 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 24 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 25 | help='the config yaml file') 26 | 27 | opts, argv = cmd_parser.parse_args() 28 | 29 | if (opts.config_path is None): 30 | logger.error('Please check your parameters.') 31 | exit(0) 32 | 33 | config_path = opts.config_path 34 | 35 | if (not os.path.exists(config_path)): 36 | logger.error('config_path not exists') 37 | exit(0) 38 | 39 | os.environ["CUDA_VISIBLE_DEVICES"] = '1' 40 | 41 | batch_size = 16 42 | ### Generator for training setting 43 | train_generator = ATNetDataGenerator(config_path) 44 | params = train_generator.params 45 | params.dataset_path = params.train_dataset_path 46 | params.batch_size = batch_size 47 | train_generator.set_params(params) 48 | train_dataset = train_generator.get_dataset() 49 | 50 | ### Generator for evaluation setting 51 | eval_generator = ATNetDataGenerator(config_path) 52 | params = eval_generator.params 53 | params.dataset_path = params.eval_dataset_path 54 | params.batch_size = batch_size 55 | eval_generator.set_params(params) 56 | eval_dataset = 
eval_generator.get_dataset() 57 | 58 | sess = tf.Session() 59 | tf.train.start_queue_runners(sess=sess) 60 | 61 | train_iter = train_dataset.make_one_shot_iterator() 62 | eval_iter = eval_dataset.make_one_shot_iterator() 63 | 64 | ### ATNet setting 65 | atnet = ATNet(config_path) 66 | params = atnet.params 67 | epochs = params.training['epochs'] 68 | params.add_hparam('max_to_keep', 10) 69 | params.add_hparam('save_dir', 'ckpt_atnet') 70 | params.add_hparam('save_name', 'atnet') 71 | params.add_hparam('save_step', 1000) 72 | params.add_hparam('eval_step', 1000) 73 | params.add_hparam('summary_step', 100) 74 | params.add_hparam('eval_visual_dir', 'log/eval_atnet') 75 | params.add_hparam('summary_dir', 'log/summary_atnet') 76 | params.batch_size = batch_size 77 | atnet.set_params(params) 78 | mean = np.load(params.mean_file) 79 | 80 | mkdir(params.save_dir) 81 | mkdir(params.eval_visual_dir) 82 | mkdir(params.summary_dir) 83 | 84 | train_nodes = atnet.build_train_op(*train_iter.get_next()) 85 | eval_nodes = atnet.build_eval_op(*eval_iter.get_next()) 86 | sess.run(tf.global_variables_initializer()) 87 | 88 | # Restore from save_dir 89 | if ('checkpoint' in os.listdir(params.save_dir)): 90 | tf.train.Saver().restore(sess, tf.train.latest_checkpoint(params.save_dir)) 91 | 92 | tf.summary.scalar("loss", train_nodes['Loss']) 93 | tf.summary.scalar("lr", train_nodes['Lr']) 94 | grads = train_nodes['Grads'] 95 | tvars = train_nodes['Tvars'] 96 | # Add histograms for gradients. 97 | for i, grad in enumerate(grads): 98 | if grad is not None: 99 | var = tvars[i] 100 | if ('BatchNorm' not in var.op.name): 101 | tf.summary.histogram(var.op.name + '/gradients', grad) 102 | 103 | merge_summary_op = tf.summary.merge_all() 104 | summary_writer = tf.summary.FileWriter(params.summary_dir, graph=sess.graph) 105 | 106 | for i in range(epochs): 107 | ### Run training 108 | result = sess.run([train_nodes['Train_op'], 109 | merge_summary_op, 110 | train_nodes['Loss'], 111 | train_nodes['Lr'], 112 | train_nodes['Global_step'], 113 | train_nodes['Mfccs'], 114 | train_nodes['Poses'], 115 | train_nodes['Ears'], 116 | train_nodes['Seq_len'], 117 | train_nodes['Landmark'], 118 | train_nodes['Example_landmark']]) 119 | _, summary, loss, lr, global_step, mfccs, poses, ears, seq_len, landmark, example_landmark = result 120 | print('Step {}: Loss= {:.3f}, Lr= {:.2e}'.format(global_step, loss, lr)) 121 | 122 | if (global_step % params.summary_step == 0): 123 | summary_writer.add_summary(summary, global_step) 124 | 125 | ### Run evaluation 126 | if (global_step % params.eval_step == 0): 127 | result = sess.run([eval_nodes['Loss'], 128 | eval_nodes['Seq_len'], 129 | eval_nodes['Landmark'], 130 | eval_nodes['LandmarkDecoder']]) 131 | loss, seq_len, real_lmk_seq, lmk_seq = result 132 | 133 | print('\r\nEvaluation >>> Loss= {:.3f}'.format(loss)) 134 | plot_lmk_seq(params.eval_visual_dir, global_step, mean, seq_len, real_lmk_seq, lmk_seq) 135 | 136 | ### Save checkpoint 137 | if (global_step % params.save_step == 0): 138 | tf.train.Saver(max_to_keep=params.max_to_keep, var_list=tf.global_variables()).save(sess, 139 | os.path.join(params.save_dir, 140 | params.save_name), 141 | global_step=global_step) 142 | -------------------------------------------------------------------------------- /voicepuppet/atvgnet/train_vgnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from 
vgnet import VGNet 7 | from dataset.generator import VGNetDataGenerator 8 | from optparse import OptionParser 9 | import logging 10 | from plot import * 11 | 12 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def mkdir(dirname): 17 | if not os.path.isdir(dirname): 18 | os.makedirs(dirname) 19 | 20 | 21 | if (__name__ == '__main__'): 22 | 23 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 24 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 25 | help='the config yaml file') 26 | 27 | opts, argv = cmd_parser.parse_args() 28 | 29 | if (opts.config_path is None): 30 | logger.error('Please check your parameters.') 31 | exit(0) 32 | 33 | config_path = opts.config_path 34 | 35 | if (not os.path.exists(config_path)): 36 | logger.error('config_path not exists') 37 | exit(0) 38 | 39 | os.environ["CUDA_VISIBLE_DEVICES"] = '2' 40 | 41 | batch_size = 4 42 | ### Generator for training setting 43 | train_generator = VGNetDataGenerator(config_path) 44 | params = train_generator.params 45 | params.dataset_path = params.train_dataset_path 46 | params.batch_size = batch_size 47 | train_generator.set_params(params) 48 | train_dataset = train_generator.get_dataset() 49 | 50 | ### Generator for evaluation setting 51 | eval_generator = VGNetDataGenerator(config_path) 52 | params = eval_generator.params 53 | params.dataset_path = params.eval_dataset_path 54 | params.batch_size = batch_size 55 | eval_generator.set_params(params) 56 | eval_dataset = eval_generator.get_dataset() 57 | 58 | sess = tf.Session() 59 | tf.train.start_queue_runners(sess=sess) 60 | 61 | train_iter = train_dataset.make_one_shot_iterator() 62 | eval_iter = eval_dataset.make_one_shot_iterator() 63 | 64 | ### VGNet setting 65 | vgnet = VGNet(config_path) 66 | params = vgnet.params 67 | epochs = params.training['epochs'] 68 | params.add_hparam('max_to_keep', 10) 69 | params.add_hparam('save_dir', 'ckpt_vgnet') 70 | params.add_hparam('save_name', 'vgnet') 71 | params.add_hparam('save_step', 1000) 72 | params.add_hparam('eval_step', 1000) 73 | params.add_hparam('summary_step', 100) 74 | params.add_hparam('alternative', 1000) 75 | params.add_hparam('eval_visual_dir', 'log/eval_vgnet') 76 | params.add_hparam('summary_dir', 'log/summary_vgnet') 77 | params.batch_size = batch_size 78 | vgnet.set_params(params) 79 | mean = np.load(params.mean_file) 80 | 81 | mkdir(params.save_dir) 82 | mkdir(params.eval_visual_dir) 83 | mkdir(params.summary_dir) 84 | 85 | train_nodes = vgnet.build_train_op(*train_iter.get_next()) 86 | eval_nodes = vgnet.build_eval_op(*eval_iter.get_next()) 87 | sess.run(tf.global_variables_initializer()) 88 | 89 | # Restore from save_dir 90 | if ('checkpoint' in os.listdir(params.save_dir)): 91 | tf.train.Saver().restore(sess, tf.train.latest_checkpoint(params.save_dir, latest_filename=None)) 92 | 93 | # Add summary when training 94 | discriminator_summary = [] 95 | discriminator_summary.append(tf.summary.scalar("real_bce_loss", train_nodes['Discriminator']['Real_bce_loss'])) 96 | discriminator_summary.append(tf.summary.scalar("real_lmk_loss", train_nodes['Discriminator']['Real_lmk_loss'])) 97 | discriminator_summary.append(tf.summary.scalar("fake_bce_loss", train_nodes['Discriminator']['Fake_bce_loss'])) 98 | discriminator_summary.append(tf.summary.scalar("fake_lmk_loss", train_nodes['Discriminator']['Fake_lmk_loss'])) 99 | discriminator_summary.append( 100 | 
tf.summary.scalar("discriminator_loss", train_nodes['Discriminator']['Discriminator_loss'])) 101 | 102 | generator_summary = [] 103 | generator_summary.append(tf.summary.scalar("bce_loss", train_nodes['Generator']['Bce_loss'])) 104 | generator_summary.append(tf.summary.scalar("lmk_loss", train_nodes['Generator']['Lmk_loss'])) 105 | generator_summary.append(tf.summary.scalar("pix_loss", train_nodes['Generator']['Pix_loss'])) 106 | generator_summary.append(tf.summary.scalar("generator_loss", train_nodes['Generator']['Generator_loss'])) 107 | 108 | # Add gradient to summary 109 | grads = train_nodes['Discriminator_grads'] 110 | tvars = train_nodes['Discriminator_tvars'] 111 | for i, grad in enumerate(grads): 112 | if grad is not None: 113 | var = tvars[i] 114 | if('BatchNorm' not in var.name): 115 | discriminator_summary.append(tf.summary.histogram(var.op.name + '/gradients', grad)) 116 | 117 | grads = train_nodes['Generator_grads'] 118 | tvars = train_nodes['Generator_tvars'] 119 | for i, grad in enumerate(grads): 120 | if grad is not None: 121 | var = tvars[i] 122 | if('BatchNorm' not in var.name): 123 | generator_summary.append(tf.summary.histogram(var.op.name + '/gradients', grad)) 124 | 125 | discriminator_summary_op = tf.summary.merge(discriminator_summary) 126 | generator_summary_op = tf.summary.merge(generator_summary) 127 | lr_summary_op = tf.summary.scalar("lr", train_nodes['Lr']) 128 | 129 | summary_writer = tf.summary.FileWriter(params.summary_dir, graph=sess.graph) 130 | 131 | # Run epoch 132 | for i in range(epochs): 133 | if ((i // params.alternative) % 2 == 0): 134 | ### Run discriminator training 135 | result = sess.run([train_nodes['Train_discriminator'], 136 | discriminator_summary_op, 137 | train_nodes['Lr'], 138 | train_nodes['Global_step'], 139 | train_nodes['Discriminator']['Real_bce_loss'], 140 | train_nodes['Discriminator']['Real_lmk_loss'], 141 | train_nodes['Discriminator']['Fake_bce_loss'], 142 | train_nodes['Discriminator']['Fake_lmk_loss'], 143 | train_nodes['Discriminator']['Discriminator_loss']]) 144 | _, summary, lr, global_step, real_bce_loss, real_lmk_loss, fake_bce_loss, fake_lmk_loss, discriminator_loss = result 145 | print( 146 | 'Step {}: Lr= {:.2e}, Discriminator_loss= {:.3f}, [Real_bce_loss= {:.3f}, Real_lmk_loss= {:.3f}, Fake_bce_loss= {:.3f}, Fake_lmk_loss= {:.3f}]'.format( 147 | global_step, lr, discriminator_loss, real_bce_loss, real_lmk_loss, fake_bce_loss, fake_lmk_loss)) 148 | 149 | else: 150 | ### Run generator training 151 | result = sess.run([train_nodes['Train_generator'], 152 | generator_summary_op, 153 | train_nodes['Lr'], 154 | train_nodes['Global_step'], 155 | train_nodes['Generator']['Bce_loss'], 156 | train_nodes['Generator']['Lmk_loss'], 157 | train_nodes['Generator']['Pix_loss'], 158 | train_nodes['Generator']['Generator_loss']]) 159 | _, summary, lr, global_step, bce_loss, lmk_loss, pix_loss, generator_loss = result 160 | print( 161 | 'Step {}: Lr= {:.2e}, Generator_loss= {:.3f}, [Bce_loss= {:.3f}, Lmk_loss= {:.3f}, Pix_loss= {:.3f}]'.format( 162 | global_step, lr, 163 | generator_loss, 164 | bce_loss, lmk_loss, pix_loss)) 165 | 166 | if (global_step % params.summary_step == 0): 167 | summary_writer.add_summary(summary, global_step) 168 | 169 | ### Run evaluation 170 | if (global_step % params.eval_step == 0): 171 | result = sess.run([eval_nodes['Real_landmark_seq'], 172 | eval_nodes['Real_mask_seq'], 173 | eval_nodes['Real_img_seq'], 174 | eval_nodes['Example_landmark'], 175 | eval_nodes['Example_img'], 176 | eval_nodes['Seq_len'], 
177 | eval_nodes['Generator']['Fake_img_seq'], 178 | eval_nodes['Generator']['Attention'], 179 | eval_nodes['Generator']['Generator_loss'], 180 | eval_nodes['Discriminator']['Discriminator_loss']]) 181 | real_landmark_seq, real_mask_seq, real_img_seq, example_landmark, example_img, seq_len, fake_img_seq, attention, generator_loss, discriminator_loss = result 182 | 183 | print('\r\nEvaluation >>> Generator_loss= {:.3f}, Discriminator_loss= {:.3f}'.format(generator_loss, 184 | discriminator_loss)) 185 | plot_image_seq(params.eval_visual_dir, global_step, mean, seq_len, real_landmark_seq, real_mask_seq, real_img_seq, 186 | fake_img_seq, attention) 187 | 188 | ### Save checkpoint 189 | if (global_step % params.save_step == 0): 190 | tf.train.Saver(max_to_keep=params.max_to_keep, var_list=tf.global_variables()).save(sess, 191 | os.path.join(params.save_dir, 192 | params.save_name), 193 | global_step=global_step) 194 | -------------------------------------------------------------------------------- /voicepuppet/bfmnet/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = [ 3 | "//visibility:public", 4 | ], 5 | features = ["-layering_check"], 6 | ) 7 | 8 | py_library( 9 | name = "tinynet", 10 | srcs = ["tinynet.py"], 11 | deps = [ 12 | ], 13 | ) 14 | 15 | py_library( 16 | name = "bfmnet", 17 | srcs = ["bfmnet.py"], 18 | deps = [ 19 | "//config:configure", 20 | ":tinynet", 21 | "//voicepuppet:builder" 22 | ], 23 | ) 24 | 25 | py_binary( 26 | name = "train_bfmnet", 27 | srcs = ["train_bfmnet.py"], 28 | deps = [ 29 | "//utils:bfm_load_data", 30 | "//utils:bfm_visual", 31 | "//utils:reconstruct_mesh", 32 | "//utils:utils", 33 | ":bfmnet", 34 | "//generator:generator" 35 | ], 36 | ) 37 | 38 | py_binary( 39 | name = "infer_bfmnet", 40 | srcs = ["infer_bfmnet.py"], 41 | deps = [ 42 | "//utils:bfm_load_data", 43 | "//utils:bfm_visual", 44 | "//utils:reconstruct_mesh", 45 | "//utils:utils", 46 | ":bfmnet", 47 | "//generator:generator", 48 | "//generator:loader" 49 | ], 50 | ) 51 | -------------------------------------------------------------------------------- /voicepuppet/bfmnet/infer_bfmnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | import subprocess 9 | from generator.loader import * 10 | from bfmnet import BFMNet 11 | from generator.generator import DataGenerator 12 | from utils.bfm_load_data import * 13 | from utils.bfm_visual import * 14 | from utils.utils import * 15 | import scipy 16 | 17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | logger = logging.getLogger(__name__) 19 | 20 | # ######################################################################################################### 21 | # facemodel = BFM('../allmodels') 22 | # def visual_3dface(root, name): 23 | # mkdir('output') 24 | # for file in os.listdir('output'): 25 | # os.system('rm -rf output/{}'.format(file)) 26 | 27 | # bfmcoeff_loader = BFMCoeffLoader() 28 | # bfm_coeff_seq = bfmcoeff_loader.get_data(os.path.join(root, 'bfmcoeff.txt')) 29 | # audio_file = os.path.join(root, 'audio.wav') 30 | # id_coeff = np.mean(bfm_coeff_seq[:, :80], 0) 31 | 32 | # for i in range(bfm_coeff_seq.shape[0]): 33 | # bfm_coeff_seq[i, :80] = id_coeff 34 | 35 | # for i in range(bfm_coeff_seq.shape[0]): 
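# (Note on the commented-out visual_3dface block: the lines above average the first 80 BFM
# coefficients, the identity part in the usual Deep3DFaceReconstruction 257-dim layout, over
# the whole sequence and write that mean back into every frame, so identity stays fixed while
# the remaining per-frame coefficients drive expression and pose; the loop that follows then
# reconstructs and rasterizes one face mesh per frame with mesh_core_cython before ffmpeg
# muxes the rendered frames with the audio track.)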
36 | # face_shape, face_texture, face_color, face_projection, z_buffer, landmarks_2d, _ = Reconstruction( 37 | # bfm_coeff_seq[i:i + 1, ...], facemodel) 38 | # if(i>300): 39 | # break 40 | # shape = np.squeeze(face_shape, (0)) 41 | # color = np.squeeze(face_color, (0)) 42 | # color = np.clip(color, 0, 255).astype(np.int32) 43 | # shape[:, :2] = 112 - shape[:, :2] * 112 44 | # shape *=3 45 | 46 | # img_size = 672 47 | # new_image = np.zeros((img_size * img_size * 3), dtype=np.uint8) 48 | # face_mask = np.zeros((img_size * img_size), dtype=np.uint8) 49 | 50 | # vertices = shape.reshape(-1).astype(np.float32).copy() 51 | # triangles = (facemodel.tri - 1).reshape(-1).astype(np.int32).copy() 52 | # colors = color.reshape(-1).astype(np.float32).copy() 53 | # depth_buffer = (np.zeros((img_size * img_size)) - 99999.0).astype(np.float32) 54 | # mesh_core_cython.render_colors_core(new_image, face_mask, vertices, triangles, colors, depth_buffer, 55 | # facemodel.tri.shape[0], img_size, img_size, 3) 56 | # new_image = new_image.reshape([img_size, img_size, 3]) 57 | 58 | # new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB) 59 | 60 | # cv2.imwrite('output/{}.jpg'.format(i), new_image) 61 | # print(i) 62 | 63 | # cmd = 'ffmpeg -i output/%d.jpg -i ' + audio_file + ' -c:v libx264 -c:a aac -strict experimental -y {}'.format(name) 64 | # subprocess.call(cmd, shell=True) 65 | 66 | # root = '/media/dong/DiskData/gridcorpus/todir/vid1' 67 | # for folder in os.listdir(root): 68 | # name = os.path.join(root, folder+'.mp4') 69 | # visual_3dface(os.path.join(root, folder), name) 70 | # sys.exit(0) 71 | # ######################################################################################################### 72 | 73 | def alignto_bfm_coeff(model_dir, img, xys): 74 | from PIL import Image 75 | import tensorflow as tf 76 | 77 | def load_graph(graph_filename): 78 | with tf.gfile.GFile(graph_filename, 'rb') as f: 79 | graph_def = tf.GraphDef() 80 | graph_def.ParseFromString(f.read()) 81 | 82 | return graph_def 83 | 84 | # read standard landmarks for preprocessing images 85 | lm3D = load_lm3d(model_dir) 86 | 87 | # build reconstruction model 88 | with tf.Graph().as_default() as graph, tf.device('/cpu:0'): 89 | images = tf.placeholder(name='input_imgs', shape=[None, 224, 224, 3], dtype=tf.float32) 90 | graph_def = load_graph(os.path.join(model_dir, "FaceReconModel.pb")) 91 | tf.import_graph_def(graph_def, name='resnet', input_map={'input_imgs:0': images}) 92 | 93 | # output coefficients of R-Net (dim = 257) 94 | coeff = graph.get_tensor_by_name('resnet/coeff:0') 95 | 96 | with tf.Session() as sess: 97 | ps = map(lambda x: int(x), xys) 98 | 99 | left_eye_x = int(round((ps[72] + ps[74] + ps[76] + ps[78] + ps[80] + ps[82]) / 6)) 100 | left_eye_y = int(round((ps[73] + ps[75] + ps[77] + ps[79] + ps[81] + ps[83]) / 6)) 101 | right_eye_x = int(round((ps[84] + ps[86] + ps[88] + ps[90] + ps[92] + ps[94]) / 6)) 102 | right_eye_y = int(round((ps[85] + ps[87] + ps[89] + ps[91] + ps[93] + ps[95]) / 6)) 103 | nose_x = int(round(ps[60])) 104 | nose_y = int(round(ps[61])) 105 | left_mouse_x = int(round(ps[96])) 106 | left_mouse_y = int(round(ps[97])) 107 | right_mouse_x = int(round(ps[108])) 108 | right_mouse_y = int(round(ps[109])) 109 | 110 | lmk5 = np.array( 111 | [[left_eye_x, left_eye_y], [right_eye_x, right_eye_y], [nose_x, nose_y], [left_mouse_x, left_mouse_y], 112 | [right_mouse_x, right_mouse_y]]) 113 | 114 | image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) 115 | # preprocess input image 116 | input_img, 
lm_new, transform_params = Preprocess(image, lmk5, lm3D) 117 | bfmcoeff = sess.run(coeff, feed_dict={images: input_img}) 118 | return bfmcoeff, input_img, transform_params 119 | 120 | 121 | if (__name__ == '__main__'): 122 | 123 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 124 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 125 | help='the config yaml file') 126 | 127 | opts, argv = cmd_parser.parse_args() 128 | 129 | if (opts.config_path is None): 130 | logger.error('Please check your parameters.') 131 | exit(0) 132 | 133 | config_path = opts.config_path 134 | 135 | if (not os.path.exists(config_path)): 136 | logger.error('config_path not exists') 137 | exit(0) 138 | 139 | image_file, audio_file = argv 140 | 141 | os.environ["CUDA_VISIBLE_DEVICES"] = '-1' 142 | mkdir('output') 143 | for file in os.listdir('output'): 144 | os.system('rm -rf output/{}'.format(file)) 145 | 146 | batch_size = 1 147 | ### Generator for inference setting 148 | infer_generator = DataGenerator(config_path) 149 | params = infer_generator.params 150 | params.batch_size = batch_size 151 | infer_generator.set_params(params) 152 | wav_loader = WavLoader(sr=infer_generator.sample_rate) 153 | pcm = wav_loader.get_data(audio_file) 154 | 155 | pad_len = int(1 + pcm.shape[0] / infer_generator.frame_wav_scale) 156 | # calculate the rational length of pcm in order to keep the alignment of mfcc and landmark sequence. 157 | pcm_length = infer_generator.hop_step * (pad_len * infer_generator.frame_mfcc_scale - 1) + infer_generator.win_length 158 | if (pcm.shape[0] < pcm_length): 159 | pcm = np.pad(pcm, (0, pcm_length - pcm.shape[0]), 'constant', constant_values=(0)) 160 | pcm_slice = pcm[:pcm_length][np.newaxis, :] 161 | 162 | ears = np.ones([1, pad_len, 1], dtype=np.float32)*0.9 163 | for i in range(pad_len//2): 164 | ears[0, i, 0] = 0.2 165 | ears = tf.convert_to_tensor(ears, dtype=tf.float32) 166 | mfcc = infer_generator.extract_mfcc(pcm_slice) 167 | img = cv2.imread(image_file) 168 | 169 | _, _, img_cropped, lmk_cropped, center_x, center_y, ratio = get_mxnet_sat_alignment(params.model_dir, img) 170 | bfmcoeff, input_img, transform_params = alignto_bfm_coeff(params.model_dir, img_cropped, lmk_cropped) 171 | ratio *= transform_params[2] 172 | tx = -int(round(transform_params[3] / ratio)) 173 | ty = -int(round(transform_params[4] / ratio)) 174 | 175 | seq_len = tf.convert_to_tensor([pad_len], dtype=tf.int32) 176 | 177 | config = tf.ConfigProto() 178 | config.gpu_options.allow_growth = True 179 | sess = tf.Session(config=config) 180 | 181 | ### BFMNet setting 182 | bfmnet = BFMNet(config_path) 183 | params = bfmnet.params 184 | params.batch_size = batch_size 185 | bfmnet.set_params(params) 186 | facemodel = BFM(params.model_dir) 187 | 188 | infer_nodes = bfmnet.build_inference_op(ears, mfcc, seq_len) 189 | sess.run(tf.global_variables_initializer()) 190 | 191 | # Restore from save_dir 192 | tf.train.Saver().restore(sess, 'ckpt_bfmnet/bfmnet-65000') 193 | 194 | ### Run inference 195 | bfm_coeff_seq = sess.run(infer_nodes['BFMCoeffDecoder']) 196 | bfmcoeff = np.tile(bfmcoeff[:, np.newaxis, :], [1, bfm_coeff_seq.shape[1], 1]) 197 | 198 | bfm_coeff_seq = np.concatenate([bfmcoeff[:, :, :80], bfm_coeff_seq[:, :, :], bfmcoeff[:, :, 144:]], axis=2) 199 | merge_images = [] 200 | 201 | ### step 2: generate tuple image sequence 202 | angles = np.array([[0, 0, 0]], dtype=np.float32) 203 | shift = 0.04 204 | for i in range(bfm_coeff_seq.shape[1]): 205 | angles[0][1] += shift 206 | 
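# The yaw term angles[0][1] is swept back and forth in steps of `shift` (0.04), with the
# direction reversed below once it passes +/-0.8; note that `angles` is not passed to
# Reconstruction() in this loop, so the sweep appears to have no effect on the rendered
# frames here (Reconstruction_rotation() in infer_bfm_pixflow.py is the variant that
# actually consumes the angles).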
if (angles[0][1] > 0.8 or angles[0][1] < -0.8): 207 | shift = -shift 208 | 209 | face_shape, face_texture, face_color, face_projection, z_buffer, landmarks_2d, _ = Reconstruction( 210 | bfm_coeff_seq[0, i:i + 1, ...], facemodel) 211 | 212 | shape = np.squeeze(face_shape, (0)) 213 | color = np.squeeze(face_color, (0)) 214 | color = np.clip(color, 0, 255).astype(np.int32) 215 | shape[:, :2] = 112 - shape[:, :2] * 112 216 | shape *=3 217 | 218 | img_size = 672 219 | new_image = np.zeros((img_size * img_size * 3), dtype=np.uint8) 220 | face_mask = np.zeros((img_size * img_size), dtype=np.uint8) 221 | 222 | vertices = shape.reshape(-1).astype(np.float32).copy() 223 | triangles = (facemodel.tri - 1).reshape(-1).astype(np.int32).copy() 224 | colors = color.reshape(-1).astype(np.float32).copy() 225 | depth_buffer = (np.zeros((img_size * img_size)) - 99999.0).astype(np.float32) 226 | mesh_core_cython.render_colors_core(new_image, face_mask, vertices, triangles, colors, depth_buffer, 227 | facemodel.tri.shape[0], img_size, img_size, 3) 228 | new_image = new_image.reshape([img_size, img_size, 3]) 229 | 230 | new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB) 231 | 232 | cv2.imwrite('output/{}.jpg'.format(i), new_image) 233 | 234 | cmd = 'ffmpeg -i output/%d.jpg -i ' + audio_file + ' -c:v libx264 -c:a aac -strict experimental -y temp.mp4' 235 | subprocess.call(cmd, shell=True) 236 | -------------------------------------------------------------------------------- /voicepuppet/bfmnet/train_bfmnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | import sys 9 | 10 | sys.path.append(os.path.join(os.getcwd(), 'generator')) 11 | sys.path.append(os.path.join(os.getcwd(), 'utils')) 12 | 13 | from bfmnet import BFMNet 14 | from generator import BFMNetDataGenerator 15 | from bfm_load_data import * 16 | from bfm_visual import * 17 | from utils import * 18 | 19 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | if (__name__ == '__main__'): 24 | 25 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 26 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 27 | help='the config yaml file') 28 | 29 | opts, argv = cmd_parser.parse_args() 30 | 31 | if (opts.config_path is None): 32 | logger.error('Please check your parameters.') 33 | exit(0) 34 | 35 | config_path = opts.config_path 36 | 37 | if (not os.path.exists(config_path)): 38 | logger.error('config_path not exists') 39 | exit(0) 40 | 41 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 42 | 43 | batch_size = 4 44 | ### Generator for training setting 45 | train_generator = BFMNetDataGenerator(config_path) 46 | params = train_generator.params 47 | params.dataset_path = params.train_dataset_path 48 | params.batch_size = batch_size 49 | train_generator.set_params(params) 50 | train_dataset = train_generator.get_dataset() 51 | 52 | ### Generator for evaluation setting 53 | eval_generator = BFMNetDataGenerator(config_path) 54 | params = eval_generator.params 55 | params.dataset_path = params.eval_dataset_path 56 | params.batch_size = batch_size 57 | eval_generator.set_params(params) 58 | eval_dataset = eval_generator.get_dataset() 59 | 60 | config = tf.ConfigProto() 61 | config.gpu_options.allow_growth 
= True 62 | sess = tf.Session(config=config) 63 | 64 | tf.train.start_queue_runners(sess=sess) 65 | 66 | train_iter = train_dataset.make_one_shot_iterator() 67 | eval_iter = eval_dataset.make_one_shot_iterator() 68 | 69 | ### BFMNet setting 70 | bfmnet = BFMNet(config_path) 71 | params = bfmnet.params 72 | epochs = params.training['epochs'] 73 | params.add_hparam('max_to_keep', 10) 74 | params.add_hparam('save_dir', 'ckpt_bfmnet') 75 | params.add_hparam('save_name', 'bfmnet') 76 | params.add_hparam('save_step', 5000) 77 | params.add_hparam('eval_step', 1000) 78 | # params.add_hparam('summary_step', 1000) 79 | params.add_hparam('eval_visual_dir', 'log/eval_bfmnet') 80 | # params.add_hparam('summary_dir', 'log/summary_bfmnet') 81 | params.batch_size = batch_size 82 | bfmnet.set_params(params) 83 | facemodel = BFM(params.model_dir) 84 | 85 | mkdir(params.save_dir) 86 | mkdir(params.eval_visual_dir) 87 | # mkdir(params.summary_dir) 88 | 89 | train_nodes = bfmnet.build_train_op(*train_iter.get_next()) 90 | eval_nodes = bfmnet.build_eval_op(*eval_iter.get_next()) 91 | sess.run(tf.global_variables_initializer()) 92 | 93 | # Restore from save_dir 94 | if ('checkpoint' in os.listdir(params.save_dir)): 95 | print('Restore from {}\n'.format(params.save_dir)) 96 | tf.train.Saver().restore(sess, tf.train.latest_checkpoint(params.save_dir)) 97 | 98 | # tf.summary.scalar("loss", train_nodes['Loss']) 99 | # tf.summary.scalar("lr", train_nodes['Lr']) 100 | # grads = train_nodes['Grads'] 101 | # tvars = train_nodes['Tvars'] 102 | # # Add histograms for gradients. 103 | # for i, grad in enumerate(grads): 104 | # if grad is not None: 105 | # var = tvars[i] 106 | # if ('BatchNorm' not in var.op.name): 107 | # tf.summary.histogram(var.op.name + '/gradients', grad) 108 | 109 | # merge_summary_op = tf.summary.merge_all() 110 | # summary_writer = tf.summary.FileWriter(params.summary_dir, graph=sess.graph) 111 | 112 | for i in range(epochs): 113 | ### Run training 114 | result = sess.run([train_nodes['Train_op'], 115 | # merge_summary_op, 116 | train_nodes['Loss'], 117 | train_nodes['Lr'], 118 | train_nodes['Global_step'], 119 | train_nodes['Mfccs'], 120 | train_nodes['Seq_len'], 121 | train_nodes['BFM_coeff_seq'], 122 | train_nodes['Ears']]) 123 | _, loss, lr, global_step, mfccs, seq_len, bfm_coeff_seq, ears = result 124 | print('Step {}: Loss= {:.3f}, Lr= {:.2e}'.format(global_step, loss, lr)) 125 | 126 | # if (global_step % params.summary_step == 0): 127 | # summary_writer.add_summary(summary, global_step) 128 | 129 | ### Run evaluation 130 | if (global_step % params.eval_step == 0): 131 | result = sess.run([eval_nodes['Loss'], 132 | eval_nodes['Seq_len'], 133 | eval_nodes['BFM_coeff_seq'], 134 | eval_nodes['BFMCoeffDecoder']]) 135 | loss, seq_len, real_bfm_coeff_seq, bfm_coeff_seq = result 136 | 137 | print('\r\nEvaluation >>> Loss= {:.3f}'.format(loss)) 138 | plot_bfm_coeff_seq(params.eval_visual_dir, facemodel, global_step, seq_len, real_bfm_coeff_seq, bfm_coeff_seq) 139 | 140 | ### Save checkpoint 141 | if (global_step % params.save_step == 0): 142 | tf.train.Saver(max_to_keep=params.max_to_keep, var_list=tf.global_variables()).save(sess, 143 | os.path.join(params.save_dir, 144 | params.save_name), 145 | global_step=global_step) 146 | -------------------------------------------------------------------------------- /voicepuppet/builder.py: -------------------------------------------------------------------------------- 1 | class ModelBuilder(object): 2 | 3 | def __init__(self): 4 | raise 
NotImplementError('__init__ not implemented.') 5 | 6 | def build_network(self): 7 | raise NotImplementError('build_network not implemented.') 8 | 9 | def __call__(self): 10 | raise NotImplementError('__call__ not implemented.') 11 | -------------------------------------------------------------------------------- /voicepuppet/pixflow/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = [ 3 | "//visibility:public", 4 | ], 5 | features = ["-layering_check"], 6 | ) 7 | 8 | py_library( 9 | name = "pixflow", 10 | srcs = ["pixflow.py"], 11 | deps = [ 12 | "//config:configure", 13 | "//voicepuppet:builder" 14 | ], 15 | ) 16 | 17 | py_binary( 18 | name = "train_pixflow", 19 | srcs = ["train_pixflow.py"], 20 | deps = [ 21 | "//utils:utils", 22 | ":pixflow", 23 | "//generator:generator", 24 | "//generator:loader" 25 | ], 26 | ) 27 | 28 | py_binary( 29 | name = "infer_pixflow", 30 | srcs = ["infer_pixflow.py"], 31 | deps = [ 32 | "//utils:bfm_load_data", 33 | "//utils:reconstruct_mesh", 34 | "//utils:bfm_visual", 35 | "//utils:utils", 36 | ":pixflow", 37 | "//voicepuppet/bfmnet:bfmnet", 38 | "//generator:generator", 39 | "//generator:loader" 40 | ], 41 | ) 42 | 43 | py_binary( 44 | name = "infer_bfm_pixflow", 45 | srcs = ["infer_bfm_pixflow.py"], 46 | deps = [ 47 | "//utils:bfm_load_data", 48 | "//utils:reconstruct_mesh", 49 | "//utils:bfm_visual", 50 | "//utils:utils", 51 | ":pixflow", 52 | "//voicepuppet/bfmnet:bfmnet", 53 | "//generator:generator", 54 | "//generator:loader" 55 | ], 56 | ) 57 | -------------------------------------------------------------------------------- /voicepuppet/pixflow/infer_bfm_pixflow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | import subprocess 9 | from pixflow import PixFlowNet 10 | from voicepuppet.bfmnet.bfmnet import BFMNet 11 | from generator.loader import * 12 | from generator.generator import DataGenerator 13 | from utils.bfm_load_data import * 14 | from utils.bfm_visual import * 15 | from utils.utils import * 16 | import scipy 17 | import random 18 | 19 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 20 | logger = logging.getLogger(__name__) 21 | 22 | def alignto_bfm_coeff(model_dir, img, xys): 23 | from PIL import Image 24 | import tensorflow as tf 25 | 26 | def load_graph(graph_filename): 27 | with tf.gfile.GFile(graph_filename, 'rb') as f: 28 | graph_def = tf.GraphDef() 29 | graph_def.ParseFromString(f.read()) 30 | 31 | return graph_def 32 | 33 | # read standard landmarks for preprocessing images 34 | lm3D = load_lm3d(model_dir) 35 | 36 | # build reconstruction model 37 | with tf.Graph().as_default() as graph, tf.device('/cpu:0'): 38 | images = tf.placeholder(name='input_imgs', shape=[None, 224, 224, 3], dtype=tf.float32) 39 | graph_def = load_graph(os.path.join(model_dir, "FaceReconModel.pb")) 40 | tf.import_graph_def(graph_def, name='resnet', input_map={'input_imgs:0': images}) 41 | 42 | # output coefficients of R-Net (dim = 257) 43 | coeff = graph.get_tensor_by_name('resnet/coeff:0') 44 | 45 | with tf.Session() as sess: 46 | ps = map(lambda x: int(x), xys) 47 | 48 | left_eye_x = int(round((ps[72] + ps[74] + ps[76] + ps[78] + ps[80] + ps[82]) / 6)) 49 | left_eye_y = int(round((ps[73] + ps[75] + ps[77] + ps[79] + 
ps[81] + ps[83]) / 6)) 50 | right_eye_x = int(round((ps[84] + ps[86] + ps[88] + ps[90] + ps[92] + ps[94]) / 6)) 51 | right_eye_y = int(round((ps[85] + ps[87] + ps[89] + ps[91] + ps[93] + ps[95]) / 6)) 52 | nose_x = int(round(ps[60])) 53 | nose_y = int(round(ps[61])) 54 | left_mouse_x = int(round(ps[96])) 55 | left_mouse_y = int(round(ps[97])) 56 | right_mouse_x = int(round(ps[108])) 57 | right_mouse_y = int(round(ps[109])) 58 | 59 | lmk5 = np.array( 60 | [[left_eye_x, left_eye_y], [right_eye_x, right_eye_y], [nose_x, nose_y], [left_mouse_x, left_mouse_y], 61 | [right_mouse_x, right_mouse_y]]) 62 | 63 | image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) 64 | # preprocess input image 65 | input_img, lm_new, transform_params = Preprocess(image, lmk5, lm3D) 66 | bfmcoeff = sess.run(coeff, feed_dict={images: input_img}) 67 | return bfmcoeff, input_img, transform_params 68 | 69 | angles = np.array([[0, 0, 0]], dtype=np.float32) 70 | shift = 0.005 71 | 72 | def render_face(center_x, center_y, ratio, bfmcoeff, img, transform_params, facemodel): 73 | ratio *= transform_params[2] 74 | tx = -int((transform_params[3] / ratio)) 75 | ty = -int((transform_params[4] / ratio)) 76 | global angles, shift 77 | 78 | # angles[0][0] += shift 79 | # angles[0][1] += shift 80 | # angles[0][2] += shift 81 | # if (angles[0][1] > 0.03 or angles[0][1] < -0.03): 82 | # shift = -shift 83 | 84 | face_shape, face_texture, face_color, face_projection, z_buffer, landmarks_2d = Reconstruction_rotation( 85 | bfmcoeff, facemodel, angles) 86 | face_projection2 = np.concatenate([face_projection, z_buffer], axis=2) 87 | face_projection = np.squeeze(face_projection2, (0)) 88 | 89 | shape = np.squeeze(face_projection2, (0)) 90 | color = np.squeeze(face_color, (0)) 91 | color = np.clip(color, 0, 255).astype(np.int32) 92 | 93 | new_image = np.zeros((224 * 224 * 3), dtype=np.uint8) 94 | face_mask = np.zeros((224 * 224), dtype=np.uint8) 95 | 96 | vertices = shape.reshape(-1).astype(np.float32).copy() 97 | triangles = (facemodel.tri - 1).reshape(-1).astype(np.int32).copy() 98 | colors = color.reshape(-1).astype(np.float32).copy() 99 | depth_buffer = (np.zeros((224 * 224)) - 99999.0).astype(np.float32) 100 | mesh_core_cython.render_colors_core(new_image, face_mask, vertices, triangles, colors, depth_buffer, 101 | facemodel.tri.shape[0], 224, 224, 3) 102 | new_image = new_image.reshape([224, 224, 3]) 103 | 104 | new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB) 105 | new_image = cv2.resize(new_image, ( 106 | int(round(new_image.shape[0] / ratio)), int(round(new_image.shape[1] / ratio)))) 107 | 108 | back_new_image = np.zeros((img.shape[0], img.shape[1], img.shape[2]), dtype=img.dtype) 109 | center_face_x = new_image.shape[1] // 2 110 | center_face_y = new_image.shape[0] // 2 111 | 112 | ry = center_y - center_face_y + new_image.shape[0] - ty 113 | rx = center_x - center_face_x + new_image.shape[1] - tx 114 | back_new_image[center_y - center_face_y - ty:ry, center_x - center_face_x - tx:rx, :] = new_image 115 | return back_new_image 116 | 117 | 118 | if (__name__ == '__main__'): 119 | 120 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 121 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 122 | help='the config yaml file') 123 | 124 | opts, argv = cmd_parser.parse_args() 125 | 126 | if (opts.config_path is None): 127 | logger.error('Please check your parameters.') 128 | exit(0) 129 | 130 | config_path = opts.config_path 131 | 132 | if (not os.path.exists(config_path)): 
133 | logger.error('config_path not exists') 134 | exit(0) 135 | 136 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 137 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 138 | 139 | image_file, audio_file = argv 140 | 141 | mkdir('output') 142 | for file in os.listdir('output'): 143 | os.system('rm -rf output/{}'.format(file)) 144 | 145 | batch_size = 1 146 | ### Generator for inference setting 147 | infer_generator = DataGenerator(config_path) 148 | params = infer_generator.params 149 | params.batch_size = batch_size 150 | infer_generator.set_params(params) 151 | wav_loader = WavLoader(sr=infer_generator.sample_rate) 152 | pcm = wav_loader.get_data(audio_file) 153 | facemodel = BFM(params.pretrain_dir) 154 | 155 | pad_len = int(1 + pcm.shape[0] / infer_generator.frame_wav_scale) 156 | # calculate the rational length of pcm in order to keep the alignment of mfcc and landmark sequence. 157 | pcm_length = infer_generator.hop_step * (pad_len * infer_generator.frame_mfcc_scale - 1) + infer_generator.win_length 158 | if (pcm.shape[0] < pcm_length): 159 | pcm = np.pad(pcm, (0, pcm_length - pcm.shape[0]), 'constant', constant_values=(0)) 160 | pcm_slice = pcm[:pcm_length][np.newaxis, :] 161 | 162 | mfcc = infer_generator.extract_mfcc(pcm_slice) 163 | img_size = 512 164 | img = cv2.imread(image_file)[:, :512, :] 165 | img, img_landmarks, img_cropped, lmk_cropped, center_x, center_y, ratio = get_mxnet_sat_alignment(params.pretrain_dir, img) 166 | bfmcoeff, input_img, transform_params = alignto_bfm_coeff(params.pretrain_dir, img_cropped, lmk_cropped) 167 | 168 | img = cv2.cvtColor(cv2.imread(image_file), cv2.COLOR_BGR2RGB).astype(np.float32)/255.0 169 | face3d_refer = img[:, 512:512*2, :] 170 | fg_refer = img[:, :512, :] * img[:, 512*2:, :] 171 | img = img[:, :512, :] 172 | 173 | with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess: 174 | seq_len = tf.convert_to_tensor([pad_len], dtype=tf.int32) 175 | ear = np.random.rand(1, pad_len, 1).astype(np.float32)/100 176 | ear = tf.convert_to_tensor(ear, dtype=tf.float32) 177 | 178 | with tf.variable_scope('localization'): 179 | ### BFMNet setting 180 | bfmnet = BFMNet(config_path) 181 | params = bfmnet.params 182 | params.batch_size = 1 183 | bfmnet.set_params(params) 184 | 185 | bfmnet_nodes = bfmnet.build_inference_op(ear, mfcc, seq_len) 186 | 187 | with tf.variable_scope('recognition'): 188 | ### Vid2VidNet setting 189 | vid2vidnet = PixFlowNet(config_path) 190 | params = vid2vidnet.params 191 | params.batch_size = 1 192 | vid2vidnet.set_params(params) 193 | 194 | inputs_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 6]) 195 | targets_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 6]) 196 | vid2vid_nodes = vid2vidnet.build_inference_op(inputs_holder, targets_holder) 197 | 198 | variables_to_restore = tf.global_variables() 199 | loc_varlist = {v.name[13:][:-2]: v 200 | for v in variables_to_restore if v.name[:12]=='localization'} 201 | rec_varlist = {v.name[12:][:-2]: v 202 | for v in variables_to_restore if v.name[:11]=='recognition'} 203 | 204 | loc_saver = tf.train.Saver(var_list=loc_varlist) 205 | rec_saver = tf.train.Saver(var_list=rec_varlist) 206 | 207 | sess.run(tf.global_variables_initializer()) 208 | loc_saver.restore(sess, 'ckpt_bfmnet_new3/bfmnet-40000') 209 | rec_saver.restore(sess, 'ckpt_pixflow3/pixflownet-50000') 210 | 211 | ### Run inference 212 | bfm_coeff_seq = sess.run(bfmnet_nodes['BFMCoeffDecoder']) 213 | bfmcoeff = np.tile(bfmcoeff[:, np.newaxis, :], 
[1, bfm_coeff_seq.shape[1], 1]) 214 | 215 | bfm_coeff_seq = np.concatenate([bfmcoeff[:, :, :80], bfm_coeff_seq[:, :, :], bfmcoeff[:, :, 144:]], axis=2) 216 | 217 | inputs = np.zeros([1, img_size, img_size, 6], dtype=np.float32) 218 | inputs[0, ..., 0:3] = face3d_refer 219 | 220 | for i in range(bfm_coeff_seq.shape[1]): 221 | face3d = render_face(center_x+random.randint(0, 0), center_y+random.randint(0, 0), ratio, bfm_coeff_seq[0, i:i + 1, ...], img, transform_params, facemodel) 222 | # cv2.imwrite('output/{}.jpg'.format(i), face3d) 223 | face3d = cv2.cvtColor(face3d, cv2.COLOR_BGR2RGB).astype(np.float32)/255.0 224 | 225 | inputs[0, ..., 0:3] = face3d 226 | 227 | bg_img = np.zeros([1, img_size, img_size, 6], dtype=np.float32) 228 | bg_img[0, ..., :3] = cv2.resize(cv2.imread('background/{}.jpg'.format(i+1)), (img_size, img_size)).astype(np.float32)/255.0 229 | bg_img[0, ..., 3:] = bg_img[0, ..., :3] 230 | 231 | # bg_img = cv2.cvtColor(bg_img, cv2.COLOR_BGR2RGB) 232 | frames = sess.run(vid2vid_nodes['Outputs'], 233 | feed_dict={inputs_holder: inputs, targets_holder: bg_img}) 234 | 235 | cv2.imwrite('output/{}.jpg'.format(i), cv2.cvtColor((frames[0,..., :3]*255).astype(np.uint8), cv2.COLOR_BGR2RGB)) 236 | 237 | # image_loader = ImageLoader() 238 | # for index in range(4, 195): 239 | # img = image_loader.get_data(os.path.join('/media/dong/DiskData/gridcorpus/todir_vid2vid/vid1/05', '{}.jpg'.format(index))) 240 | # face3d = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)[:, img_size:img_size*2, :] 241 | 242 | # inputs[0, ..., 3:6] = inputs[0, ..., 6:9] 243 | # inputs[0, ..., 6:9] = face3d 244 | 245 | # frames, last = sess.run([vid2vid_nodes['Outputs'], vid2vid_nodes['Outputs_FG']], 246 | # feed_dict={inputs_holder: inputs, fg_inputs_holder: fg_inputs, targets_holder: np.tile(bg_img, (1, 1, 3))[np.newaxis, ...]}) 247 | # fg_inputs[0, ..., 3:6] = last 248 | 249 | # cv2.imwrite('output/{}.jpg'.format(index), cv2.cvtColor((last[0,...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB)) 250 | -------------------------------------------------------------------------------- /voicepuppet/pixflow/infer_pixflow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | import subprocess 9 | from pixflow import PixFlowNet 10 | from voicepuppet.bfmnet.bfmnet import BFMNet 11 | from generator.loader import * 12 | from generator.generator import DataGenerator 13 | from utils.bfm_load_data import * 14 | from utils.bfm_visual import * 15 | from utils.utils import * 16 | import scipy 17 | 18 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | if (__name__ == '__main__'): 23 | 24 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 25 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 26 | help='the config yaml file') 27 | 28 | opts, argv = cmd_parser.parse_args() 29 | 30 | if (opts.config_path is None): 31 | logger.error('Please check your parameters.') 32 | exit(0) 33 | 34 | config_path = opts.config_path 35 | 36 | if (not os.path.exists(config_path)): 37 | logger.error('config_path not exists') 38 | exit(0) 39 | 40 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 41 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 42 | 43 | mkdir('output') 44 | for file in 
os.listdir('output'): 45 | os.system('rm -rf output/{}'.format(file)) 46 | 47 | batch_size = 1 48 | img_size = 512 49 | image_loader = ImageLoader() 50 | root = '/media/dong/DiskData/gridcorpus/todir_vid2vid/vid1/05' 51 | bg_img = np.zeros([1, img_size, img_size, 6], dtype=np.float32) 52 | bg_img[0, ..., :3] = cv2.resize(cv2.imread('/home/dong/Downloads/bg.jpg'), (img_size, img_size)).astype(np.float32)/255.0 53 | bg_img[0, ..., 3:] = bg_img[0, ..., :3] 54 | 55 | with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess: 56 | with tf.variable_scope('recognition'): 57 | ### Vid2VidNet setting 58 | vid2vidnet = PixFlowNet(config_path) 59 | params = vid2vidnet.params 60 | params.batch_size = 1 61 | vid2vidnet.set_params(params) 62 | 63 | inputs_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 6]) 64 | targets_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 6]) 65 | vid2vid_nodes = vid2vidnet.build_inference_op(inputs_holder, targets_holder) 66 | 67 | variables_to_restore = tf.global_variables() 68 | rec_varlist = {v.name[12:][:-2]: v 69 | for v in variables_to_restore if v.name[:11]=='recognition'} 70 | 71 | rec_saver = tf.train.Saver(var_list=rec_varlist) 72 | 73 | sess.run(tf.global_variables_initializer()) 74 | rec_saver.restore(sess, 'ckpt_pixflow3/pixflownet-60000') 75 | 76 | inputs = np.zeros([1, img_size, img_size, 6], dtype=np.float32) 77 | 78 | img = image_loader.get_data(os.path.join(root, '{}.jpg'.format(10))) 79 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 80 | inputs[0, :, :, 0:3] = img[:, img_size:img_size*2, :] 81 | 82 | for index in range(195): 83 | img = image_loader.get_data(os.path.join(root, '{}.jpg'.format(index))) 84 | if (img is not None): 85 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 86 | inputs[0, ..., 3:6] = img[:, img_size:img_size*2, :] 87 | 88 | frames = sess.run(vid2vid_nodes['Outputs'], 89 | feed_dict={inputs_holder: inputs, targets_holder: bg_img}) 90 | 91 | cv2.imwrite('output/_{}.jpg'.format(index), cv2.cvtColor((frames[0,...,3:]*255).astype(np.uint8), cv2.COLOR_BGR2RGB)) 92 | # cv2.imshow('', last[0, ...]) 93 | # cv2.waitKey(0) 94 | 95 | 96 | # cv2.imwrite('output/_{}.jpg'.format(i), cv2.cvtColor((frames[0,...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB)) 97 | 98 | # cmd = 'ffmpeg -i output/_%d.jpg -i ' + audio_file + ' -c:v libx264 -c:a aac -strict experimental -y temp2.mp4' 99 | # subprocess.call(cmd, shell=True) 100 | 101 | # cmd = 'ffmpeg -i output/%d.jpg -i ' + audio_file + ' -c:v libx264 -c:a aac -strict experimental -y temp.mp4' 102 | # subprocess.call(cmd, shell=True) 103 | -------------------------------------------------------------------------------- /voicepuppet/pixflow/train_pixflow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | from pixflow import PixFlowNet 9 | from generator.generator import PixFlowDataGenerator 10 | from utils.utils import * 11 | 12 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | if (__name__ == '__main__'): 17 | 18 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 19 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 20 | help='the config yaml file') 21 | 22 | opts, argv 
= cmd_parser.parse_args() 23 | 24 | if (opts.config_path is None): 25 | logger.error('Please check your parameters.') 26 | exit(0) 27 | 28 | config_path = opts.config_path 29 | 30 | if (not os.path.exists(config_path)): 31 | logger.error('config_path not exists') 32 | exit(0) 33 | 34 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 35 | 36 | batch_size = 3 37 | ### Generator for training setting 38 | train_generator = PixFlowDataGenerator(config_path) 39 | params = train_generator.params 40 | params.dataset_path = params.train_dataset_path 41 | params.batch_size = batch_size 42 | train_generator.set_params(params) 43 | train_dataset = train_generator.get_dataset() 44 | 45 | config = tf.ConfigProto() 46 | config.gpu_options.allow_growth = True 47 | sess = tf.Session(config=config) 48 | tf.train.start_queue_runners(sess=sess) 49 | 50 | train_iter = train_dataset.make_one_shot_iterator() 51 | 52 | # inputs, fg_inputs, targets, masks = sess.run(train_iter.get_next()) 53 | # inp1 = cv2.cvtColor((inputs[0,...,0:3]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 54 | # inp2 = cv2.cvtColor((inputs[0,...,3:6]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 55 | # fg_inputs1 = cv2.cvtColor((fg_inputs[0,...,0:3]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 56 | # fg_inputs2 = cv2.cvtColor((fg_inputs[0,...,3:6]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 57 | # targets1 = cv2.cvtColor((targets[0,...,0:3]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 58 | # targets2 = cv2.cvtColor((targets[0,...,3:6]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 59 | # masks1 = cv2.cvtColor((masks[0,...,0:3]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 60 | # masks2 = cv2.cvtColor((masks[0,...,3:6]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 61 | 62 | # cv2.imwrite('to/inp1.jpg', inp1) 63 | # cv2.imwrite('to/inp2.jpg', inp2) 64 | # cv2.imwrite('to/fg_inputs1.jpg', fg_inputs1) 65 | # cv2.imwrite('to/fg_inputs2.jpg', fg_inputs2) 66 | # cv2.imwrite('to/targets1.jpg', targets1) 67 | # cv2.imwrite('to/targets2.jpg', targets2) 68 | # cv2.imwrite('to/masks1.jpg', masks1) 69 | # cv2.imwrite('to/masks2.jpg', masks2) 70 | # sys.exit(0) 71 | 72 | 73 | ### Vid2VidNet setting 74 | vid2vidnet = PixFlowNet(config_path) 75 | params = vid2vidnet.params 76 | epochs = params.training['epochs'] 77 | params.add_hparam('max_to_keep', 2) 78 | params.add_hparam('save_dir', 'ckpt_pixflow') 79 | params.add_hparam('save_name', 'pixflownet') 80 | params.add_hparam('save_step', 5000) 81 | params.add_hparam('summary_step', 100) 82 | params.add_hparam('summary_dir', 'log/summary_pixflow') 83 | params.batch_size = batch_size 84 | vid2vidnet.set_params(params) 85 | 86 | mkdir(params.save_dir) 87 | mkdir(params.summary_dir) 88 | 89 | train_nodes = vid2vidnet.build_train_op(*train_iter.get_next()) 90 | sess.run(tf.global_variables_initializer()) 91 | 92 | # Restore from save_dir 93 | if ('checkpoint' in os.listdir(params.save_dir)): 94 | tf.train.Saver().restore(sess, tf.train.latest_checkpoint(params.save_dir)) 95 | 96 | tf.summary.scalar("discriminator_loss", train_nodes['Discrim_loss']) 97 | tf.summary.scalar("generator_loss_GAN", train_nodes['Gen_loss_GAN']) 98 | tf.summary.scalar("generator_loss_L1", train_nodes['Gen_loss_L1']) 99 | 100 | with tf.name_scope("inputs_summary"): 101 | tf.summary.image("inputs", tf.image.convert_image_dtype(train_nodes['Inputs'][... ,3:6], dtype=tf.uint8)) 102 | 103 | with tf.name_scope("targets_summary"): 104 | tf.summary.image("targets", tf.image.convert_image_dtype(train_nodes['FG_Inputs'][... 
,3:6], dtype=tf.uint8)) 105 | 106 | with tf.name_scope("outputs_summary"): 107 | tf.summary.image("outputs", tf.image.convert_image_dtype(train_nodes['Outputs'], dtype=tf.uint8)) 108 | 109 | with tf.name_scope("alpha_summary"): 110 | tf.summary.image("alphas", tf.image.convert_image_dtype(train_nodes['Alphas'], dtype=tf.uint8)) 111 | 112 | # Add histograms for gradients. 113 | for grad, var in train_nodes['Discrim_grads_and_vars'] + train_nodes['Gen_grads_and_vars']: 114 | if(grad is not None): 115 | tf.summary.histogram(var.op.name, grad) 116 | 117 | merge_summary_op = tf.summary.merge_all() 118 | summary_writer = tf.summary.FileWriter(params.summary_dir, graph=sess.graph) 119 | 120 | for i in range(epochs): 121 | ### Run training 122 | result = sess.run([train_nodes['Train_op'], 123 | merge_summary_op, 124 | train_nodes['Gen_loss_GAN'], 125 | train_nodes['Gen_loss_L1'], 126 | train_nodes['Discrim_loss'], 127 | train_nodes['Lr'], 128 | train_nodes['Global_step']]) 129 | _, summary, gen_loss_GAN, gen_loss_L1, discrim_loss, lr, global_step = result 130 | if(global_step % params.summary_step==0): 131 | print('Step {}, Lr= {:.2e}: \n\tgen_loss_GAN= {:.3f}, \n\tgen_loss_L1= {:.3f}, \n\tdiscrim_loss= {:.3f}'.format(global_step, lr, gen_loss_GAN, gen_loss_L1, discrim_loss)) 132 | summary_writer.add_summary(summary, global_step) 133 | 134 | ### Save checkpoint 135 | if (global_step % params.save_step == 0): 136 | tf.train.Saver(max_to_keep=params.max_to_keep, var_list=tf.global_variables()).save(sess, 137 | os.path.join(params.save_dir, 138 | params.save_name), 139 | global_step=global_step) 140 | -------------------------------------------------------------------------------- /voicepuppet/pixrefer/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = [ 3 | "//visibility:public", 4 | ], 5 | features = ["-layering_check"], 6 | ) 7 | 8 | py_library( 9 | name = "vgg_simple", 10 | srcs = ["vgg_simple.py"], 11 | deps = [ 12 | "//config:configure", 13 | "//voicepuppet:builder" 14 | ], 15 | ) 16 | 17 | py_library( 18 | name = "pixrefer", 19 | srcs = ["pixrefer.py"], 20 | deps = [ 21 | "//config:configure", 22 | "//voicepuppet:builder", 23 | ":vgg_simple" 24 | ], 25 | ) 26 | 27 | py_binary( 28 | name = "train_pixrefer", 29 | srcs = ["train_pixrefer.py"], 30 | deps = [ 31 | "//utils:utils", 32 | ":pixrefer", 33 | "//generator:generator", 34 | "//generator:loader" 35 | ], 36 | ) 37 | 38 | py_binary( 39 | name = "infer_pixrefer", 40 | srcs = ["infer_pixrefer.py"], 41 | deps = [ 42 | "//utils:bfm_load_data", 43 | "//utils:reconstruct_mesh", 44 | "//utils:bfm_visual", 45 | "//utils:utils", 46 | ":pixrefer", 47 | "//voicepuppet/bfmnet:bfmnet", 48 | "//generator:generator", 49 | "//generator:loader" 50 | ], 51 | ) 52 | 53 | py_binary( 54 | name = "infer_bfmvid", 55 | srcs = ["infer_bfmvid.py"], 56 | deps = [ 57 | "//utils:bfm_load_data", 58 | "//utils:reconstruct_mesh", 59 | "//utils:bfm_visual", 60 | "//utils:utils", 61 | ":pixrefer", 62 | "//voicepuppet/bfmnet:bfmnet", 63 | "//generator:generator", 64 | "//generator:loader" 65 | ], 66 | ) 67 | -------------------------------------------------------------------------------- /voicepuppet/pixrefer/infer_bfmvid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | import subprocess 9 
| import scipy 10 | import random 11 | import sys 12 | 13 | sys.path.append(os.path.join(os.getcwd(), 'utils')) 14 | 15 | from pixrefer import PixReferNet 16 | from voicepuppet.bfmnet.bfmnet import BFMNet 17 | from generator.loader import * 18 | from generator.generator import DataGenerator 19 | from bfm_load_data import * 20 | from bfm_visual import * 21 | from utils import * 22 | 23 | bfmcoeff_loader = BFMCoeffLoader() 24 | # vid_bfmcoeff = bfmcoeff_loader.get_data('/media/dong/DiskData/gridcorpus/todir/bilibili/4_16/bfmcoeff.txt') 25 | 26 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 27 | logger = logging.getLogger(__name__) 28 | 29 | def alignto_bfm_coeff(model_dir, img, xys): 30 | from PIL import Image 31 | import tensorflow as tf 32 | 33 | def load_graph(graph_filename): 34 | with tf.gfile.GFile(graph_filename, 'rb') as f: 35 | graph_def = tf.GraphDef() 36 | graph_def.ParseFromString(f.read()) 37 | 38 | return graph_def 39 | 40 | # read standard landmarks for preprocessing images 41 | lm3D = load_lm3d(model_dir) 42 | 43 | # build reconstruction model 44 | with tf.Graph().as_default() as graph, tf.device('/cpu:0'): 45 | images = tf.placeholder(name='input_imgs', shape=[None, 224, 224, 3], dtype=tf.float32) 46 | graph_def = load_graph(os.path.join(model_dir, "FaceReconModel.pb")) 47 | tf.import_graph_def(graph_def, name='resnet', input_map={'input_imgs:0': images}) 48 | 49 | # output coefficients of R-Net (dim = 257) 50 | coeff = graph.get_tensor_by_name('resnet/coeff:0') 51 | 52 | with tf.Session() as sess: 53 | ps = list(map(lambda x: int(x), xys)) 54 | 55 | left_eye_x = int(round((ps[72] + ps[74] + ps[76] + ps[78] + ps[80] + ps[82]) / 6)) 56 | left_eye_y = int(round((ps[73] + ps[75] + ps[77] + ps[79] + ps[81] + ps[83]) / 6)) 57 | right_eye_x = int(round((ps[84] + ps[86] + ps[88] + ps[90] + ps[92] + ps[94]) / 6)) 58 | right_eye_y = int(round((ps[85] + ps[87] + ps[89] + ps[91] + ps[93] + ps[95]) / 6)) 59 | nose_x = int(round(ps[60])) 60 | nose_y = int(round(ps[61])) 61 | left_mouse_x = int(round(ps[96])) 62 | left_mouse_y = int(round(ps[97])) 63 | right_mouse_x = int(round(ps[108])) 64 | right_mouse_y = int(round(ps[109])) 65 | 66 | lmk5 = np.array( 67 | [[left_eye_x, left_eye_y], [right_eye_x, right_eye_y], [nose_x, nose_y], [left_mouse_x, left_mouse_y], 68 | [right_mouse_x, right_mouse_y]]) 69 | 70 | image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) 71 | # preprocess input image 72 | input_img, lm_new, transform_params = Preprocess(image, lmk5, lm3D) 73 | bfmcoeff = sess.run(coeff, feed_dict={images: input_img}) 74 | return bfmcoeff, input_img, transform_params 75 | 76 | angles = np.array([[0, 0, 0]], dtype=np.float32) 77 | shift = 0.005 78 | 79 | def render_face(center_x, center_y, ratio, bfmcoeff, img, transform_params, facemodel): 80 | ratio *= transform_params[2] 81 | tx = -int((transform_params[3] / ratio)) 82 | ty = -int((transform_params[4] / ratio)) 83 | global angles, shift 84 | 85 | angles[0][0] += shift 86 | angles[0][1] += shift 87 | angles[0][2] += shift 88 | if (angles[0][1] > 0.03 or angles[0][1] < -0.03): 89 | shift = -shift 90 | 91 | face_shape, face_texture, face_color, face_projection, z_buffer, landmarks_2d = Reconstruction_rotation( 92 | bfmcoeff, facemodel, angles) 93 | face_projection2 = np.concatenate([face_projection, z_buffer], axis=2) 94 | face_projection = np.squeeze(face_projection2, (0)) 95 | 96 | shape = np.squeeze(face_projection2, (0)) 97 | color = np.squeeze(face_color, (0)) 98 | 
color = np.clip(color, 0, 255).astype(np.int32) 99 | 100 | new_image = np.zeros((224 * 224 * 3), dtype=np.uint8) 101 | face_mask = np.zeros((224 * 224), dtype=np.uint8) 102 | 103 | vertices = shape.reshape(-1).astype(np.float32).copy() 104 | triangles = (facemodel.tri - 1).reshape(-1).astype(np.int32).copy() 105 | colors = color.reshape(-1).astype(np.float32).copy() 106 | depth_buffer = (np.zeros((224 * 224)) - 99999.0).astype(np.float32) 107 | mesh_core_cython.render_colors_core(new_image, face_mask, vertices, triangles, colors, depth_buffer, 108 | facemodel.tri.shape[0], 224, 224, 3) 109 | new_image = new_image.reshape([224, 224, 3]) 110 | 111 | new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB) 112 | new_image = cv2.resize(new_image, ( 113 | int(round(new_image.shape[0] / ratio)), int(round(new_image.shape[1] / ratio)))) 114 | 115 | back_new_image = np.zeros((img.shape[0], img.shape[1], img.shape[2]), dtype=img.dtype) 116 | center_face_x = new_image.shape[1] // 2 117 | center_face_y = new_image.shape[0] // 2 118 | 119 | ry = center_y - center_face_y + new_image.shape[0] - ty 120 | rx = center_x - center_face_x + new_image.shape[1] - tx 121 | back_new_image[center_y - center_face_y - ty:ry, center_x - center_face_x - tx:rx, :] = new_image 122 | return back_new_image 123 | 124 | 125 | if (__name__ == '__main__'): 126 | 127 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 128 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 129 | help='the config yaml file') 130 | 131 | opts, argv = cmd_parser.parse_args() 132 | 133 | if (opts.config_path is None): 134 | logger.error('Please check your parameters.') 135 | exit(0) 136 | 137 | config_path = opts.config_path 138 | 139 | if (not os.path.exists(config_path)): 140 | logger.error('config_path not exists') 141 | exit(0) 142 | 143 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 144 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 145 | 146 | image_file, audio_file = argv 147 | 148 | mkdir('output') 149 | for file in os.listdir('output'): 150 | os.system('rm -rf output/{}'.format(file)) 151 | 152 | batch_size = 1 153 | ### Generator for inference setting 154 | infer_generator = DataGenerator(config_path) 155 | params = infer_generator.params 156 | params.batch_size = batch_size 157 | infer_generator.set_params(params) 158 | wav_loader = WavLoader(sr=infer_generator.sample_rate) 159 | pcm = wav_loader.get_data(audio_file) 160 | facemodel = BFM(params.model_dir) 161 | 162 | pad_len = int(1 + pcm.shape[0] / infer_generator.frame_wav_scale) 163 | # calculate the rational length of pcm in order to keep the alignment of mfcc and landmark sequence. 
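  # (The pcm_length below is the standard no-padding STFT relation hop_step*(N-1) + win_length
  #  with N = pad_len * frame_mfcc_scale, i.e. just enough samples to produce N analysis frames,
  #  so each of the pad_len video frames maps to exactly frame_mfcc_scale MFCC frames.)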
164 | pcm_length = infer_generator.hop_step * (pad_len * infer_generator.frame_mfcc_scale - 1) + infer_generator.win_length 165 | if (pcm.shape[0] < pcm_length): 166 | pcm = np.pad(pcm, (0, pcm_length - pcm.shape[0]), 'constant', constant_values=(0)) 167 | pcm_slice = pcm[:pcm_length][np.newaxis, :] 168 | 169 | mfcc = infer_generator.extract_mfcc(pcm_slice) 170 | img_size = 512 171 | img = cv2.imread(image_file)[:, :512, :] 172 | img, img_landmarks, img_cropped, lmk_cropped, center_x, center_y, ratio = get_mxnet_sat_alignment(params.model_dir, img) 173 | bfmcoeff, input_img, transform_params = alignto_bfm_coeff(params.model_dir, img_cropped, lmk_cropped) 174 | 175 | img = cv2.cvtColor(cv2.imread(image_file), cv2.COLOR_BGR2RGB).astype(np.float32)/255.0 176 | face3d_refer = img[:, 512:512*2, :] 177 | fg_refer = img[:, :512, :] * img[:, 512*2:, :] 178 | img = img[:, :512, :] 179 | 180 | with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess: 181 | seq_len = tf.convert_to_tensor([pad_len], dtype=tf.int32) 182 | ear = np.random.rand(1, pad_len, 1).astype(np.float32)/100 183 | ear = tf.convert_to_tensor(ear, dtype=tf.float32) 184 | 185 | with tf.variable_scope('bfm_scope'): 186 | ### BFMNet setting 187 | bfmnet = BFMNet(config_path) 188 | params = bfmnet.params 189 | params.batch_size = 1 190 | bfmnet.set_params(params) 191 | 192 | bfmnet_nodes = bfmnet.build_inference_op(ear, mfcc, seq_len) 193 | 194 | with tf.variable_scope('vid_scope'): 195 | ### Vid2VidNet setting 196 | vid2vidnet = PixReferNet(config_path) 197 | params = vid2vidnet.params 198 | params.batch_size = 1 199 | params.add_hparam('is_training', False) 200 | vid2vidnet.set_params(params) 201 | 202 | inputs_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 6]) 203 | fg_inputs_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 3]) 204 | targets_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 3]) 205 | vid2vid_nodes = vid2vidnet.build_inference_op(inputs_holder, fg_inputs_holder, targets_holder) 206 | 207 | variables_to_restore = tf.global_variables() 208 | bfm_varlist = {v.name[len('bfm_scope')+1:][:-2]: v 209 | for v in variables_to_restore if v.name[:len('bfm_scope')]=='bfm_scope'} 210 | vid_varlist = {v.name[len('vid_scope')+1:][:-2]: v 211 | for v in variables_to_restore if v.name[:len('vid_scope')]=='vid_scope'} 212 | 213 | bfm_saver = tf.train.Saver(var_list=bfm_varlist) 214 | vid_saver = tf.train.Saver(var_list=vid_varlist) 215 | 216 | sess.run(tf.global_variables_initializer()) 217 | bfm_saver.restore(sess, 'ckpt_bfmnet/bfmnet-65000') 218 | vid_saver.restore(sess, 'ckpt_pixrefer/pixrefernet-20000') 219 | 220 | # ### Run inference 221 | bfm_coeff_seq = sess.run(bfmnet_nodes['BFMCoeffDecoder']) 222 | # bfm_coeff_seq = vid_bfmcoeff[np.newaxis, :, 80:144] 223 | bfmcoeff = np.tile(bfmcoeff[:, np.newaxis, :], [1, bfm_coeff_seq.shape[1], 1]) 224 | bfm_coeff_seq = np.concatenate([bfmcoeff[:, :, :80], bfm_coeff_seq, bfmcoeff[:, :, 144:]], axis=2) 225 | 226 | inputs = np.zeros([1, img_size, img_size, 6], dtype=np.float32) 227 | fg_inputs = np.zeros([1, img_size, img_size, 3], dtype=np.float32) 228 | inputs[0, ..., 0:3] = face3d_refer 229 | fg_inputs[0, ..., 0:3] = fg_refer 230 | 231 | for i in range(bfm_coeff_seq.shape[1]):# 232 | face3d = render_face(center_x+random.randint(-0, 0), center_y+random.randint(-0, 0), ratio, bfm_coeff_seq[0, i:i + 1, ...], img, transform_params, facemodel) 233 | # cv2.imwrite('output/{}.jpg'.format(i), face3d) 
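      # Scale the rendered face to float32 in [0, 1] and write it into channels 3:6 of
      # `inputs`; together with the reference render already stored in channels 0:3 it forms
      # the 6-channel conditioning tensor fed to PixReferNet through inputs_holder.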
234 | face3d = cv2.cvtColor(face3d, cv2.COLOR_BGR2RGB).astype(np.float32)/255.0 235 | 236 | inputs[0, ..., 3:6] = face3d 237 | 238 | bg_img = cv2.resize(cv2.imread('background/{}.jpg'.format(i%100+1)), (img_size, img_size)).astype(np.float32)/255.0 239 | bg_img = cv2.cvtColor(bg_img, cv2.COLOR_BGR2RGB) 240 | frames, last = sess.run([vid2vid_nodes['Outputs'], vid2vid_nodes['Outputs_FG']], 241 | feed_dict={inputs_holder: inputs, fg_inputs_holder: fg_inputs, targets_holder: bg_img[np.newaxis, ...]}) 242 | 243 | cv2.imwrite('output/{}.jpg'.format(i), cv2.cvtColor((frames[0,...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB)) 244 | 245 | cmd = 'ffmpeg -i output/%d.jpg -i ' + audio_file + ' -c:v libx264 -c:a aac -strict experimental -y output.mp4' 246 | subprocess.call(cmd, shell=True) 247 | 248 | # image_loader = ImageLoader() 249 | # for index in range(4, 195): 250 | # img = image_loader.get_data(os.path.join('/media/dong/DiskData/gridcorpus/todir_vid2vid/vid1/05', '{}.jpg'.format(index))) 251 | # face3d = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)[:, img_size:img_size*2, :] 252 | 253 | # inputs[0, ..., 3:6] = inputs[0, ..., 6:9] 254 | # inputs[0, ..., 6:9] = face3d 255 | 256 | # frames, last = sess.run([vid2vid_nodes['Outputs'], vid2vid_nodes['Outputs_FG']], 257 | # feed_dict={inputs_holder: inputs, fg_inputs_holder: fg_inputs, targets_holder: np.tile(bg_img, (1, 1, 3))[np.newaxis, ...]}) 258 | # fg_inputs[0, ..., 3:6] = last 259 | 260 | # cv2.imwrite('output/{}.jpg'.format(index), cv2.cvtColor((last[0,...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB)) 261 | -------------------------------------------------------------------------------- /voicepuppet/pixrefer/infer_pixrefer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | import subprocess 9 | from pixrefer import PixReferNet 10 | from voicepuppet.bfmnet.bfmnet import BFMNet 11 | from generator.loader import * 12 | from generator.generator import DataGenerator 13 | from utils.bfm_load_data import * 14 | from utils.bfm_visual import * 15 | from utils.utils import * 16 | import scipy 17 | 18 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | if (__name__ == '__main__'): 23 | 24 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 25 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 26 | help='the config yaml file') 27 | 28 | opts, argv = cmd_parser.parse_args() 29 | 30 | if (opts.config_path is None): 31 | logger.error('Please check your parameters.') 32 | exit(0) 33 | 34 | config_path = opts.config_path 35 | 36 | if (not os.path.exists(config_path)): 37 | logger.error('config_path not exists') 38 | exit(0) 39 | 40 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 41 | os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' 42 | 43 | mkdir('output') 44 | for file in os.listdir('output'): 45 | os.system('rm -rf output/{}'.format(file)) 46 | 47 | batch_size = 1 48 | img_size = 512 49 | image_loader = ImageLoader() 50 | root = '/media/dong/DiskData/gridcorpus/todir_vid2vid/vid1/05' 51 | bg_img = cv2.resize(cv2.imread('/home/dong/Downloads/bg.jpg'), (img_size, img_size)).astype(np.float32)/255.0 52 | 53 | with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess: 54 | 
with tf.variable_scope('recognition'): 55 | ### Vid2VidNet setting 56 | vid2vidnet = PixReferNet(config_path) 57 | params = vid2vidnet.params 58 | params.batch_size = 1 59 | params.add_hparam('is_training', False) 60 | vid2vidnet.set_params(params) 61 | 62 | inputs_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 6]) 63 | fg_inputs_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 3]) 64 | targets_holder = tf.placeholder(tf.float32, shape=[None, img_size, img_size, 3]) 65 | vid2vid_nodes = vid2vidnet.build_inference_op(inputs_holder, fg_inputs_holder, targets_holder) 66 | 67 | variables_to_restore = tf.global_variables() 68 | rec_varlist = {v.name[12:][:-2]: v 69 | for v in variables_to_restore if v.name[:11]=='recognition'} 70 | 71 | rec_saver = tf.train.Saver(var_list=rec_varlist) 72 | 73 | sess.run(tf.global_variables_initializer()) 74 | rec_saver.restore(sess, 'ckpt_pixrefer/pixrefernet-20000') 75 | 76 | inputs = np.zeros([1, img_size, img_size, 6], dtype=np.float32) 77 | fg_inputs = np.zeros([1, img_size, img_size, 3], dtype=np.float32) 78 | 79 | img = image_loader.get_data(os.path.join(root, '{}.jpg'.format(0))) 80 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 81 | inputs[0, :, :, 0:3] = img[:, img_size:img_size*2, :] 82 | fg_inputs[0, :, :, 0:3] = img[:, :img_size, :] * img[:, img_size*2:, :] 83 | 84 | for index in range(4, 195): 85 | img = image_loader.get_data(os.path.join(root, '{}.jpg'.format(index))) 86 | if (img is not None): 87 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 88 | inputs[0, ..., 3:6] = img[:, img_size:img_size*2, :] 89 | 90 | frames, last = sess.run([vid2vid_nodes['Outputs'], vid2vid_nodes['Outputs_FG']], 91 | feed_dict={inputs_holder: inputs, fg_inputs_holder: fg_inputs, targets_holder: bg_img[np.newaxis, ...]}) 92 | 93 | cv2.imwrite('output/_{}.jpg'.format(index), cv2.cvtColor((frames[0,...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB)) 94 | # cv2.imshow('', last[0, ...]) 95 | # cv2.waitKey(0) 96 | 97 | 98 | # cv2.imwrite('output/_{}.jpg'.format(i), cv2.cvtColor((frames[0,...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB)) 99 | 100 | # cmd = 'ffmpeg -i output/_%d.jpg -i ' + audio_file + ' -c:v libx264 -c:a aac -strict experimental -y temp2.mp4' 101 | # subprocess.call(cmd, shell=True) 102 | 103 | # cmd = 'ffmpeg -i output/%d.jpg -i ' + audio_file + ' -c:v libx264 -c:a aac -strict experimental -y temp.mp4' 104 | # subprocess.call(cmd, shell=True) 105 | -------------------------------------------------------------------------------- /voicepuppet/pixrefer/train_pixrefer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | from optparse import OptionParser 7 | import logging 8 | from pixrefer import PixReferNet 9 | from generator.generator import PixReferDataGenerator 10 | from utils.utils import * 11 | 12 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | if (__name__ == '__main__'): 17 | 18 | cmd_parser = OptionParser(usage="usage: %prog [options] --config_path <>") 19 | cmd_parser.add_option('--config_path', type="string", dest="config_path", 20 | help='the config yaml file') 21 | 22 | opts, argv = cmd_parser.parse_args() 23 | 24 | if (opts.config_path is None): 25 | logger.error('Please check your parameters.') 26 | exit(0) 27 | 28 | config_path = opts.config_path 29 
| 30 | if (not os.path.exists(config_path)): 31 | logger.error('config_path not exists') 32 | exit(0) 33 | 34 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 35 | 36 | batch_size = 2 37 | ### Generator for training setting 38 | train_generator = PixReferDataGenerator(config_path) 39 | params = train_generator.params 40 | params.dataset_path = params.train_dataset_path 41 | params.batch_size = batch_size 42 | train_generator.set_params(params) 43 | train_dataset = train_generator.get_dataset() 44 | 45 | config = tf.ConfigProto() 46 | config.gpu_options.allow_growth = True 47 | sess = tf.Session(config=config) 48 | tf.train.start_queue_runners(sess=sess) 49 | 50 | train_iter = train_dataset.make_one_shot_iterator() 51 | 52 | # inputs, fg_inputs, targets, masks = sess.run(train_iter.get_next()) 53 | # inp1 = cv2.cvtColor((inputs[0,...,0:3]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 54 | # inp2 = cv2.cvtColor((inputs[0,...,3:6]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 55 | # fg1 = cv2.cvtColor((fg_inputs[0, ...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 56 | # targets1 = cv2.cvtColor((targets[0, ...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 57 | # masks1 = cv2.cvtColor((masks[0, ...]*255).astype(np.uint8), cv2.COLOR_BGR2RGB) 58 | 59 | # cv2.imwrite('to/inp1.jpg', inp1) 60 | # cv2.imwrite('to/inp2.jpg', inp2) 61 | # cv2.imwrite('to/fg1.jpg', fg1) 62 | # cv2.imwrite('to/targets1.jpg', targets1) 63 | # cv2.imwrite('to/masks1.jpg', masks1) 64 | # sys.exit(0) 65 | 66 | 67 | ### Vid2VidNet setting 68 | vid2vidnet = PixReferNet(config_path) 69 | params = vid2vidnet.params 70 | epochs = params.training['epochs'] 71 | params.add_hparam('max_to_keep', 2) 72 | params.add_hparam('save_dir', 'ckpt_pixrefer') 73 | params.add_hparam('save_name', 'pixrefernet') 74 | params.add_hparam('save_step', 5000) 75 | params.add_hparam('summary_step', 100) 76 | params.add_hparam('summary_dir', 'log/summary_pixrefer') 77 | params.batch_size = batch_size 78 | params.add_hparam('is_training', True) 79 | params.sess = sess 80 | params.vgg_model_path = os.path.join(params.model_dir, 'vgg_16.ckpt') 81 | vid2vidnet.set_params(params) 82 | 83 | mkdir(params.save_dir) 84 | mkdir(params.summary_dir) 85 | 86 | train_nodes = vid2vidnet.build_train_op(*train_iter.get_next()) 87 | # sess.run(tf.global_variables_initializer()) 88 | 89 | all_var = tf.global_variables() 90 | init_var = [v for v in all_var if 'vgg_16' not in v.name] 91 | init = tf.variables_initializer(var_list=init_var) 92 | sess.run(init) 93 | 94 | # # Restore from save_dir 95 | # if ('checkpoint' in os.listdir(params.save_dir)): 96 | # variables_to_restore = tf.trainable_variables() 97 | # varlist = {v.name[:-2]: v for v in variables_to_restore if v.name[:6]!='vgg_16'} 98 | # print(varlist) 99 | # tf.train.Saver(varlist).restore(sess, tf.train.latest_checkpoint(params.save_dir)) 100 | 101 | tf.summary.scalar("discriminator_loss", train_nodes['Discrim_loss']) 102 | tf.summary.scalar("generator_loss_GAN", train_nodes['Gen_loss_GAN']) 103 | tf.summary.scalar("generator_loss_L1", train_nodes['Gen_loss_L1']) 104 | 105 | with tf.name_scope("inputs1_summary"): 106 | tf.summary.image("inputs1", tf.image.convert_image_dtype(train_nodes['Inputs'][... 
,3:6], dtype=tf.uint8)) 107 | 108 | with tf.name_scope("targets_summary"): 109 | tf.summary.image("targets", tf.image.convert_image_dtype(train_nodes['Targets'], dtype=tf.uint8)) 110 | 111 | with tf.name_scope("outputs_summary"): 112 | tf.summary.image("outputs", tf.image.convert_image_dtype(train_nodes['Outputs'], dtype=tf.uint8)) 113 | 114 | with tf.name_scope("alpha_summary"): 115 | tf.summary.image("alphas", tf.image.convert_image_dtype(train_nodes['Alphas'], dtype=tf.uint8)) 116 | 117 | with tf.name_scope("inputs0_summary"): 118 | tf.summary.image("inputs0", tf.image.convert_image_dtype(train_nodes['Inputs'][:,:,:,:3], dtype=tf.uint8)) 119 | 120 | # with tf.name_scope("fg_inputs0_summary"): 121 | # tf.summary.image("fg_inputs0", tf.image.convert_image_dtype(train_nodes['FGInputs'], dtype=tf.uint8)) 122 | 123 | # with tf.name_scope("inputs_fg_summary"): 124 | # tf.summary.image("inputs_fg", tf.image.convert_image_dtype(train_nodes['Inputs'][:,:,:,:3], dtype=tf.uint8)) 125 | 126 | # # Add histograms for gradients. 127 | # for grad, var in train_nodes['Discrim_grads_and_vars'] + train_nodes['Gen_grads_and_vars']: 128 | # if(grad is not None): 129 | # tf.summary.histogram(var.op.name + "/gradients", grad) 130 | 131 | merge_summary_op = tf.summary.merge_all() 132 | summary_writer = tf.summary.FileWriter(params.summary_dir, graph=sess.graph) 133 | 134 | for i in range(epochs): 135 | ### Run training 136 | result = sess.run([train_nodes['Train_op'], 137 | merge_summary_op, 138 | train_nodes['Gen_loss_GAN'], 139 | train_nodes['Gen_loss_L1'], 140 | train_nodes['Discrim_loss'], 141 | train_nodes['Lr'], 142 | train_nodes['Global_step']]) 143 | _, summary, gen_loss_GAN, gen_loss_L1, discrim_loss, lr, global_step = result 144 | if(global_step % params.summary_step==0): 145 | print('Step {}, Lr= {:.2e}: \n\tgen_loss_GAN= {:.3f}, \n\tgen_loss_L1= {:.3f}, \n\tdiscrim_loss= {:.3f}'.format(global_step, lr, gen_loss_GAN, gen_loss_L1, discrim_loss)) 146 | summary_writer.add_summary(summary, global_step) 147 | 148 | ### Save checkpoint 149 | if (global_step % params.save_step == 0): 150 | tf.train.Saver(max_to_keep=params.max_to_keep, var_list=tf.global_variables()).save(sess, 151 | os.path.join(params.save_dir, 152 | params.save_name), 153 | global_step=global_step) 154 | -------------------------------------------------------------------------------- /voicepuppet/pixrefer/vgg_simple.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | slim = tf.contrib.slim 4 | 5 | 6 | def vgg_arg_scope(weight_decay=0.0005): 7 | """Defines the VGG arg scope. 8 | Args: 9 | weight_decay: The l2 regularization coefficient. 10 | Returns: 11 | An arg_scope. 12 | """ 13 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 14 | activation_fn=tf.nn.relu, 15 | weights_regularizer=slim.l2_regularizer(weight_decay), 16 | biases_initializer=tf.zeros_initializer()): 17 | with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc: 18 | return arg_sc 19 | 20 | 21 | def vgg_a(inputs, 22 | num_classes=1000, 23 | is_training=True, 24 | dropout_keep_prob=0.5, 25 | spatial_squeeze=True, 26 | scope='vgg_a', 27 | fc_conv_padding='VALID', 28 | global_pool=False): 29 | """Oxford Net VGG 11-Layers version A Example. 30 | Note: All the fully_connected layers have been transformed to conv2d layers. 31 | To use in classification mode, resize input to 224x224. 32 | Args: 33 | inputs: a tensor of size [batch_size, height, width, channels]. 
34 | num_classes: number of predicted classes. If 0 or None, the logits layer is 35 | omitted and the input features to the logits layer are returned instead. 36 | is_training: whether or not the model is being trained. 37 | dropout_keep_prob: the probability that activations are kept in the dropout 38 | layers during training. 39 | spatial_squeeze: whether or not should squeeze the spatial dimensions of the 40 | outputs. Useful to remove unnecessary dimensions for classification. 41 | scope: Optional scope for the variables. 42 | fc_conv_padding: the type of padding to use for the fully connected layer 43 | that is implemented as a convolutional layer. Use 'SAME' padding if you 44 | are applying the network in a fully convolutional manner and want to 45 | get a prediction map downsampled by a factor of 32 as an output. 46 | Otherwise, the output prediction map will be (input / 32) - 6 in case of 47 | 'VALID' padding. 48 | global_pool: Optional boolean flag. If True, the input to the classification 49 | layer is avgpooled to size 1x1, for any input size. (This is not part 50 | of the original VGG architecture.) 51 | Returns: 52 | net: the output of the logits layer (if num_classes is a non-zero integer), 53 | or the input to the logits layer (if num_classes is 0 or None). 54 | end_points: a dict of tensors with intermediate activations. 55 | """ 56 | with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc: 57 | end_points_collection = sc.original_name_scope + '_end_points' 58 | # Collect outputs for conv2d, fully_connected and max_pool2d. 59 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], 60 | outputs_collections=end_points_collection): 61 | net = slim.repeat(inputs, 1, slim.conv2d, 64, [3, 3], scope='conv1') 62 | net = slim.max_pool2d(net, [2, 2], scope='pool1') 63 | net = slim.repeat(net, 1, slim.conv2d, 128, [3, 3], scope='conv2') 64 | net = slim.max_pool2d(net, [2, 2], scope='pool2') 65 | net = slim.repeat(net, 2, slim.conv2d, 256, [3, 3], scope='conv3') 66 | net = slim.max_pool2d(net, [2, 2], scope='pool3') 67 | net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv4') 68 | net = slim.max_pool2d(net, [2, 2], scope='pool4') 69 | net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv5') 70 | net = slim.max_pool2d(net, [2, 2], scope='pool5') 71 | 72 | # Use conv2d instead of fully_connected layers. 73 | net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6') 74 | net = slim.dropout(net, dropout_keep_prob, is_training=is_training, 75 | scope='dropout6') 76 | net = slim.conv2d(net, 4096, [1, 1], scope='fc7') 77 | # Convert end_points_collection into a end_point dict. 
78 | end_points = slim.utils.convert_collection_to_dict(end_points_collection) 79 | if global_pool: 80 | net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool') 81 | end_points['global_pool'] = net 82 | if num_classes: 83 | net = slim.dropout(net, dropout_keep_prob, is_training=is_training, 84 | scope='dropout7') 85 | net = slim.conv2d(net, num_classes, [1, 1], 86 | activation_fn=None, 87 | normalizer_fn=None, 88 | scope='fc8') 89 | if spatial_squeeze: 90 | net = tf.squeeze(net, [1, 2], name='fc8/squeezed') 91 | end_points[sc.name + '/fc8'] = net 92 | return net, end_points 93 | vgg_a.default_image_size = 224 94 | 95 | 96 | def vgg_16(inputs, 97 | num_classes=1000, 98 | is_training=False, 99 | dropout_keep_prob=0.5, 100 | spatial_squeeze=True, 101 | scope='vgg_16', 102 | fc_conv_padding='VALID', 103 | global_pool=False, 104 | reuse=False): 105 | """Oxford Net VGG 16-Layers version D Example. 106 | Note: All the fully_connected layers have been transformed to conv2d layers. 107 | To use in classification mode, resize input to 224x224. 108 | Args: 109 | inputs: a tensor of size [batch_size, height, width, channels]. 110 | num_classes: number of predicted classes. If 0 or None, the logits layer is 111 | omitted and the input features to the logits layer are returned instead. 112 | is_training: whether or not the model is being trained. 113 | dropout_keep_prob: the probability that activations are kept in the dropout 114 | layers during training. 115 | spatial_squeeze: whether or not should squeeze the spatial dimensions of the 116 | outputs. Useful to remove unnecessary dimensions for classification. 117 | scope: Optional scope for the variables. 118 | fc_conv_padding: the type of padding to use for the fully connected layer 119 | that is implemented as a convolutional layer. Use 'SAME' padding if you 120 | are applying the network in a fully convolutional manner and want to 121 | get a prediction map downsampled by a factor of 32 as an output. 122 | Otherwise, the output prediction map will be (input / 32) - 6 in case of 123 | 'VALID' padding. 124 | global_pool: Optional boolean flag. If True, the input to the classification 125 | layer is avgpooled to size 1x1, for any input size. (This is not part 126 | of the original VGG architecture.) 127 | Returns: 128 | net: the output of the logits layer (if num_classes is a non-zero integer), 129 | or the input to the logits layer (if num_classes is 0 or None). 130 | end_points: a dict of tensors with intermediate activations. 131 | """ 132 | with tf.variable_scope(scope, 'vgg_16', [inputs], reuse=reuse) as sc: 133 | out = [] 134 | end_points_collection = sc.original_name_scope + '_end_points' 135 | # Collect outputs for conv2d, fully_connected and max_pool2d. 
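    # Note: unlike the stock slim VGG-16, this network stops after conv4 and returns the
    # conv1..conv4 activations (out1..out4) plus the variable scopes to exclude when restoring
    # the pretrained vgg_16.ckpt; the multi-scale features are presumably used as a perceptual
    # loss inside PixReferNet (see params.vgg_model_path in train_pixrefer.py).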
136 | with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], 137 | outputs_collections=end_points_collection): 138 | net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1') 139 | 140 | # with tf.variable_scope('relu1'): 141 | out1 = net 142 | 143 | net = slim.max_pool2d(net, [2, 2], scope='pool1') 144 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2') 145 | 146 | # with tf.variable_scope('relu2'): 147 | # out = tf.add(net, tf.zeros_like(net), name='conv2_2') 148 | out2 = net 149 | 150 | net = slim.max_pool2d(net, [2, 2], scope='pool2') 151 | net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3') 152 | 153 | # with tf.variable_scope('relu3'): 154 | out3 = net 155 | 156 | net = slim.max_pool2d(net, [2, 2], scope='pool3') 157 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4') 158 | 159 | out4 = net 160 | exclude = ['vgg_16/fc6', 'vgg_16/pool4','vgg_16/conv5','vgg_16/pool5','vgg_16/fc7','vgg_16/global_pool','vgg_16/fc8/squeezed','vgg_16/fc8'] 161 | 162 | return out1, out2, out3, out4, exclude 163 | vgg_16.default_image_size = 224 164 | --------------------------------------------------------------------------------
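A minimal usage sketch, not taken from this repository, of how the truncated vgg_16 above can drive a multi-scale perceptual loss; the function name build_vgg_loss, the per-scale weights and the sibling-module import are assumptions made for illustration.

import tensorflow as tf
from vgg_simple import vgg_16

def build_vgg_loss(generated, target, weights=(1.0, 1.0, 1.0, 1.0)):
  # Both images pass through the same VGG-16 trunk; reuse=True shares the variables.
  g1, g2, g3, g4, _ = vgg_16(generated)
  t1, t2, t3, t4, exclude = vgg_16(target, reuse=True)
  # Weighted L1 distance between the conv1..conv4 activations of the two images.
  loss = 0.0
  for w, g, t in zip(weights, (g1, g2, g3, g4), (t1, t2, t3, t4)):
    loss += w * tf.reduce_mean(tf.abs(g - t))
  # `exclude` lists the scopes that have no weights in the pretrained vgg_16.ckpt.
  return loss, exclude

The pretrained weights could then be restored with a Saver built from slim.get_variables_to_restore(exclude=exclude), which matches how train_pixrefer.py initialises every variable except those under vgg_16.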