├── EmotionTalk ├── feature_extraction │ ├── visual │ │ ├── pytorch-benchmarks │ │ │ ├── fer2013 │ │ │ │ ├── __init__.py │ │ │ │ └── fer.py │ │ │ ├── imagenet │ │ │ │ ├── __init__.py │ │ │ │ ├── imagenet.py.bak │ │ │ │ └── evaluation.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ └── benchmark_helpers.py │ │ │ ├── .gitignore │ │ │ ├── LICENSE.md │ │ │ ├── README.md │ │ │ ├── run_fer_benchmarks.py │ │ │ └── model │ │ │ │ ├── vgg_m_face_bn_fer_dag.py │ │ │ │ ├── alexnet_face_fer_bn_dag.py │ │ │ │ └── vgg_vd_face_fer_dag.py │ │ ├── emonet │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ └── affecnet.py │ │ │ ├── models │ │ │ │ └── __init__.py │ │ │ └── metrics.py │ │ ├── manet │ │ │ ├── log │ │ │ │ ├── SFEW.png │ │ │ │ ├── CAER-S.png │ │ │ │ ├── FED-RO.png │ │ │ │ ├── RAF-DB.png │ │ │ │ ├── AffectNet7.png │ │ │ │ ├── AffectNet8.png │ │ │ │ ├── [02-08]-[16-22]-cnn.png │ │ │ │ ├── [02-08]-[19-12]-cnn.png │ │ │ │ ├── [02-08]-[21-19]-cnn.png │ │ │ │ ├── [02-08]-[22-55]-cnn.png │ │ │ │ ├── [02-12]-[19-11]-cnn.png │ │ │ │ ├── [02-12]-[22-21]-cnn.png │ │ │ │ └── [05-28]-[13-07]-cnn.png │ │ │ ├── model │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── manet.cpython-39.pyc │ │ │ │ │ └── attention.cpython-39.pyc │ │ │ │ └── attention.py │ │ │ ├── reorganize_rafdb.py │ │ │ ├── LICENSE │ │ │ └── README.md │ │ ├── dataset.py │ │ ├── util.py │ │ ├── extract_imagenet_embedding.py │ │ ├── extract_emonet_embedding.py │ │ └── extract_manet_embedding.py │ └── audio │ │ ├── vggish │ │ ├── vggish_pca_params.npz │ │ ├── vggish_params.py │ │ ├── vggish_postprocess.py │ │ ├── vggish_smoke_test.py │ │ ├── vggish_input.py │ │ └── vggish_slim.py │ │ ├── extract_vggish_embedding.py │ │ └── extract_wav2vec_embedding.py ├── dataset │ └── mm-process │ │ ├── mm_label.npz │ │ ├── mm_label4.npz │ │ ├── txt_label.npz │ │ ├── audio_label.npz │ │ ├── txt_label4.npz │ │ ├── video_label.npz │ │ ├── audio_label4.npz │ │ └── video_label4.npz ├── toolkit │ ├── models │ │ ├── __pycache__ │ │ │ ├── lmf.cpython-38.pyc │ │ │ ├── lmf.cpython-39.pyc │ │ │ ├── mctn.cpython-38.pyc │ │ │ ├── mctn.cpython-39.pyc │ │ │ ├── mfm.cpython-38.pyc │ │ │ ├── mfm.cpython-39.pyc │ │ │ ├── mfn.cpython-38.pyc │ │ │ ├── mfn.cpython-39.pyc │ │ │ ├── misa.cpython-38.pyc │ │ │ ├── misa.cpython-39.pyc │ │ │ ├── mmim.cpython-38.pyc │ │ │ ├── mmim.cpython-39.pyc │ │ │ ├── mult.cpython-38.pyc │ │ │ ├── mult.cpython-39.pyc │ │ │ ├── tfn.cpython-38.pyc │ │ │ ├── tfn.cpython-39.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── attention.cpython-38.pyc │ │ │ ├── attention.cpython-39.pyc │ │ │ ├── graph_mfn.cpython-38.pyc │ │ │ └── graph_mfn.cpython-39.pyc │ │ ├── modules │ │ │ ├── __pycache__ │ │ │ │ ├── encoder.cpython-38.pyc │ │ │ │ └── encoder.cpython-39.pyc │ │ │ ├── transformers_encoder │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── transformer.cpython-38.pyc │ │ │ │ │ ├── transformer.cpython-39.pyc │ │ │ │ │ ├── position_embedding.cpython-38.pyc │ │ │ │ │ ├── position_embedding.cpython-39.pyc │ │ │ │ │ ├── multihead_attention.cpython-38.pyc │ │ │ │ │ └── multihead_attention.cpython-39.pyc │ │ │ │ └── position_embedding.py │ │ │ └── encoder.py │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── tfn.py │ │ ├── lmf.py │ │ └── mfn.py │ ├── utils │ │ ├── __pycache__ │ │ │ ├── loss.cpython-38.pyc │ │ │ ├── loss.cpython-39.pyc │ │ │ ├── metric.cpython-38.pyc │ │ │ ├── metric.cpython-39.pyc │ │ │ ├── chatgpt.cpython-38.pyc │ │ │ ├── chatgpt.cpython-39.pyc │ │ │ ├── functions.cpython-38.pyc │ │ │ ├── functions.cpython-39.pyc │ │ │ ├── 
read_data.cpython-38.pyc │ │ │ ├── read_data.cpython-39.pyc │ │ │ ├── read_files.cpython-38.pyc │ │ │ └── read_files.cpython-39.pyc │ │ ├── loss.py │ │ ├── metric.py │ │ ├── chatgpt.py │ │ └── read_data.py │ ├── preprocess │ │ ├── __pycache__ │ │ │ ├── config.cpython-39.pyc │ │ │ └── globals.cpython-39.pyc │ │ ├── utils │ │ │ ├── __pycache__ │ │ │ │ ├── chatgpt.cpython-39.pyc │ │ │ │ ├── functions.cpython-39.pyc │ │ │ │ └── read_files.cpython-39.pyc │ │ │ ├── loss.py │ │ │ ├── metric.py │ │ │ ├── chatgpt.py │ │ │ └── read_data.py │ │ ├── mer2023.py │ │ ├── simsv2.py │ │ ├── cmumosi.py │ │ ├── meld.py │ │ ├── sims.py │ │ └── config.py │ ├── data │ │ ├── __init__.py │ │ └── feat_data.py │ ├── dataloader │ │ ├── __init__.py │ │ └── mm.py │ └── model-tune.yaml └── config.py ├── environment.yml └── README.md /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/fer2013/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/imagenet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .affecnet import AffectNet 2 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .emonet import EmoNet 2 | 3 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | .nfs* 4 | scratch 5 | res_cache 6 | -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/mm_label.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/mm_label.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/mm_label4.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/mm_label4.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/txt_label.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/txt_label.npz 
-------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/audio_label.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/audio_label.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/txt_label4.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/txt_label4.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/video_label.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/video_label.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/audio_label4.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/audio_label4.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/video_label4.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/video_label4.npz -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/SFEW.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/SFEW.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/CAER-S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/CAER-S.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/FED-RO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/FED-RO.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/RAF-DB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/RAF-DB.png -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/lmf.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/lmf.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/lmf.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/lmf.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mctn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mctn.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mctn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mctn.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mfm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mfm.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mfm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mfm.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mfn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mfn.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mfn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mfn.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/misa.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/misa.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/misa.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/misa.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mmim.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mmim.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mmim.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mmim.cpython-39.pyc 
-------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mult.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mult.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mult.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mult.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/tfn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/tfn.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/tfn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/tfn.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/loss.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/metric.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/metric.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/metric.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/metric.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/chatgpt.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/chatgpt.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/chatgpt.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/chatgpt.cpython-39.pyc -------------------------------------------------------------------------------- 
/EmotionTalk/feature_extraction/visual/manet/log/AffectNet7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/AffectNet7.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/AffectNet8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/AffectNet8.png -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/attention.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/attention.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/graph_mfn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/graph_mfn.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/graph_mfn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/graph_mfn.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/__pycache__/config.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/__pycache__/config.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/functions.cpython-38.pyc -------------------------------------------------------------------------------- 
/EmotionTalk/toolkit/utils/__pycache__/functions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/functions.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/read_data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/read_data.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/read_data.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/read_data.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/read_files.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/read_files.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/read_files.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/read_files.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_pca_params.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/audio/vggish/vggish_pca_params.npz -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/__pycache__/globals.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/__pycache__/globals.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/__pycache__/encoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/__pycache__/encoder.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/__pycache__/encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/__pycache__/encoder.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[16-22]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[16-22]-cnn.png -------------------------------------------------------------------------------- 
/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[19-12]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[19-12]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[21-19]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[21-19]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[22-55]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[22-55]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-12]-[19-11]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-12]-[19-11]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-12]-[22-21]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-12]-[22-21]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[05-28]-[13-07]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[05-28]-[13-07]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/__pycache__/chatgpt.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/utils/__pycache__/chatgpt.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/__pycache__/functions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/utils/__pycache__/functions.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/__pycache__/read_files.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/utils/__pycache__/read_files.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/model/__pycache__/manet.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/model/__pycache__/manet.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/model/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/model/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/position_embedding.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/position_embedding.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/position_embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/position_embedding.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/multihead_attention.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/multihead_attention.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/multihead_attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/multihead_attention.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # classification loss 5 | class CELoss(nn.Module): 6 | 7 | def __init__(self): 8 | super(CELoss, self).__init__() 9 | self.loss = nn.NLLLoss(reduction='sum') 10 | 11 | def forward(self, pred, target): 12 | 
pred = F.log_softmax(pred, 1) # [n_samples, n_classes] 13 | target = target.long() # [n_samples] 14 | loss = self.loss(pred, target) / len(pred) 15 | return loss 16 | 17 | # regression loss 18 | class MSELoss(nn.Module): 19 | 20 | def __init__(self): 21 | super(MSELoss, self).__init__() 22 | self.loss = nn.MSELoss(reduction='sum') 23 | 24 | def forward(self, pred, target): 25 | pred = pred.view(-1,1) 26 | target = target.view(-1,1) 27 | loss = self.loss(pred, target) / len(pred) 28 | return loss 29 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # classification loss 5 | class CELoss(nn.Module): 6 | 7 | def __init__(self): 8 | super(CELoss, self).__init__() 9 | self.loss = nn.NLLLoss(reduction='sum') 10 | 11 | def forward(self, pred, target): 12 | pred = F.log_softmax(pred, 1) # [n_samples, n_classes] 13 | target = target.long() # [n_samples] 14 | loss = self.loss(pred, target) / len(pred) 15 | return loss 16 | 17 | # regression loss 18 | class MSELoss(nn.Module): 19 | 20 | def __init__(self): 21 | super(MSELoss, self).__init__() 22 | self.loss = nn.MSELoss(reduction='sum') 23 | 24 | def forward(self, pred, target): 25 | pred = pred.view(-1,1) 26 | target = target.view(-1,1) 27 | loss = self.loss(pred, target) / len(pred) 28 | return loss 29 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Samuel Albanie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/reorganize_rafdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pandas as pd 4 | import shutil 5 | 6 | 7 | rafdb_path = '/data1/sunlicai/Affective Computing/Dataset/RAF-DB/basic' 8 | src_path = os.path.join(rafdb_path, 'Image/aligned') 9 | tgt_path = os.path.join(rafdb_path, 'Image/aligned_c') # split/class_id/img_file 10 | label_file = os.path.join(rafdb_path, 'EmoLabel/list_patition_label.txt') 11 | df = pd.read_csv(label_file, header=None, delimiter=' ') 12 | file_names, label_ids = df[0].values, df[1].values 13 | print(f'Number of images: {len(df)}.') 14 | name_to_label = dict(zip(file_names, label_ids)) 15 | img_files = glob.glob(os.path.join(src_path, '*.jpg')) 16 | 17 | for src_file in img_files: 18 | img_name = os.path.basename(src_file).replace('_aligned', '') 19 | label = name_to_label[img_name] 20 | split = img_name.split('_')[0] 21 | saved_path = os.path.join(tgt_path, split, str(label)) 22 | if not os.path.exists(saved_path): 23 | os.makedirs(saved_path) 24 | tgt_file = os.path.join(saved_path, img_name) 25 | shutil.copyfile(src_file, tgt_file) 26 | print(f'Copy "{src_file}" to "{tgt_file}".') -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Zengqun Zhao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/data/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | 3 | from .feat_data import Data_Feat 4 | 5 | # 目标:输入 (names, labels, data_type),得到所有特征与标签 6 | class get_datasets(Dataset): 7 | 8 | def __init__(self, args, names, labels): 9 | 10 | MODEL_DATASET_MAP = { 11 | 12 | # 解析特征 13 | 'attention': Data_Feat, 14 | 'lf_dnn': Data_Feat, 15 | 'lmf': Data_Feat, 16 | 'misa': Data_Feat, 17 | 'mmim': Data_Feat, 18 | 'tfn': Data_Feat, 19 | 'mfn': Data_Feat, 20 | 'graph_mfn': Data_Feat, 21 | 'ef_lstm': Data_Feat, 22 | 'mfm': Data_Feat, 23 | 'mctn': Data_Feat, 24 | 'mult': Data_Feat, 25 | 26 | } 27 | 28 | self.dataset_class = MODEL_DATASET_MAP[args.model] 29 | self.dataset = self.dataset_class(args, names, labels) 30 | 31 | def __len__(self): 32 | return self.dataset.__len__() 33 | 34 | def __getitem__(self, index): 35 | return self.dataset.__getitem__(index) 36 | 37 | def collater(self, instances): 38 | return self.dataset.collater(instances) 39 | 40 | def get_featdim(self): 41 | return self.dataset.get_featdim() -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | get_models: get models and load default configs; 3 | link: https://github.com/thuiar/MMSA-FET/tree/master 4 | """ 5 | import torch 6 | 7 | from .tfn import TFN 8 | from .lmf import LMF 9 | from .mfn import MFN 10 | from .mfm import MFM 11 | from .mult import MULT 12 | from .misa import MISA 13 | from .mctn import MCTN 14 | from .mmim import MMIM 15 | from .graph_mfn import Graph_MFN 16 | from .attention import Attention 17 | 18 | class get_models(torch.nn.Module): 19 | def __init__(self, args): 20 | super(get_models, self).__init__() 21 | # misa/mmim在有些参数配置下会存在梯度爆炸的风险 22 | # tfn 显存占比比较高 23 | 24 | MODEL_MAP = { 25 | 26 | # 特征压缩到句子级再处理,所以支持 utt/align/unalign 27 | 'attention': Attention, 28 | 'lmf': LMF, 29 | 'misa': MISA, 30 | 'mmim': MMIM, 31 | 'tfn': TFN, 32 | 33 | # 只支持align 34 | 'mfn': MFN, # slow 35 | 'graph_mfn': Graph_MFN, # slow 36 | 'mfm': MFM, # slow 37 | 'mctn': MCTN, # slow 38 | 39 | # 支持align/unalign 40 | 'mult': MULT, # slow 41 | 42 | } 43 | self.model = MODEL_MAP[args.model](args) 44 | 45 | def forward(self, batch): 46 | return self.model(batch) 47 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/README.md: -------------------------------------------------------------------------------- 1 | # MA-Net 2 | 3 | PyTorch implementation of the paper *“Learning Deep Global Multi-scale and Local Attention Features 4 | for Facial Expression Recognition in the Wild”*, This work is under submission. 5 | 6 | ## Requirements 7 | - Python $\geq$3.6 8 | - PyTorch $\geq$1.2 9 | - torchvision $\geq$0.4.0 10 | - numpy 11 | - matplotlib 12 | - datetime 13 | - shutil 14 | - time 15 | - argparse 16 | - os 17 | 18 | ## Training 19 | 20 | - Step 1: download basic emotions dataset of [RAF-DB](http://www.whdeng.cn/raf/model1.html), and make sure it have the structure like following: 21 | 22 | ``` 23 | ./RAF-DB/ 24 | train/ 25 | 0/ 26 | train_09748.jpg 27 | ... 28 | train_12271.jpg 29 | 1/ 30 | ... 31 | 6/ 32 | test/ 33 | 0/ 34 | ... 
35 | 6/ 36 | 37 | [Note] 0: Neutral; 1: Happiness; 2: Sadness; 3: Surprise; 4: Fear; 5: Disgust; 6: Anger 38 | ``` 39 | 40 | - Step 2: download pre-trained model from 41 | [Google Drive](https://drive.google.com/file/d/1tro_RCovLKNACt4MKYp3dmIvvxiOC2pi/view?usp=sharing), 42 | and put it into ***./checkpoint***. 43 | 44 | - Step 3: change the ***project_path*** and ***data_path*** in *main.py* to your path 45 | 46 | - Step 4: run ```python main.py ``` 47 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/README.md: -------------------------------------------------------------------------------- 1 | ### pytorch-benchmark 2 | 3 | Some scripts for validating models on common benchmarks. Assumes at least Python3 and PyTorch 4.0. 4 | 5 | 6 | ### Supported datasets: 7 | 8 | * **ImageNet** (this is essentially just a cut-down version of the [official example](https://github.com/pytorch/examples/tree/master/imagenet)) 9 | * **Fer2013** - A dataset of greyscale faces labelled with emotions. 10 | 11 | 12 | 13 | ### References 14 | 15 | **ImageNet**: [paper](https://arxiv.org/abs/1409.0575) 16 | 17 | ``` 18 | @article{ILSVRC15, 19 | Author = {Olga Russakovsky and Jia Deng and Hao Su and Jonathan Krause and Sanjeev Satheesh and Sean Ma and Zhiheng Huang and Andrej Karpathy and Aditya Khosla and Michael Bernstein and Alexander C. Berg and Li Fei-Fei}, 20 | Title = {{ImageNet Large Scale Visual Recognition Challenge}}, 21 | Year = {2015}, 22 | journal = {International Journal of Computer Vision (IJCV)}, 23 | doi = {10.1007/s11263-015-0816-y}, 24 | volume={115}, 25 | number={3}, 26 | pages={211-252} 27 | } 28 | ``` 29 | 30 | **FER2013**: [paper](https://arxiv.org/abs/1307.0414) 31 | 32 | ``` 33 | @inproceedings{goodfellow2013challenges, 34 | title={Challenges in representation learning: A report on three machine learning contests}, 35 | author={Goodfellow, Ian J and Erhan, Dumitru and Carrier, Pierre Luc and Courville, Aaron and Mirza, Mehdi and Hamner, Ben and Cukierski, Will and Tang, Yichuan and Thaler, David and Lee, Dong-Hyun and others}, 36 | booktitle={International Conference on Neural Information Processing}, 37 | pages={117--124}, 38 | year={2013}, 39 | organization={Springer} 40 | } 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: mertools 2 | channels: 3 | - pytorch 4 | - defaults 5 | - anaconda 6 | dependencies: 7 | - python=3.9 8 | - cudatoolkit 9 | - pip 10 | - pytorch=1.12.1 11 | - pytorch-mutex=1.0=cuda 12 | - torchaudio=0.12.1 13 | - torchvision=0.13.1 14 | 15 | - pip: 16 | - accelerate==0.16.0 17 | - aiohttp==3.8.4 18 | - aiosignal==1.3.1 19 | - async-timeout==4.0.2 20 | - attrs==22.2.0 21 | - bitsandbytes==0.37.0 22 | - cchardet==2.1.7 23 | - chardet==5.1.0 24 | - contourpy==1.0.7 25 | - cycler==0.11.0 26 | - filelock==3.9.0 27 | - fonttools==4.38.0 28 | - frozenlist==1.3.3 29 | - huggingface-hub==0.13.4 30 | - importlib-resources==5.12.0 31 | - kiwisolver==1.4.4 32 | - matplotlib==3.7.0 33 | - multidict==6.0.4 34 | - openai==0.27.0 35 | - packaging==23.0 36 | - psutil==5.9.4 37 | - pycocotools==2.0.6 38 | - pyparsing==3.0.9 39 | - python-dateutil==2.8.2 40 | - pyyaml==6.0 41 | - regex==2022.10.31 42 | - tokenizers==0.13.2 43 | - tqdm==4.64.1 44 | - transformers==4.28.0 45 | - timm==0.6.13 46 | - spacy==3.5.1 47 | - webdataset==0.2.48 48 | 
- scikit-learn==1.2.2 49 | - scipy==1.10.1 50 | - yarl==1.8.2 51 | - zipp==3.14.0 52 | - omegaconf==2.3.0 53 | - opencv-python==4.7.0.72 54 | - iopath==0.1.10 55 | - decord==0.6.0 56 | - tenacity==8.2.2 57 | - peft 58 | - pycocoevalcap 59 | - sentence-transformers 60 | - umap-learn 61 | - notebook 62 | - gradio==3.24.1 63 | - gradio-client==0.0.8 64 | - wandb 65 | - einops 66 | - SentencePiece 67 | - ftfy 68 | - thop 69 | - pytorchvideo==0.1.5 70 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | from .iemocap import IEMOCAP 2 | from .cmudata import CMUDATA 3 | from .mer2023 import MER2023 4 | from .sims import SIMS 5 | from .meld import MELD 6 | from .simsv2 import SIMSv2 7 | from .crossdim import CROSSDIM 8 | from .crossdis import CROSSDIS 9 | from .mm import mm 10 | from .MM import MM 11 | 12 | DIM_DATASET = ['CMUMOSI', 'CMUMOSEI', 'SIMS', 'SIMSv2','MM'] 13 | DIS_DATASET = ['IEMOCAPFour', 'IEMOCAPSix', 'MER2023', 'MELD','mm4','mm7'] 14 | 15 | # 输入数据库名称,得到 dataloaders 16 | class get_dataloaders: 17 | 18 | def __init__(self, args): 19 | 20 | if args.train_dataset is None: 21 | DATALOADER_MAP = { 22 | 23 | 'IEMOCAPFour': IEMOCAP, 24 | 'mm4': mm, 25 | 'mm7': mm, 26 | 'MM': MM, 27 | 'IEMOCAPSix': IEMOCAP, 28 | 'CMUMOSI': CMUDATA, 29 | 'CMUMOSEI': CMUDATA, 30 | 'MER2023': MER2023, 31 | 'SIMS': SIMS, 32 | 'SIMSv2': SIMSv2, 33 | 'MELD': MELD, 34 | } 35 | self.dataloader = DATALOADER_MAP[args.dataset](args) 36 | elif args.train_dataset in DIM_DATASET: 37 | assert args.test_dataset in DIM_DATASET 38 | self.dataloader = CROSSDIM(args) 39 | elif args.train_dataset in DIS_DATASET: 40 | assert args.test_dataset in DIS_DATASET 41 | self.dataloader = CROSSDIS(args) 42 | 43 | def get_loaders(self): 44 | return self.dataloader.get_loaders() 45 | 46 | def calculate_results(self, emo_probs=[], emo_labels=[], val_preds=[], val_labels=[]): 47 | return self.dataloader.calculate_results(emo_probs, emo_labels, val_preds, val_labels) 48 | 49 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/model-tune.yaml: -------------------------------------------------------------------------------- 1 | tfn: 2 | hidden_dim: [64, 128] 3 | dropout: [0.2, 0.3, 0.4, 0.5] 4 | grad_clip: [-1.0] 5 | lr: [1e-3, 1e-4] 6 | 7 | lmf: 8 | hidden_dim: [32, 64, 128, 256] 9 | dropout: [0.2, 0.3, 0.4, 0.5] 10 | rank: [3, 4, 5, 6] 11 | grad_clip: [-1.0] 12 | lr: [1e-3, 1e-4] 13 | 14 | mmim: 15 | hidden_dim: [64, 128, 256] 16 | dropout: [0.0, 0.1, 0.2, 0.3] 17 | cpc_layers: [1, 2, 3, 4] 18 | alpha: [0.0, 0.1, 0.2] 19 | beta: [0.0, 0.1, 0.2] 20 | grad_clip: [0.6, 0.8, 1.0] 21 | lr: [1e-3, 1e-4] 22 | 23 | misa: 24 | dropout: [0.2, 0.3, 0.4, 0.5] 25 | hidden_dim: [64, 128, 256] 26 | sim_weight: [0.0, 0.1, 0.2] 27 | diff_weight: [0.0, 0.1, 0.2] 28 | recon_weight: [0.0, 0.1, 0.2] 29 | grad_clip: [-1.0, 0.8, 1.0] 30 | lr: [1e-4, 1e-5] 31 | 32 | mfn: 33 | hidden_dim: [128, 256] 34 | mem_dim: [128] 35 | dropout: [0.0, 0.3, 0.5, 0.7] 36 | window_dim: [2] 37 | grad_clip: [-1.0] 38 | lr: [1e-3, 1e-4] 39 | # lr: [1e-3, 1e-4] 40 | 41 | graph_mfn: 42 | hidden_dim: [128, 256] 43 | mem_dim: [128] 44 | dropout: [0.0, 0.3, 0.5, 0.7] 45 | grad_clip: [-1.0] 46 | lr: [1e-3, 1e-4] 47 | # lr: [1e-3, 1e-4] 48 | 49 | mfm: 50 | hidden_dim: [128, 256] 51 | mem_dim: [128] 52 | dropout: [0.0, 0.3, 0.5, 0.7] 53 | window_dim: [2] 54 | lda_xl: [0.01, 0.1, 0.5, 
1.0] 55 | lda_xa: [0.01, 0.1, 0.5, 1.0] 56 | lda_xv: [0.01, 0.1, 0.5, 1.0] 57 | lda_mmd: [10, 50, 100] 58 | grad_clip: [-1.0] 59 | lr: [1e-3, 1e-4] 60 | 61 | mult: 62 | layers: [2, 4, 6] 63 | dropout: [0.0, 0.1, 0.2, 0.3] 64 | num_heads: [8] 65 | hidden_dim: [64, 128, 256] 66 | conv1d_kernel_size: [1, 3] 67 | grad_clip: [0.6, 0.8, 1.0] 68 | lr: [1e-3, 1e-4] 69 | # lr: [1e-3, 1e-4] 70 | 71 | mctn: 72 | hidden_dim: [64, 128, 256] 73 | dropout: [0.0, 0.1, 0.2, 0.3] 74 | teacher_forcing_ratio: [0.3, 0.5] 75 | loss_weight: [0.1, 0.3, 0.5, 0.8, 1.0] 76 | grad_clip: [0.6, 0.8, 1.0] 77 | lr: [1e-3, 1e-4] 78 | 79 | attention: 80 | hidden_dim: [64, 128, 256] 81 | dropout: [0.2, 0.3, 0.4, 0.5] 82 | grad_clip: [-1.0] 83 | lr: [1e-3, 1e-4] 84 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ACC(ground_truth, predictions): 5 | """Evaluates the mean accuracy 6 | """ 7 | return np.mean(ground_truth.astype(int) == predictions.astype(int)) 8 | 9 | def RMSE(ground_truth, predictions): 10 | """ 11 | Evaluates the RMSE between estimate and ground truth. 12 | """ 13 | return np.sqrt(np.mean((ground_truth-predictions)**2)) 14 | 15 | 16 | def SAGR(ground_truth, predictions): 17 | """ 18 | Evaluates the SAGR between estimate and ground truth. 19 | """ 20 | return np.mean(np.sign(ground_truth) == np.sign(predictions)) 21 | 22 | 23 | def PCC(ground_truth, predictions): 24 | """ 25 | Evaluates the Pearson Correlation Coefficient. 26 | Inputs are numpy arrays. 27 | Corr = Cov(GT, Est)/(std(GT)std(Est)) 28 | """ 29 | return np.corrcoef(ground_truth, predictions)[0,1] 30 | 31 | 32 | def CCC(ground_truth, predictions): 33 | """ 34 | Evaluates the Concordance Correlation Coefficient. 35 | Inputs are numpy arrays. 36 | """ 37 | mean_pred = np.mean(predictions) 38 | mean_gt = np.mean(ground_truth) 39 | 40 | std_pred= np.std(predictions) 41 | std_gt = np.std(ground_truth) 42 | 43 | pearson = PCC(ground_truth, predictions) 44 | return 2.0*pearson*std_pred*std_gt/(std_pred**2+std_gt**2+(mean_pred-mean_gt)**2) 45 | 46 | def ICC(labels, predictions): 47 | """Evaluates the ICC(3, 1) 48 | """ 49 | naus = predictions.shape[1] 50 | icc = np.zeros(naus) 51 | 52 | n = predictions.shape[0] 53 | 54 | for i in range(0,naus): 55 | a = np.asmatrix(labels[:,i]).transpose() 56 | b = np.asmatrix(predictions[:,i]).transpose() 57 | dat = np.hstack((a, b)) 58 | mpt = np.mean(dat, axis=1) 59 | mpr = np.mean(dat, axis=0) 60 | tm = np.mean(mpt, axis=0) 61 | BSS = np.sum(np.square(mpt-tm))*2 62 | BMS = BSS/(n-1) 63 | RSS = np.sum(np.square(mpr-tm))*n 64 | tmp = np.square(dat - np.hstack((mpt,mpt))) 65 | WSS = np.sum(np.sum(tmp, axis=1)) 66 | ESS = WSS - RSS 67 | EMS = ESS/(n-1) 68 | icc[i] = (BMS - EMS)/(BMS + EMS) 69 | 70 | return icc 71 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Global parameters for the VGGish model. 17 | 18 | See vggish_slim.py for more information. 19 | """ 20 | 21 | # Architectural constants. 22 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. 23 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. 24 | EMBEDDING_SIZE = 128 # Size of embedding layer. 25 | 26 | # Hyperparameters used in feature and example generation. 27 | SAMPLE_RATE = 16000 28 | STFT_WINDOW_LENGTH_SECONDS = 0.025 29 | STFT_HOP_LENGTH_SECONDS = 0.010 30 | NUM_MEL_BINS = NUM_BANDS 31 | MEL_MIN_HZ = 125 32 | MEL_MAX_HZ = 7500 33 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 34 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 35 | # Note: original value for EXAMPLE_HOP_SECONDS is 0.96, i.e. no overlapping between adjacent examples 36 | # EXAMPLE_HOP_SECONDS = 0.25 # with zero overlap. 37 | 38 | # Parameters used for embedding postprocessing. 39 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' 40 | PCA_MEANS_NAME = 'pca_means' 41 | QUANTIZE_MIN_VAL = -2.0 42 | QUANTIZE_MAX_VAL = +2.0 43 | 44 | # Hyperparameters used in training. 45 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. 46 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. 47 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. 48 | 49 | # Names of ops, tensors, and features. 50 | INPUT_OP_NAME = 'vggish/input_features' 51 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' 52 | OUTPUT_OP_NAME = 'vggish/embedding' 53 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' 54 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | 3 | 4 | # EmotionTalk 5 | 6 | > An Interactive Chinese Multimodal Emotion Dataset With Rich Annotations. 7 | 8 | [![Python](https://img.shields.io/badge/python-3.9+-blue.svg)]() 9 | [![Project Page](https://img.shields.io/badge/Project-Website-blue.svg)](https://github.com/NKU-HLT/EmotionTalk) 10 | 11 | --- 12 | 13 | 14 | ## 📖 Overview 15 | 16 | We propose EmotionTalk, an interactive Chinese multimodal emotion dataset with rich annotations. This dataset provides multimodal information from 19 actors participating in dyadic conversational settings, incorporating acoustic, visual, and textual modalities. It includes 23.6 hours of speech (19,250 utterances), annotations for 7 utterance-level emotion categories (happy, surprise, sad, disgust, anger, fear, and neutral), 5-dimensional sentiment labels (negative, weakly negative, neutral, weakly positive, and positive) and 4-dimensional speech captions (speaker, speaking style, emotion and overall). The dataset is well-suited for research on unimodal and multimodal emotion recognition, missing modality challenges, and speech captioning tasks. 
To our knowledge, it represents the first high-quality and versatile Chinese dialogue multimodal emotion dataset, which is a valuable contribution to research on cross-cultural emotion analysis and recognition. Additionally, we conduct experiments on EmotionTalk to demonstrate the effectiveness and quality of the dataset. It will be open-source and freely available for all academic purposes. The dataset and codes will be made available at [EmotionTalk](https://github.com/NKU-HLT/EmotionTalk). 17 | 18 | ## 🚀 Getting Started 19 | ### Environment 20 | 21 | ```shell 22 | conda env create -f environment.yml 23 | ``` 24 | 25 | ## 🤗 Dataset Download 26 | 27 | You can access the Emotiontalk dataset on HuggingFace Datasets: 28 | 29 | [https://huggingface.co/datasets/BAAI/Emotiontalk](https://huggingface.co/datasets/BAAI/Emotiontalk) 30 | 31 | ### Tool 32 | openface_win_x64 (https://drive.google.com/file/d/1-O8epcTDYCrRUU_mtXgjrS3OWA4HTp0-/view?usp=share_link -> tools/openface_win_x64) 33 | 34 | You need to follow the steps to run in EmotionTalk/run.sh. 35 | 36 | Please refer to run.sh for more details. 37 | 38 | ## 🙏 Acknowledgements 39 | 40 | This project builds upon prior work from the [zeroQiaoba/MERTools](https://github.com/zeroQiaoba/MERTools) repository. We thank them for their contributions! 41 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ref paper: Tensor Fusion Network for Multimodal Sentiment Analysis 3 | Ref url: https://github.com/Justin1904/TensorFusionNetworks 4 | """ 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | ## 这两个模块都是用在 TFN 中的 (video|audio) 9 | class MLPEncoder(nn.Module): 10 | ''' 11 | The subnetwork that is used in TFN for video and audio in the pre-fusion stage 12 | ''' 13 | 14 | def __init__(self, in_size, hidden_size, dropout): 15 | ''' 16 | Args: 17 | in_size: input dimension 18 | hidden_size: hidden layer dimension 19 | dropout: dropout probability 20 | Output: 21 | (return value in forward) a tensor of shape (batch_size, hidden_size) 22 | ''' 23 | super(MLPEncoder, self).__init__() 24 | # self.norm = nn.BatchNorm1d(in_size) 25 | self.drop = nn.Dropout(p=dropout) 26 | self.linear_1 = nn.Linear(in_size, hidden_size) 27 | self.linear_2 = nn.Linear(hidden_size, hidden_size) 28 | self.linear_3 = nn.Linear(hidden_size, hidden_size) 29 | 30 | def forward(self, x): 31 | ''' 32 | Args: 33 | x: tensor of shape (batch_size, in_size) 34 | ''' 35 | # normed = self.norm(x) 36 | dropped = self.drop(x) 37 | y_1 = F.relu(self.linear_1(dropped)) 38 | y_2 = F.relu(self.linear_2(y_1)) 39 | y_3 = F.relu(self.linear_3(y_2)) 40 | 41 | return y_3 42 | 43 | 44 | # TFN 中的文本编码,额外需要lstm 操作 [感觉是audio|video] 45 | class LSTMEncoder(nn.Module): 46 | ''' 47 | The LSTM-based subnetwork that is used in TFN for text 48 | ''' 49 | 50 | def __init__(self, in_size, hidden_size, dropout, num_layers=1, bidirectional=False): 51 | 52 | super(LSTMEncoder, self).__init__() 53 | 54 | if num_layers == 1: 55 | rnn_dropout = 0.0 56 | else: 57 | rnn_dropout = dropout 58 | 59 | self.rnn = nn.LSTM(in_size, hidden_size, num_layers=num_layers, dropout=rnn_dropout, bidirectional=bidirectional, batch_first=True) 60 | self.dropout = nn.Dropout(dropout) 61 | self.linear_1 = nn.Linear(hidden_size, hidden_size) 62 | 63 | def forward(self, x): 64 | ''' 65 | Args: 66 | x: tensor of shape (batch_size, sequence_len, in_size) 67 | 
因为用的是 final_states ,所以特征的 padding 是放在前面的 68 | ''' 69 | _, final_states = self.rnn(x) 70 | h = self.dropout(final_states[0].squeeze(0)) 71 | y_1 = self.linear_1(h) 72 | return y_1 73 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/attention.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: unimodal encoder + concat + attention fusion 3 | ''' 4 | import torch 5 | import torch.nn as nn 6 | from .modules.encoder import MLPEncoder, LSTMEncoder 7 | 8 | class Attention(nn.Module): 9 | def __init__(self, args): 10 | super(Attention, self).__init__() 11 | 12 | text_dim = args.text_dim 13 | audio_dim = args.audio_dim 14 | video_dim = args.video_dim 15 | output_dim1 = args.output_dim1 16 | output_dim2 = args.output_dim2 17 | dropout = args.dropout 18 | hidden_dim = args.hidden_dim 19 | self.grad_clip = args.grad_clip 20 | 21 | if args.feat_type in ['utt']: 22 | self.audio_encoder = MLPEncoder(audio_dim, hidden_dim, dropout) 23 | self.text_encoder = MLPEncoder(text_dim, hidden_dim, dropout) 24 | self.video_encoder = MLPEncoder(video_dim, hidden_dim, dropout) 25 | elif args.feat_type in ['frm_align', 'frm_unalign']: 26 | self.audio_encoder = LSTMEncoder(audio_dim, hidden_dim, dropout) 27 | self.text_encoder = LSTMEncoder(text_dim, hidden_dim, dropout) 28 | self.video_encoder = LSTMEncoder(video_dim, hidden_dim, dropout) 29 | 30 | self.attention_mlp = MLPEncoder(hidden_dim * 3, hidden_dim, dropout) 31 | 32 | self.fc_att = nn.Linear(hidden_dim, 3) 33 | self.fc_out_1 = nn.Linear(hidden_dim, output_dim1) 34 | self.fc_out_2 = nn.Linear(hidden_dim, output_dim2) 35 | 36 | def forward(self, batch): 37 | ''' 38 | support feat_type: utt | frm-align | frm-unalign 39 | ''' 40 | audio_hidden = self.audio_encoder(batch['audios']) # [32, 128] 41 | text_hidden = self.text_encoder(batch['texts']) # [32, 128] 42 | video_hidden = self.video_encoder(batch['videos']) # [32, 128] 43 | 44 | multi_hidden1 = torch.cat([audio_hidden, text_hidden, video_hidden], dim=1) # [32, 384] 45 | attention = self.attention_mlp(multi_hidden1) 46 | attention = self.fc_att(attention) 47 | attention = torch.unsqueeze(attention, 2) # [32, 3, 1] 48 | 49 | multi_hidden2 = torch.stack([audio_hidden, text_hidden, video_hidden], dim=2) # [32, 128, 3] 50 | fused_feat = torch.matmul(multi_hidden2, attention) # [32, 128, 3] * [32, 3, 1] = [32, 128, 1] 51 | 52 | features = fused_feat.squeeze(axis=2) # [32, 128] => 解决batch=1报错的问题 53 | emos_out = self.fc_out_1(features) 54 | vals_out = self.fc_out_2(features) 55 | interloss = torch.tensor(0).cuda() 56 | 57 | return features, emos_out, vals_out, interloss 58 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/mer2023.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | from toolkit.globals import * 5 | from toolkit.utils.read_files import * 6 | from toolkit.utils.functions import * 7 | 8 | def normalize_dataset_format(data_root, save_root): 9 | ## input path 10 | train_video, train_label = os.path.join(data_root, 'train'), os.path.join(data_root, 'train-label.csv') 11 | test1_video, test1_label = os.path.join(data_root, 'test1'), os.path.join(data_root, 'test1-label.csv') 12 | test2_video, test2_label = os.path.join(data_root, 'test2'), os.path.join(data_root, 'test2-label.csv') 13 | test3_video, test3_label = os.path.join(data_root, 
'test3'), os.path.join(data_root, 'test3-label.csv') 14 | 15 | ## output path 16 | save_video = os.path.join(save_root, 'video') 17 | save_label = os.path.join(save_root, 'label-6way.npz') 18 | if not os.path.exists(save_root): os.makedirs(save_root) 19 | if not os.path.exists(save_video): os.makedirs(save_video) 20 | 21 | ## generate label path 22 | whole_corpus = {} 23 | for name, video_root, label_path in [('train', train_video, train_label), 24 | ('test1', test1_video, test1_label), 25 | ('test2', test2_video, test2_label), 26 | ('test3', test3_video, test3_label)]: 27 | 28 | whole_corpus[name] = {} 29 | names = func_read_key_from_csv(label_path, 'name') 30 | emos = func_read_key_from_csv(label_path, 'discrete') 31 | vals = func_read_key_from_csv(label_path, 'valence') 32 | # process for test3 [test3 do not have vals] 33 | if name == 'test3': vals = [-10] * len(names) 34 | print (f'{name}: sample number: {len(names)}') 35 | for ii in range(len(names)): 36 | whole_corpus[name][names[ii]] = {'emo': emos[ii], 'val': vals[ii]} 37 | # copy video 38 | video_path = glob.glob(os.path.join(video_root, f'{names[ii]}*'))[0] 39 | video_name = os.path.basename(video_path) 40 | new_path = os.path.join(save_video, video_name) 41 | shutil.copy(video_path, new_path) 42 | 43 | np.savez_compressed(save_label, 44 | train_corpus=whole_corpus['train'], 45 | test1_corpus=whole_corpus['test1'], 46 | test2_corpus=whole_corpus['test2'], 47 | test3_corpus=whole_corpus['test3']) 48 | 49 | if __name__ == '__main__': 50 | data_root = '/data/lianzheng/chinese-mer-2023/mer2023-dataset' 51 | save_root = '/data/lianzheng/chinese-mer-2023/mer2023-dataset-process' 52 | normalize_dataset_format(data_root, save_root) 53 | -------------------------------------------------------------------------------- /EmotionTalk/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ############ For LINUX ############## 4 | DATA_DIR = { 5 | 'mm4': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm-process', 6 | 'mm7': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm-process', 7 | } 8 | PATH_TO_RAW_AUDIO = { 9 | 'mm4': os.path.join(DATA_DIR['mm4'], 'subaudio'), 10 | 'mm7': os.path.join(DATA_DIR['mm4'], 'subaudio'), 11 | } 12 | PATH_TO_RAW_VIDEO = { 13 | 'mm4': os.path.join(DATA_DIR['mm4'], 'subvideo-tgt'), 14 | 'mm7': os.path.join(DATA_DIR['mm4'], 'subvideo-tgt'), 15 | } 16 | PATH_TO_RAW_FACE = { 17 | 18 | 'mm4': os.path.join(DATA_DIR['mm4'], 'openface_face'), 19 | 'mm7': os.path.join(DATA_DIR['mm4'], 'openface_face'), 20 | } 21 | PATH_TO_TRANSCRIPTIONS = { 22 | 'mm4': os.path.join(DATA_DIR['mm4'], 'transcription.csv'), 23 | 'mm7': os.path.join(DATA_DIR['mm4'], 'transcription.csv'), 24 | } 25 | PATH_TO_FEATURES = { 26 | 'mm4': os.path.join(DATA_DIR['mm4'], 'features'), 27 | 'mm7': os.path.join(DATA_DIR['mm4'], 'features'), 28 | 'MM': os.path.join(DATA_DIR['mm4'], 'features'), 29 | } 30 | PATH_TO_LABEL = { 31 | 'mm4': os.path.join(DATA_DIR['mm4'], 'mm_label4.npz'), 32 | 'mm7': os.path.join(DATA_DIR['mm7'], 'mm_label.npz'), 33 | 'MM': os.path.join(DATA_DIR['mm7'], 'mm_label.npz'), 34 | } 35 | 36 | # pre-trained models, including supervised and unsupervised 37 | # PATH_TO_PRETRAINED_MODELS = '/mnt/real_sda/wangxuechen_space/code/MERBench/models/' 38 | PATH_TO_PRETRAINED_MODELS = '/mnt/real_sda/MERTools-master/MERBench/feature_extraction/checkpoint' 39 | PATH_TO_OPENSMILE = './tools/opensmile-2.3.0/' 40 | PATH_TO_FFMPEG = '/mnt/real_sda/ffmpeg-4.4.1-i686-static/ffmpeg' 41 | 42 | # 
dir 43 | SAVED_ROOT = os.path.join('./saved') 44 | MODEL_DIR = os.path.join(SAVED_ROOT, 'model') 45 | LOG_DIR = os.path.join(SAVED_ROOT, 'log') 46 | PREDICTION_DIR = os.path.join(SAVED_ROOT, 'prediction') 47 | FUSION_DIR = os.path.join(SAVED_ROOT, 'fusion') 48 | SUBMISSION_DIR = os.path.join(SAVED_ROOT, 'submission') 49 | 50 | 51 | ############ For Windows [OpenFace to extract face] ############## 52 | DATA_DIR_Win = { 53 | 'mm4': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm4-process', 54 | 'mm7': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm4-process', 55 | 'MM': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm4-process', 56 | } 57 | 58 | PATH_TO_RAW_FACE_Win = { 59 | 'mm4': os.path.join(DATA_DIR_Win['mm4'], 'video'), 60 | 'mm7': os.path.join(DATA_DIR_Win['mm4'], 'video'), 61 | } 62 | 63 | PATH_TO_FEATURES_Win = { 64 | 'mm4': os.path.join(DATA_DIR_Win['mm4'], 'features'), 65 | 'mm7': os.path.join(DATA_DIR_Win['mm4'], 'features'), 66 | 'MM': os.path.join(DATA_DIR_Win['MM'], 'features'), 67 | } 68 | 69 | PATH_TO_OPENFACE_Win = "H:\\desktop\\Multimedia-Transformer\\MERBench-master\\tools\\openface_win_x64" 70 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/dataset.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import cv2 4 | import glob 5 | import numpy as np 6 | from PIL import Image 7 | from skimage import io 8 | from skimage import img_as_float 9 | import torch.utils.data as data 10 | 11 | 12 | class FaceDataset(data.Dataset): 13 | def __init__(self, vid, face_dir, transform=None): 14 | super(FaceDataset, self).__init__() 15 | self.vid = vid 16 | self.path = os.path.join(face_dir, vid) 17 | self.transform = transform 18 | self.frames = self.get_frames() 19 | 20 | def get_frames(self): 21 | ## image format 22 | # frames = glob.glob(os.path.join(self.path, '*')) 23 | 24 | ## npy format 25 | npypath = os.path.join(self.path, f'{self.vid}.npy') 26 | assert os.path.exists(npypath), f'Error: {self.vid} does not exist frames.npy' 27 | frames = np.load(npypath) 28 | 29 | return frames 30 | 31 | def __len__(self): 32 | return len(self.frames) 33 | 34 | def __getitem__(self, index): 35 | ## image format 36 | # path = self.frames[index] 37 | # img = Image.open(path) 38 | # name = os.path.basename(path)[:-4] 39 | 40 | ## npy format [cv2 -> Image] 41 | img = self.frames[index] 42 | img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) 43 | name = '%08d' %(index) 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img, name 48 | 49 | 50 | class FaceDatasetForEmoNet(data.Dataset): 51 | def __init__(self, vid, face_dir, transform=None, augmentor=None): 52 | super(FaceDatasetForEmoNet, self).__init__() 53 | self.vid = vid 54 | self.path = os.path.join(face_dir, vid) 55 | self.augmentor = augmentor 56 | self.transform = transform 57 | self.frames = self.get_frames() 58 | 59 | def get_frames(self): 60 | ## image format 61 | # frames = glob.glob(os.path.join(self.path, '*')) 62 | 63 | ## npy format 64 | npypath = os.path.join(self.path, f'{self.vid}.npy') 65 | assert os.path.exists(npypath), f'error video: {self.vid}' 66 | frames = np.load(npypath) 67 | return frames 68 | 69 | def __len__(self): 70 | return len(self.frames) 71 | 72 | def __getitem__(self, index): 73 | ## image format 74 | # path = self.frames[index] 75 | # img = io.imread(path) 76 | # name = os.path.basename(path)[:-4] 77 | 78 | ## npy 
format [cv2 -> skimage] 79 | img = self.frames[index] 80 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 81 | name = '%08d' %(index) 82 | 83 | if self.augmentor is not None: 84 | img = self.augmentor(img)[0] 85 | if self.transform is not None: 86 | img = self.transform(img) 87 | 88 | return img, name -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/utils/benchmark_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utilties shared among the benchmarking protocols 3 | """ 4 | import os 5 | import sys 6 | import six 7 | 8 | import torchvision.transforms as transforms 9 | 10 | 11 | def compose_transforms(meta, resize=256, center_crop=True, 12 | override_meta_imsize=False): 13 | """Compose preprocessing transforms for model 14 | 15 | The imported models use a range of different preprocessing options, 16 | depending on how they were originally trained. Models trained in MatConvNet 17 | typically require input images that have been scaled to [0,255], rather 18 | than the [0,1] range favoured by PyTorch. 19 | 20 | Args: 21 | meta (dict): model preprocessing requirements 22 | resize (int) [256]: resize the input image to this size 23 | center_crop (bool) [True]: whether to center crop the image 24 | override_meta_imsize (bool) [False]: if true, use the value of `resize` 25 | to select the image input size, rather than the properties contained 26 | in meta (this option only applies when center cropping is not used. 27 | 28 | Return: 29 | (transforms.Compose): Composition of preprocessing transforms 30 | """ 31 | normalize = transforms.Normalize(mean=meta['mean'], std=meta['std']) 32 | im_size = meta['imageSize'] 33 | assert im_size[0] == im_size[1], 'expected square image size' 34 | if center_crop: 35 | transform_list = [transforms.Resize(resize), 36 | transforms.CenterCrop(size=(im_size[0], im_size[1]))] 37 | else: 38 | if override_meta_imsize: 39 | im_size = (resize, resize) 40 | transform_list = [transforms.Resize(size=(im_size[0], im_size[1]))] 41 | transform_list += [transforms.ToTensor()] 42 | if meta['std'] == [1, 1, 1]: # common amongst mcn models 43 | transform_list += [lambda x: x * 255.0] 44 | transform_list.append(normalize) 45 | return transforms.Compose(transform_list) 46 | 47 | 48 | def load_module_2or3(model_name, model_def_path): 49 | """Load model definition module in a manner that is compatible with 50 | both Python2 and Python3 51 | 52 | Args: 53 | model_name: The name of the model to be loaded 54 | model_def_path: The filepath of the module containing the definition 55 | 56 | Return: 57 | The loaded python module.""" 58 | if six.PY3: 59 | import importlib.util 60 | spec = importlib.util.spec_from_file_location(model_name, model_def_path) 61 | mod = importlib.util.module_from_spec(spec) 62 | spec.loader.exec_module(mod) 63 | else: 64 | import importlib 65 | dirname = os.path.dirname(model_def_path) 66 | sys.path.insert(0, dirname) 67 | module_name = os.path.splitext(os.path.basename(model_def_path))[0] 68 | mod = importlib.import_module(module_name) 69 | return mod 70 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/run_fer_benchmarks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """This module evaluates imported PyTorch models on fer2013 3 | """ 4 | 5 | 
import os 6 | import argparse 7 | from os.path import join as pjoin 8 | from fer2013.fer import fer2013_benchmark 9 | from utils.benchmark_helpers import load_module_2or3 10 | 11 | # MODEL_DIR = os.path.expanduser('~/data/models/pytorch/mcn_imports') 12 | # FER_DIR = os.path.expanduser('~/data/datasets/fer2013+') 13 | MODEL_DIR = './pretrained/' 14 | FER_DIR = os.path.expanduser('~/Affective Computing/Dataset/FERPlus') 15 | 16 | CACHE_DIR = 'res_cache/fer2013+' 17 | 18 | def load_model(model_name): 19 | """Load imoprted PyTorch model by name 20 | 21 | Args: 22 | model_name (str): the name of the model to be loaded 23 | 24 | Return: 25 | nn.Module: the loaded network 26 | """ 27 | model_def_path = pjoin('model', model_name + '.py') 28 | weights_path = pjoin(MODEL_DIR, model_name + '.pth') 29 | mod = load_module_2or3(model_name, model_def_path) 30 | func = getattr(mod, model_name) 31 | net = func(weights_path=weights_path) 32 | return net 33 | 34 | def run_benchmarks(gpus, refresh, fer_plus): 35 | """Run bencmarks for imported models 36 | 37 | Args: 38 | gpus (str): comma separated gpu device identifiers 39 | refresh (bool): whether to overwrite the results of existing runs 40 | fer_plus (bool): whether to evaluate on the ferplus benchmark, 41 | rather than the standard fer benchmark. 42 | """ 43 | 44 | # Select models (and their batch sizes) to include in the benchmark. 45 | if fer_plus: 46 | model_list = [ 47 | ('resnet50_ferplus_dag', 32), 48 | ('senet50_ferplus_dag', 32), 49 | ] 50 | else: 51 | model_list = [ 52 | ('alexnet_face_fer_bn_dag', 32), 53 | ('vgg_m_face_bn_fer_dag', 32), 54 | ('vgg_vd_face_fer_dag', 32), 55 | ] 56 | 57 | if not os.path.exists(CACHE_DIR): 58 | os.makedirs(CACHE_DIR) 59 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 60 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpus) 61 | 62 | opts = {'data_dir': FER_DIR, 'refresh_cache': refresh} 63 | 64 | for model_name, batch_size in model_list: 65 | cache_name = model_name 66 | if fer_plus: 67 | cache_name = cache_name + 'fer_plus' 68 | opts['res_cache'] = '{}/{}.pth'.format(CACHE_DIR, cache_name) 69 | opts['fer_plus'] = fer_plus 70 | model = load_model(model_name) 71 | print('benchmarking {}'.format(model_name)) 72 | fer2013_benchmark(model, batch_size=batch_size, **opts) 73 | 74 | parser = argparse.ArgumentParser(description='Run PyTorch benchmarks.') 75 | parser.add_argument('--gpus', nargs='?', dest='gpus', 76 | help='select gpu device id') 77 | parser.add_argument('--refresh', dest='refresh', action='store_true', 78 | help='refresh results cache') 79 | parser.add_argument('--ferplus', dest='ferplus', action='store_true', 80 | help='run ferplus (rather than fer) benchmarks') 81 | parser.set_defaults(gpus=None) 82 | parser.set_defaults(refresh=False) 83 | parsed = parser.parse_args() 84 | 85 | if __name__ == '__main__': 86 | run_benchmarks(parsed.gpus, parsed.refresh, parsed.ferplus) 87 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.metrics import mean_squared_error 4 | from sklearn.metrics import f1_score, accuracy_score 5 | 6 | from ..globals import * 7 | 8 | # 综合维度和离散的评价指标 9 | def overall_metric(emo_fscore, val_mse): 10 | final_score = emo_fscore - val_mse * 0.25 11 | return final_score 12 | 13 | 14 | # 只返回 metric 值,用于模型筛选 15 | def gain_metric_from_results(eval_results, metric_name='emoval'): 16 | 17 | 
if metric_name == 'emoval': 18 | fscore = eval_results['emofscore'] 19 | valmse = eval_results['valmse'] 20 | overall = overall_metric(fscore, valmse) 21 | sort_metric = overall 22 | elif metric_name == 'emo': 23 | fscore = eval_results['emofscore'] 24 | sort_metric = fscore 25 | elif metric_name == 'val': 26 | valmse = eval_results['valmse'] 27 | sort_metric = -valmse 28 | elif metric_name == 'loss': 29 | loss = eval_results['loss'] 30 | sort_metric = -loss 31 | 32 | return sort_metric 33 | 34 | 35 | def gain_cv_results(folder_save): 36 | 37 | # find all keys 38 | whole_keys = list(folder_save[0].keys()) 39 | 40 | cv_acc, cv_fscore, cv_valmse = -100, -100, -100 41 | if 'eval_emoacc' in whole_keys: 42 | cv_acc = np.mean([epoch_save['eval_emoacc'] for epoch_save in folder_save]) 43 | if 'eval_emofscore' in whole_keys: 44 | cv_fscore = np.mean([epoch_save['eval_emofscore'] for epoch_save in folder_save]) 45 | if 'eval_valmse' in whole_keys: 46 | cv_valmse = np.mean([epoch_save['eval_valmse'] for epoch_save in folder_save]) 47 | 48 | # 只显示存在的部分信息 [与test输出是一致的] 49 | outputs = [] 50 | if cv_fscore != -100: outputs.append(f'f1:{cv_fscore:.4f}') 51 | if cv_acc != -100: outputs.append(f'acc:{cv_acc:.4f}') 52 | if cv_valmse != -100: outputs.append(f'val:{cv_valmse:.4f}') 53 | outputs = "_".join(outputs) 54 | return outputs 55 | 56 | 57 | def average_folder_for_emos(folder_save, testname): 58 | 59 | try: 60 | # 因为所有test set的 shuffle都是false的,因此不同folder的结果是对应的 61 | labels = folder_save[0][f'{testname}_emolabels'] 62 | except: 63 | return [], [] 64 | 65 | num_samples = len(labels) 66 | num_folders = len(folder_save) 67 | 68 | whole_probs = [] 69 | for ii in range(num_folders): 70 | emoprobs = folder_save[ii][f'{testname}_emoprobs'] 71 | whole_probs.append(emoprobs) 72 | whole_probs = np.array(whole_probs) 73 | 74 | avg_preds = [] 75 | for ii in range(num_samples): 76 | per_probs = whole_probs[:, ii, :] 77 | avg_emoprob = np.mean(per_probs, axis=0) 78 | avg_preds.append(avg_emoprob) 79 | 80 | return labels, avg_preds 81 | 82 | # 计算 name -> val 83 | def average_folder_for_vals(folder_save, testname): 84 | 85 | try: 86 | labels = folder_save[0][f'{testname}_vallabels'] 87 | except: 88 | return [], [] 89 | 90 | num_folders = len(folder_save) 91 | 92 | whole_preds = [] 93 | for ii in range(num_folders): 94 | valpreds = folder_save[ii][f'{testname}_valpreds'] 95 | whole_preds.append(valpreds) 96 | whole_preds = np.array(whole_preds) 97 | 98 | avg_preds = np.mean(whole_preds, axis=0) 99 | return labels, avg_preds 100 | 101 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.metrics import mean_squared_error 4 | from sklearn.metrics import f1_score, accuracy_score 5 | 6 | from ..globals import * 7 | 8 | # 综合维度和离散的评价指标 9 | def overall_metric(emo_fscore, val_mse): 10 | final_score = emo_fscore - val_mse * 0.25 11 | return final_score 12 | 13 | 14 | # 只返回 metric 值,用于模型筛选 15 | def gain_metric_from_results(eval_results, metric_name='emoval'): 16 | 17 | if metric_name == 'emoval': 18 | fscore = eval_results['emofscore'] 19 | valmse = eval_results['valmse'] 20 | overall = overall_metric(fscore, valmse) 21 | sort_metric = overall 22 | elif metric_name == 'emo': 23 | fscore = eval_results['emofscore'] 24 | sort_metric = fscore 25 | elif metric_name == 'val': 26 | valmse = eval_results['valmse'] 27 
| sort_metric = -valmse 28 | elif metric_name == 'loss': 29 | loss = eval_results['loss'] 30 | sort_metric = -loss 31 | 32 | return sort_metric 33 | 34 | 35 | def gain_cv_results(folder_save): 36 | 37 | # find all keys 38 | whole_keys = list(folder_save[0].keys()) 39 | 40 | cv_acc, cv_fscore, cv_valmse = -100, -100, -100 41 | if 'eval_emoacc' in whole_keys: 42 | cv_acc = np.mean([epoch_save['eval_emoacc'] for epoch_save in folder_save]) 43 | if 'eval_emofscore' in whole_keys: 44 | cv_fscore = np.mean([epoch_save['eval_emofscore'] for epoch_save in folder_save]) 45 | if 'eval_valmse' in whole_keys: 46 | cv_valmse = np.mean([epoch_save['eval_valmse'] for epoch_save in folder_save]) 47 | 48 | # 只显示存在的部分信息 [与test输出是一致的] 49 | outputs = [] 50 | if cv_fscore != -100: outputs.append(f'f1:{cv_fscore:.4f}') 51 | if cv_acc != -100: outputs.append(f'acc:{cv_acc:.4f}') 52 | if cv_valmse != -100: outputs.append(f'val:{cv_valmse:.4f}') 53 | outputs = "_".join(outputs) 54 | return outputs 55 | 56 | 57 | def average_folder_for_emos(folder_save, testname): 58 | 59 | try: 60 | # 因为所有test set的 shuffle都是false的,因此不同folder的结果是对应的 61 | labels = folder_save[0][f'{testname}_emolabels'] 62 | except: 63 | return [], [] 64 | 65 | num_samples = len(labels) 66 | num_folders = len(folder_save) 67 | 68 | whole_probs = [] 69 | for ii in range(num_folders): 70 | emoprobs = folder_save[ii][f'{testname}_emoprobs'] 71 | whole_probs.append(emoprobs) 72 | whole_probs = np.array(whole_probs) 73 | 74 | avg_preds = [] 75 | for ii in range(num_samples): 76 | per_probs = whole_probs[:, ii, :] 77 | avg_emoprob = np.mean(per_probs, axis=0) 78 | avg_preds.append(avg_emoprob) 79 | 80 | return labels, avg_preds 81 | 82 | # 计算 name -> val 83 | def average_folder_for_vals(folder_save, testname): 84 | 85 | try: 86 | labels = folder_save[0][f'{testname}_vallabels'] 87 | except: 88 | return [], [] 89 | 90 | num_folders = len(folder_save) 91 | 92 | whole_preds = [] 93 | for ii in range(num_folders): 94 | valpreds = folder_save[ii][f'{testname}_valpreds'] 95 | whole_preds.append(valpreds) 96 | whole_preds = np.array(whole_preds) 97 | 98 | avg_preds = np.mean(whole_preds, axis=0) 99 | return labels, avg_preds 100 | 101 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/model/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class BasicConv(nn.Module): 7 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 8 | super(BasicConv, self).__init__() 9 | self.out_channels = out_planes 10 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 12 | self.relu = nn.ReLU() if relu else None 13 | 14 | def forward(self, x): 15 | x = self.conv(x) 16 | if self.bn is not None: 17 | x = self.bn(x) 18 | if self.relu is not None: 19 | x = self.relu(x) 20 | return x 21 | 22 | 23 | class Flatten(nn.Module): 24 | def forward(self, x): 25 | return x.view(x.size(0), -1) 26 | 27 | 28 | class ChannelGate(nn.Module): 29 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): 30 | super(ChannelGate, self).__init__() 31 | self.gate_channels = gate_channels 32 | self.mlp = 
nn.Sequential(Flatten(), 33 | nn.Linear(gate_channels, gate_channels // reduction_ratio), 34 | nn.ReLU(), 35 | nn.Linear(gate_channels // reduction_ratio, gate_channels)) 36 | self.pool_types = pool_types 37 | 38 | def forward(self, x): 39 | channel_att_sum = None 40 | for pool_type in self.pool_types: 41 | if pool_type == 'avg': 42 | avg_pool = F.avg_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 43 | channel_att_raw = self.mlp(avg_pool ) 44 | elif pool_type == 'max': 45 | max_pool = F.max_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 46 | channel_att_raw = self.mlp(max_pool) 47 | if channel_att_sum is None: 48 | channel_att_sum = channel_att_raw 49 | else: 50 | channel_att_sum = channel_att_sum + channel_att_raw 51 | 52 | scale = torch.sigmoid(channel_att_sum).unsqueeze(2).unsqueeze(3).expand_as(x) 53 | return x * scale 54 | 55 | 56 | class ChannelPool(nn.Module): 57 | def forward(self, x): 58 | return torch.cat((torch.max(x, 1)[0].unsqueeze(1), torch.mean(x, 1).unsqueeze(1)), dim=1) 59 | 60 | 61 | class SpatialGate(nn.Module): 62 | def __init__(self): 63 | super(SpatialGate, self).__init__() 64 | kernel_size = 7 65 | self.compress = ChannelPool() 66 | self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size-1) // 2, relu=False) 67 | 68 | def forward(self, x): 69 | x_compress = self.compress(x) 70 | x_out = self.spatial(x_compress) 71 | scale = torch.sigmoid(x_out) 72 | return x * scale 73 | 74 | 75 | class CBAM(nn.Module): 76 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): 77 | super(CBAM, self).__init__() 78 | self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types) 79 | self.SpatialGate = SpatialGate() 80 | 81 | def forward(self, x): 82 | x_out = self.ChannelGate(x) 83 | x_out = self.SpatialGate(x_out) 84 | 85 | return x_out 86 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/data/feat_data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.data import Dataset 4 | from toolkit.utils.read_data import * 5 | 6 | class Data_Feat(Dataset): 7 | def __init__(self, args, names, labels): 8 | 9 | # analyze path 10 | self.names = names 11 | self.labels = labels 12 | feat_root = config.PATH_TO_FEATURES[args.dataset] 13 | audio_root = os.path.join(feat_root, args.audio_feature) 14 | text_root = os.path.join(feat_root, args.text_feature ) 15 | video_root = os.path.join(feat_root, args.video_feature) 16 | print (f'audio feature root: {audio_root}') 17 | 18 | # --------------- temporal test --------------- 19 | # for name in names: assert os.path.exists(os.path.join(audio_root, name+'.npy')) 20 | 21 | # analyze params 22 | self.feat_type = args.feat_type 23 | self.feat_scale = args.feat_scale # 特征预压缩 24 | assert self.feat_scale >= 1 25 | assert self.feat_type in ['utt', 'frm_align', 'frm_unalign'] 26 | 27 | # read datas (reduce __getitem__ durations) 28 | audios, self.adim = func_read_multiprocess(audio_root, self.names, read_type='feat') 29 | texts, self.tdim = func_read_multiprocess(text_root, self.names, read_type='feat') 30 | videos, self.vdim = func_read_multiprocess(video_root, self.names, read_type='feat') 31 | 32 | ## read batch (reduce collater durations) 33 | # step1: pre-compress features 34 | audios, texts, videos = feature_scale_compress(audios, texts, videos, self.feat_scale) 35 | # step2: align to batch 36 | if self.feat_type == 
'utt': # -> 每个样本每个模态的特征压缩到句子级别 37 | audios, texts, videos = align_to_utt(audios, texts, videos) 38 | elif self.feat_type == 'frm_align': 39 | audios, texts, videos = align_to_text(audios, texts, videos) # 模态级别对齐 40 | audios, texts, videos = pad_to_maxlen_pre_modality(audios, texts, videos) # 样本级别对齐 41 | elif self.feat_type == 'frm_unalign': 42 | audios, texts, videos = pad_to_maxlen_pre_modality(audios, texts, videos) # 样本级别对齐 43 | self.audios, self.texts, self.videos = audios, texts, videos 44 | 45 | 46 | def __len__(self): 47 | return len(self.names) 48 | 49 | 50 | def __getitem__(self, index): 51 | instance = dict( 52 | audio = self.audios[index], 53 | text = self.texts[index], 54 | video = self.videos[index], 55 | emo = self.labels[index]['emo'], 56 | val = self.labels[index]['val'], 57 | name = self.names[index], 58 | ) 59 | return instance 60 | 61 | 62 | def collater(self, instances): 63 | audios = [instance['audio'] for instance in instances] 64 | texts = [instance['text'] for instance in instances] 65 | videos = [instance['video'] for instance in instances] 66 | 67 | batch = dict( 68 | audios = torch.FloatTensor(np.array(audios)), 69 | texts = torch.FloatTensor(np.array(texts)), 70 | videos = torch.FloatTensor(np.array(videos)), 71 | ) 72 | 73 | emos = torch.LongTensor([instance['emo'] for instance in instances]) 74 | vals = torch.FloatTensor([instance['val'] for instance in instances]) 75 | names = [instance['name'] for instance in instances] 76 | 77 | return batch, emos, vals, names 78 | 79 | 80 | def get_featdim(self): 81 | print (f'audio dimension: {self.adim}; text dimension: {self.tdim}; video dimension: {self.vdim}') 82 | return self.adim, self.tdim, self.vdim 83 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/simsv2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from toolkit.utils.functions import * 4 | from toolkit.utils.read_files import * 5 | 6 | def func_merge_id_to_path(video_id, clip_id, video_root): 7 | video_path = os.path.join(video_root, video_id, clip_id+'.mp4') 8 | return video_path 9 | 10 | def func_convert_path_to_newname(video_id, clip_id): 11 | newname = f'{video_id}_{clip_id}' 12 | return newname 13 | 14 | # label_path -> (video_paths, labels) 15 | def read_labels(label_path, video_root): 16 | 17 | # read all items 18 | video_ids = func_read_key_from_csv(label_path, 'video_id') 19 | clip_ids = func_read_key_from_csv(label_path, 'clip_id') 20 | chis = func_read_key_from_csv(label_path, 'text') 21 | labels = func_read_key_from_csv(label_path, 'label') 22 | modes = func_read_key_from_csv(label_path, 'mode') 23 | 24 | print (f'label range -> min:{min(labels)} max:{max(labels)}') 25 | print (f'whole sample number: {len(labels)}') 26 | print ('modes: ', set(modes)) 27 | 28 | newnames, videopaths = [], [] 29 | for ii in range(len(video_ids)): 30 | newname = func_convert_path_to_newname(video_ids[ii], clip_ids[ii]) 31 | videopath = func_merge_id_to_path(video_ids[ii], clip_ids[ii], video_root) 32 | newnames.append(newname) 33 | videopaths.append(videopath) 34 | print (f'whole sample number: {len(set(newnames))}') 35 | return chis, labels, modes, videopaths, newnames 36 | 37 | 38 | # ------------------- main process ------------------- 39 | def normalize_dataset_format(data_root, save_root): 40 | # gain paths 41 | video_root = os.path.join(data_root, 'Raw') 42 | label_path = os.path.join(data_root, 'meta.csv') 43 | 44 | # read 
all items 45 | chis, labels, modes, videopaths, newnames = read_labels(label_path, video_root) 46 | 47 | ## output path 48 | save_video = os.path.join(save_root, 'video') 49 | save_label = os.path.join(save_root, 'label.npz') 50 | save_trans = os.path.join(save_root, 'transcription.csv') 51 | if not os.path.exists(save_root): os.makedirs(save_root) 52 | if not os.path.exists(save_video): os.makedirs(save_video) 53 | 54 | ## generate new transcripts 55 | name2key = {} 56 | for ii, newname in enumerate(newnames): 57 | name2key[newname] = [chis[ii]] 58 | func_write_key_to_csv(save_trans, newnames, name2key, ['chinese']) 59 | 60 | ## copy videos 61 | for ii, videopath in enumerate(videopaths): 62 | assert videopath.endswith('.mp4') 63 | savepath = os.path.join(save_video, newnames[ii]+'.mp4') 64 | shutil.copy(videopath, savepath) 65 | 66 | ## generate label path 67 | whole_corpus = {} 68 | for ii, newname in enumerate(newnames): 69 | mode = modes[ii] # [train, valid, test] 70 | if mode not in whole_corpus: 71 | whole_corpus[mode] = {} 72 | whole_corpus[mode][newname] = {'emo': 0, 'val': labels[ii]} 73 | 74 | np.savez_compressed(save_label, 75 | train_corpus=whole_corpus['train'], 76 | val_corpus=whole_corpus['valid'], 77 | test_corpus=whole_corpus['test']) 78 | 79 | if __name__ == '__main__': 80 | data_root = 'I:\\CH-SIMS-v2\\zip\\supervised\\ch-simsv2s' 81 | save_root = 'E:\\Dataset\\simsv2-process' 82 | normalize_dataset_format(data_root, save_root) 83 | 84 | # data_root = 'E:\\Dataset\\simsv2-process' 85 | # trans_path = os.path.join(data_root, 'transcription.csv') 86 | # polish_path = os.path.join(data_root, 'transcription-engchi-polish.csv') 87 | # func_translate_transcript_polish_merge(trans_path, polish_path) 88 | # func_translate_transcript_polish_merge(polish_path, '') 89 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/util.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import re 4 | import pandas as pd 5 | import numpy as np 6 | import struct 7 | 8 | ## for OPENFACE 9 | ## reference: https://gist.github.com/btlorch/6d259bfe6b753a7a88490c0607f07ff8 10 | def read_hog(filename, batch_size=5000): 11 | """ 12 | Read HoG features file created by OpenFace. 13 | For each frame, OpenFace extracts 12 * 12 * 31 HoG features, i.e., num_features = 4464. These features are stored in row-major order. 
14 | :param filename: path to .hog file created by OpenFace 15 | :param batch_size: how many rows to read at a time 16 | :return: is_valid, hog_features 17 | is_valid: ndarray of shape [num_frames] 18 | hog_features: ndarray of shape [num_frames, num_features] 19 | """ 20 | all_feature_vectors = [] 21 | with open(filename, "rb") as f: 22 | num_cols, = struct.unpack("i", f.read(4)) # 12 23 | num_rows, = struct.unpack("i", f.read(4)) # 12 24 | num_channels, = struct.unpack("i", f.read(4)) # 31 25 | 26 | # The first four bytes encode a boolean value whether the frame is valid 27 | num_features = 1 + num_rows * num_cols * num_channels 28 | feature_vector = struct.unpack("{}f".format(num_features), f.read(num_features * 4)) 29 | feature_vector = np.array(feature_vector).reshape((1, num_features)) # [1, 4464+1] 30 | all_feature_vectors.append(feature_vector) 31 | 32 | # Every frame contains a header of four float values: num_cols, num_rows, num_channels, is_valid 33 | num_floats_per_feature_vector = 4 + num_rows * num_cols * num_channels 34 | # Read in batches of given batch_size 35 | num_floats_to_read = num_floats_per_feature_vector * batch_size 36 | # Multiply by 4 because of float32 37 | num_bytes_to_read = num_floats_to_read * 4 38 | 39 | while True: 40 | bytes = f.read(num_bytes_to_read) 41 | # For comparison how many bytes were actually read 42 | num_bytes_read = len(bytes) 43 | assert num_bytes_read % 4 == 0, "Number of bytes read does not match with float size" 44 | num_floats_read = num_bytes_read // 4 45 | assert num_floats_read % num_floats_per_feature_vector == 0, "Number of bytes read does not match with feature vector size" 46 | num_feature_vectors_read = num_floats_read // num_floats_per_feature_vector 47 | 48 | feature_vectors = struct.unpack("{}f".format(num_floats_read), bytes) 49 | # Convert to array 50 | feature_vectors = np.array(feature_vectors).reshape((num_feature_vectors_read, num_floats_per_feature_vector)) 51 | # Discard the first three values in each row (num_cols, num_rows, num_channels) 52 | feature_vectors = feature_vectors[:, 3:] 53 | # Append to list of all feature vectors that have been read so far 54 | all_feature_vectors.append(feature_vectors) 55 | 56 | if num_bytes_read < num_bytes_to_read: 57 | break 58 | 59 | # Concatenate batches 60 | all_feature_vectors = np.concatenate(all_feature_vectors, axis=0) 61 | 62 | # Split into is-valid and feature vectors 63 | is_valid = all_feature_vectors[:, 0] 64 | feature_vectors = all_feature_vectors[:, 1:] 65 | 66 | return is_valid, feature_vectors 67 | 68 | 69 | ## for OPENFACE 70 | def read_csv(filename, startIdx): 71 | data = pd.read_csv(filename) 72 | all_feature_vectors = [] 73 | for index in data.index: 74 | features = np.array(data.iloc[index][startIdx:]) 75 | all_feature_vectors.append(features) 76 | all_feature_vectors = np.array(all_feature_vectors) 77 | return all_feature_vectors 78 | 79 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/dataloader/mm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import DataLoader 3 | from sklearn.metrics import f1_score, accuracy_score 4 | 5 | from ..globals import * 6 | from toolkit.data import get_datasets 7 | 8 | class mm: 9 | 10 | def __init__(self, args): 11 | self.args = args 12 | self.debug = args.debug 13 | self.batch_size = args.batch_size 14 | self.num_workers = args.num_workers 15 | self.label_path = 
config.PATH_TO_LABEL[args.dataset] 16 | 17 | self.dataset = args.dataset 18 | assert self.dataset in ['mm4','mm7'] 19 | 20 | # update args 21 | if self.dataset == 'mm4': 22 | args.output_dim1 = 4 23 | args.output_dim2 = 0 24 | args.metric_name = 'emo' 25 | elif self.dataset == 'mm7': 26 | args.output_dim1 = 7 27 | args.output_dim2 = 0 28 | args.metric_name = 'emo' 29 | 30 | def get_loaders(self): 31 | dataloaders = [] 32 | for data_type in ['train', 'val', 'test']: 33 | names, labels = self.read_names_labels(self.label_path, data_type, debug=self.debug) 34 | print (f'{data_type}: sample number {len(names)}') 35 | dataset = get_datasets(self.args, names, labels) 36 | 37 | if data_type in ['train']: 38 | dataloader = DataLoader(dataset, 39 | batch_size=self.batch_size, 40 | num_workers=self.num_workers, 41 | collate_fn=dataset.collater, 42 | pin_memory=True) 43 | else: 44 | dataloader = DataLoader(dataset, 45 | batch_size=self.batch_size, 46 | num_workers=self.num_workers, 47 | collate_fn=dataset.collater, 48 | shuffle=False, 49 | pin_memory=True) 50 | dataloaders.append(dataloader) 51 | train_loaders = [dataloaders[0]] 52 | eval_loaders = [dataloaders[1]] 53 | test_loaders = [dataloaders[2]] 54 | 55 | return train_loaders, eval_loaders, test_loaders 56 | 57 | 58 | def read_names_labels(self, label_path, data_type, debug=False): 59 | names, labels = [], [] 60 | if data_type == 'train': corpus = np.load(label_path, allow_pickle=True)['train_corpus'].tolist() 61 | if data_type == 'val': corpus = np.load(label_path, allow_pickle=True)['val_corpus'].tolist() 62 | if data_type == 'test': corpus = np.load(label_path, allow_pickle=True)['test_corpus'].tolist() 63 | for name in corpus: 64 | names.append(name) 65 | labels.append(corpus[name]) 66 | # for debug 67 | if debug: 68 | names = names[:100] 69 | labels = labels[:100] 70 | return names, labels 71 | 72 | 73 | # MELD 测试 7-emo classification performance 74 | def calculate_results(self, emo_probs=[], emo_labels=[], val_preds=[], val_labels=[]): 75 | 76 | emo_preds = np.argmax(emo_probs, 1) 77 | emo_accuracy = accuracy_score(emo_labels, emo_preds) 78 | emo_fscore = f1_score(emo_labels, emo_preds, average='weighted') 79 | 80 | results = { 81 | 'emoprobs': emo_probs, 82 | 'emolabels': emo_labels, 83 | 'emoacc': emo_accuracy, 84 | 'emofscore': emo_fscore 85 | } 86 | outputs = f'f1:{emo_fscore:.4f}_acc:{emo_accuracy:.4f}' 87 | 88 | return results, outputs 89 | 90 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/model/vgg_m_face_bn_fer_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Vgg_m_face_bn_fer_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Vgg_m_face_bn_fer_dag, self).__init__() 11 | self.meta = {'mean': [131.45376586914062, 103.98748016357422, 91.46234893798828], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [224, 224, 3]} 14 | self.conv1 = nn.Conv2d(3, 96, kernel_size=[7, 7], stride=(2, 2)) 15 | self.bn49 = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 16 | self.relu1 = nn.ReLU() 17 | self.pool1 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 18 | self.conv2 = nn.Conv2d(96, 256, kernel_size=[5, 5], stride=(2, 2), padding=(1, 1)) 19 | self.bn50 = nn.BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 20 | self.relu2 
= nn.ReLU() 21 | self.pool2 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=(0, 0), dilation=1, ceil_mode=True) 22 | self.conv3 = nn.Conv2d(256, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 23 | self.bn51 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 24 | self.relu3 = nn.ReLU() 25 | self.conv4 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 26 | self.bn52 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 27 | self.relu4 = nn.ReLU() 28 | self.conv5 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 29 | self.bn53 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 30 | self.relu5 = nn.ReLU() 31 | self.pool5 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 32 | self.fc6 = nn.Conv2d(512, 4096, kernel_size=[6, 6], stride=(1, 1)) 33 | self.bn54 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | self.relu6 = nn.ReLU() 35 | self.fc7 = nn.Conv2d(4096, 4096, kernel_size=[1, 1], stride=(1, 1)) 36 | self.bn55 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | self.relu7 = nn.ReLU() 38 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 39 | 40 | def forward(self, data): 41 | x1 = self.conv1(data) 42 | x2 = self.bn49(x1) 43 | x3 = self.relu1(x2) 44 | x4 = self.pool1(x3) 45 | x5 = self.conv2(x4) 46 | x6 = self.bn50(x5) 47 | x7 = self.relu2(x6) 48 | x8 = self.pool2(x7) 49 | x9 = self.conv3(x8) 50 | x10 = self.bn51(x9) 51 | x11 = self.relu3(x10) 52 | x12 = self.conv4(x11) 53 | x13 = self.bn52(x12) 54 | x14 = self.relu4(x13) 55 | x15 = self.conv5(x14) 56 | x16 = self.bn53(x15) 57 | x17 = self.relu5(x16) 58 | x18 = self.pool5(x17) 59 | x19 = self.fc6(x18) 60 | x20 = self.bn54(x19) 61 | x21 = self.relu6(x20) 62 | x22 = self.fc7(x21) 63 | x23 = self.bn55(x22) 64 | x24_preflatten = self.relu7(x23) 65 | x24 = x24_preflatten.view(x24_preflatten.size(0), -1) 66 | prediction = self.fc8(x24) 67 | return prediction 68 | 69 | def vgg_m_face_bn_fer_dag(weights_path=None, **kwargs): 70 | """ 71 | load imported model instance 72 | 73 | Args: 74 | weights_path (str): If set, loads model weights from the given path 75 | """ 76 | model = Vgg_m_face_bn_fer_dag() 77 | if weights_path: 78 | state_dict = torch.load(weights_path) 79 | model.load_state_dict(state_dict) 80 | return model -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/model/alexnet_face_fer_bn_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Alexnet_face_fer_bn_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Alexnet_face_fer_bn_dag, self).__init__() 11 | self.meta = {'mean': [131.09375, 103.88607788085938, 91.47599792480469], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [227, 227, 3]} 14 | self.conv1 = nn.Conv2d(3, 96, kernel_size=[11, 11], stride=(4, 4)) 15 | self.bn1 = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 16 | self.relu1 = nn.ReLU() 17 | self.pool1 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 18 | self.conv2 = nn.Conv2d(96, 256, kernel_size=[5, 5], stride=(1, 1), padding=(2, 2), groups=2) 19 | self.bn2 = nn.BatchNorm2d(256, eps=1e-05, 
momentum=0.1, affine=True, track_running_stats=True) 20 | self.relu2 = nn.ReLU() 21 | self.pool2 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 22 | self.conv3 = nn.Conv2d(256, 384, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 23 | self.bn3 = nn.BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 24 | self.relu3 = nn.ReLU() 25 | self.conv4 = nn.Conv2d(384, 384, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1), groups=2) 26 | self.bn4 = nn.BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 27 | self.relu4 = nn.ReLU() 28 | self.conv5 = nn.Conv2d(384, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1), groups=2) 29 | self.bn5 = nn.BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 30 | self.relu5 = nn.ReLU() 31 | self.pool5 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 32 | self.fc6 = nn.Conv2d(256, 4096, kernel_size=[6, 6], stride=(1, 1)) 33 | self.bn6 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | self.relu6 = nn.ReLU() 35 | self.fc7 = nn.Conv2d(4096, 4096, kernel_size=[1, 1], stride=(1, 1)) 36 | self.bn7 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | self.relu7 = nn.ReLU() 38 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 39 | 40 | def forward(self, data): 41 | x1 = self.conv1(data) 42 | x2 = self.bn1(x1) 43 | x3 = self.relu1(x2) 44 | x4 = self.pool1(x3) 45 | x5 = self.conv2(x4) 46 | x6 = self.bn2(x5) 47 | x7 = self.relu2(x6) 48 | x8 = self.pool2(x7) 49 | x9 = self.conv3(x8) 50 | x10 = self.bn3(x9) 51 | x11 = self.relu3(x10) 52 | x12 = self.conv4(x11) 53 | x13 = self.bn4(x12) 54 | x14 = self.relu4(x13) 55 | x15 = self.conv5(x14) 56 | x16 = self.bn5(x15) 57 | x17 = self.relu5(x16) 58 | x18 = self.pool5(x17) 59 | x19 = self.fc6(x18) 60 | x20 = self.bn6(x19) 61 | x21 = self.relu6(x20) 62 | x22 = self.fc7(x21) 63 | x23 = self.bn7(x22) 64 | x24_preflatten = self.relu7(x23) 65 | x24 = x24_preflatten.view(x24_preflatten.size(0), -1) 66 | prediction = self.fc8(x24) 67 | return prediction 68 | 69 | def alexnet_face_fer_bn_dag(weights_path=None, **kwargs): 70 | """ 71 | load imported model instance 72 | 73 | Args: 74 | weights_path (str): If set, loads model weights from the given path 75 | """ 76 | model = Alexnet_face_fer_bn_dag() 77 | if weights_path: 78 | state_dict = torch.load(weights_path) 79 | model.load_state_dict(state_dict) 80 | return model -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/tfn.py: -------------------------------------------------------------------------------- 1 | """ 2 | paper: Tensor Fusion Network for Multimodal Sentiment Analysis 3 | From: https://github.com/A2Zadeh/TensorFusionNetwork 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from .modules.encoder import MLPEncoder, LSTMEncoder 10 | 11 | class TFN(nn.Module): 12 | 13 | def __init__(self, args): 14 | 15 | super(TFN, self).__init__() 16 | 17 | text_dim = args.text_dim 18 | audio_dim = args.audio_dim 19 | video_dim = args.video_dim 20 | output_dim1 = args.output_dim1 21 | output_dim2 = args.output_dim2 22 | dropout= args.dropout 23 | self.hidden_dim = args.hidden_dim 24 | self.grad_clip = args.grad_clip 25 | 26 | # define the pre-fusion subnetworks [感觉输入的audio/video是句子级别,但是 text是词级别信息] 27 | if args.feat_type in 
['utt']: 28 | self.audio_encoder = MLPEncoder(audio_dim, self.hidden_dim, dropout) 29 | self.text_encoder = MLPEncoder(text_dim, self.hidden_dim, dropout) 30 | self.video_encoder = MLPEncoder(video_dim, self.hidden_dim, dropout) 31 | elif args.feat_type in ['frm_align', 'frm_unalign']: 32 | self.audio_encoder = LSTMEncoder(audio_dim, self.hidden_dim, dropout) 33 | self.text_encoder = LSTMEncoder(text_dim, self.hidden_dim, dropout) 34 | self.video_encoder = LSTMEncoder(video_dim, self.hidden_dim, dropout) 35 | 36 | # define the post_fusion layers 37 | self.post_fusion_dropout = nn.Dropout(p=dropout) 38 | self.post_fusion_layer_1 = nn.Linear((self.hidden_dim + 1) * (self.hidden_dim + 1) * (self.hidden_dim + 1), self.hidden_dim) 39 | self.post_fusion_layer_2 = nn.Linear(self.hidden_dim, self.hidden_dim) 40 | 41 | self.fc_out_1 = nn.Linear(self.hidden_dim, output_dim1) 42 | self.fc_out_2 = nn.Linear(self.hidden_dim, output_dim2) 43 | 44 | 45 | # audio/video是句子级别, text的word level 46 | def forward(self, batch): 47 | ''' 48 | Args: 49 | audio_x: tensor of shape (batch_size, audio_dim) 50 | video_x: tensor of shape (batch_size, video_dim) 51 | text_x: tensor of shape (batch_size, text_dim ) 52 | ''' 53 | 54 | audio_h = self.audio_encoder(batch['audios']) 55 | text_h = self.text_encoder(batch['texts']) 56 | video_h = self.video_encoder(batch['videos']) 57 | batch_size = audio_h.data.shape[0] 58 | 59 | # next we perform "tensor fusion", which is essentially appending 1s to the tensors and take Kronecker product 60 | add_one = torch.ones(size=[batch_size, 1], requires_grad=False).type_as(audio_h).to(audio_h.device) 61 | _audio_h = torch.cat((add_one, audio_h), dim=1) 62 | _video_h = torch.cat((add_one, video_h), dim=1) 63 | _text_h = torch.cat((add_one, text_h), dim=1) 64 | 65 | # outer product 66 | fusion_tensor = torch.bmm(_audio_h.unsqueeze(2), _video_h.unsqueeze(1)) 67 | 68 | # next we do kronecker product between fusion_tensor and _text_h. This is even trickier 69 | # we have to reshape the fusion tensor during the computation 70 | # in the end we don't keep the 3-D tensor, instead we flatten it 71 | fusion_tensor = fusion_tensor.view(-1, (self.hidden_dim + 1) * (self.hidden_dim + 1), 1) 72 | fusion_tensor = torch.bmm(fusion_tensor, _text_h.unsqueeze(1)).view(batch_size, -1) 73 | 74 | post_fusion_dropped = self.post_fusion_dropout(fusion_tensor) 75 | post_fusion_y_1 = F.relu(self.post_fusion_layer_1(post_fusion_dropped), inplace=True) 76 | features = F.relu(self.post_fusion_layer_2(post_fusion_y_1), inplace=True) 77 | 78 | emos_out = self.fc_out_1(features) 79 | vals_out = self.fc_out_2(features) 80 | interloss = torch.tensor(0).cuda() 81 | 82 | return features, emos_out, vals_out, interloss 83 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/position_embedding.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | # Code adapted from the fairseq repo. 6 | 7 | def make_positions(tensor, padding_idx, left_pad): 8 | """Replace non-padding symbols with their position numbers. 9 | Position numbers begin at padding_idx+1. 10 | Padding symbols are ignored, but it is necessary to specify whether padding 11 | is added on the left side (left_pad=True) or right side (left_pad=False). 
12 | """ 13 | max_pos = padding_idx + 1 + tensor.size(1) 14 | device = tensor.get_device() 15 | buf_name = f'range_buf_{device}' 16 | if not hasattr(make_positions, buf_name): 17 | setattr(make_positions, buf_name, tensor.new()) 18 | setattr(make_positions, buf_name, getattr(make_positions, buf_name).type_as(tensor)) 19 | if getattr(make_positions, buf_name).numel() < max_pos: 20 | torch.arange(padding_idx + 1, max_pos, out=getattr(make_positions, buf_name)) 21 | mask = tensor.ne(padding_idx) 22 | positions = getattr(make_positions, buf_name)[:tensor.size(1)].expand_as(tensor) 23 | if left_pad: 24 | positions = positions - mask.size(1) + mask.long().sum(dim=1).unsqueeze(1) 25 | new_tensor = tensor.clone() 26 | return new_tensor.masked_scatter_(mask, positions[mask]).long() 27 | 28 | 29 | class SinusoidalPositionalEmbedding(nn.Module): 30 | """This module produces sinusoidal positional embeddings of any length. 31 | Padding symbols are ignored, but it is necessary to specify whether padding 32 | is added on the left side (left_pad=True) or right side (left_pad=False). 33 | """ 34 | 35 | def __init__(self, embedding_dim, padding_idx=0, left_pad=0, init_size=128): 36 | super().__init__() 37 | self.embedding_dim = embedding_dim 38 | self.padding_idx = padding_idx 39 | self.left_pad = left_pad 40 | self.weights = dict() # device --> actual weight; due to nn.DataParallel :-( 41 | self.register_buffer('_float_tensor', torch.FloatTensor(1)) 42 | 43 | @staticmethod 44 | def get_embedding(num_embeddings, embedding_dim, padding_idx=None): 45 | """Build sinusoidal embeddings. 46 | This matches the implementation in tensor2tensor, but differs slightly 47 | from the description in Section 3.5 of "Attention Is All You Need". 48 | """ 49 | half_dim = embedding_dim // 2 50 | emb = math.log(10000) / (half_dim - 1) 51 | emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) 52 | emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) 53 | emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) 54 | if embedding_dim % 2 == 1: 55 | # zero pad 56 | emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) 57 | if padding_idx is not None: 58 | emb[padding_idx, :] = 0 59 | return emb 60 | 61 | def forward(self, input): 62 | """Input is expected to be of size [bsz x seqlen].""" 63 | bsz, seq_len = input.size() 64 | max_pos = self.padding_idx + 1 + seq_len 65 | device = input.get_device() 66 | if device not in self.weights or max_pos > self.weights[device].size(0): 67 | # recompute/expand embeddings if needed 68 | self.weights[device] = SinusoidalPositionalEmbedding.get_embedding( 69 | max_pos, 70 | self.embedding_dim, 71 | self.padding_idx, 72 | ) 73 | self.weights[device] = self.weights[device].type_as(self._float_tensor).to(input.device) 74 | positions = make_positions(input, self.padding_idx, self.left_pad) 75 | return self.weights[device].index_select(0, positions.contiguous().view(-1)).view(bsz, seq_len, -1).detach() 76 | 77 | def max_positions(self): 78 | """Maximum number of supported positions.""" 79 | return int(1e5) # an arbitrary large number -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/extract_vggish_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | """ 3 | VGGish: https://arxiv.org/abs/1609.09430 4 | official github repo: 
https://github.com/tensorflow/models/tree/master/research/audioset/vggish 5 | """ 6 | 7 | import os 8 | import glob 9 | import time 10 | import argparse 11 | import numpy as np 12 | 13 | from vggish import vggish_input 14 | from vggish import vggish_params 15 | from vggish import vggish_slim 16 | import tensorflow.compat.v1 as tf # version: 1.15.0 (gpu) 17 | tf.disable_v2_behavior() 18 | 19 | # import config 20 | import sys 21 | sys.path.append('../../') 22 | import config 23 | 24 | def extract(audio_files, save_dir, feature_level, batch_size=2048): 25 | start_time = time.time() 26 | 27 | if feature_level == 'FRAME': label_interval = 50.0 28 | if feature_level == 'UTTERANCE': label_interval = 500.0 29 | 30 | with tf.Graph().as_default(), tf.Session() as sess: 31 | vggish_slim.define_vggish_slim(training=False) 32 | model_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, f'vggish/vggish_model.ckpt') 33 | vggish_slim.load_vggish_slim_checkpoint(sess, model_file) 34 | features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME) # get one layer 35 | embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME) # get one layer 36 | 37 | for i, audio_file in enumerate(audio_files, 1): 38 | print(f'Processing "{os.path.basename(audio_file)}" ({i}/{len(audio_files)})...') 39 | vid = os.path.basename(audio_file)[:-4] 40 | samples = vggish_input.wavfile_to_examples(audio_file, label_interval / 1000.0) # (segment_num, height(96), width(64)) 41 | sample_size = samples.shape[0] 42 | 43 | # model inference (max sample size: 6653, will cause OOM. Need to chunk samples.) 44 | embeddings = [] 45 | num_batches = int(np.ceil(sample_size / batch_size)) 46 | for i in range(num_batches): 47 | examples_batch = samples[i*batch_size:min((i+1)*batch_size, sample_size)] 48 | [embedding_batch] = sess.run([embedding_tensor], 49 | feed_dict={features_tensor: examples_batch}) 50 | embeddings.append(embedding_batch) 51 | embeddings = np.row_stack(embeddings) # (segment_num, featdim=128) 52 | 53 | # save feature 54 | csv_file = os.path.join(save_dir, f'{vid}.npy') 55 | if feature_level == 'UTTERANCE': 56 | embeddings = np.array(embeddings).squeeze() 57 | if len(embeddings.shape) != 1: 58 | embeddings = np.mean(embeddings, axis=0) # (featdim=128) 59 | np.save(csv_file, embeddings) 60 | else: 61 | np.save(csv_file, embeddings) 62 | 63 | end_time = time.time() 64 | print(f'Total time used: {end_time - start_time:.1f}s.') 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = argparse.ArgumentParser(description='Run.') 69 | parser.add_argument('--gpu', type=int, default=0, help='index of gpu') 70 | parser.add_argument('--feature_level', type=str, default='FRAME', help='feature_level: FRAME or UTTERANCE') 71 | parser.add_argument('--dataset', type=str, default='MER2023', help='input dataset') 72 | args = parser.parse_args() 73 | os.environ["CUDA_VISIBLE_DEVICES"] = f'{args.gpu}' 74 | 75 | audio_dir = config.PATH_TO_RAW_AUDIO[args.dataset] 76 | save_dir = config.PATH_TO_FEATURES[args.dataset] 77 | 78 | # in: get audios 79 | audio_files = glob.glob(os.path.join(audio_dir, '*.wav')) 80 | print(f'Find total "{len(audio_files)}" audio files.') 81 | 82 | # out: check dir 83 | dir_name = f'vggish_{args.feature_level[:3]}' 84 | save_dir = os.path.join(save_dir, dir_name) 85 | if not os.path.exists(save_dir): os.makedirs(save_dir) 86 | 87 | # extract features 88 | extract(audio_files, save_dir, args.feature_level) 89 | 
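The script above writes one `.npy` file per audio clip into `vggish_FRA/` or `vggish_UTT/` under the dataset's feature directory: frame-level features have shape `(segment_num, 128)`, while utterance-level features are mean-pooled to a single 128-dimensional vector. As a quick sanity check of the dumped features, a minimal sketch along the following lines can be used (the `feat_dir` path below is a placeholder, not part of the repository):

```python
import os
import glob
import numpy as np

# Placeholder path: point this at the output of extract_vggish_embedding.py,
# e.g. <PATH_TO_FEATURES[dataset]>/vggish_UTT or .../vggish_FRA.
feat_dir = './features/vggish_UTT'

for npy_path in sorted(glob.glob(os.path.join(feat_dir, '*.npy'))):
    feat = np.load(npy_path)
    if feat.ndim == 1:
        # UTTERANCE level: a single 128-d VGGish embedding per clip.
        assert feat.shape == (128,), f'unexpected shape {feat.shape}'
    else:
        # FRAME level: one 128-d embedding per 0.96 s mel-spectrogram example.
        assert feat.ndim == 2 and feat.shape[1] == 128, f'unexpected shape {feat.shape}'
    print(os.path.basename(npy_path), feat.shape)
```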
-------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/extract_imagenet_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torchvision 9 | from torchvision import transforms 10 | 11 | # import config 12 | import sys 13 | sys.path.append('../../') 14 | import config 15 | from dataset import FaceDataset 16 | 17 | 18 | def extract(data_loader, model): 19 | model.eval() 20 | with torch.no_grad(): 21 | features, timestamps = [], [] 22 | for images, names in data_loader: 23 | images = images.cuda() 24 | embedding = model(images) 25 | embedding = embedding.squeeze() # [32, 512, 1, 1] => [32, 512] 26 | features.append(embedding.cpu().detach().numpy()) 27 | timestamps.extend(names) 28 | features, timestamps = np.row_stack(features), np.array(timestamps) 29 | return features, timestamps 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Run.') 34 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset') 35 | parser.add_argument('--feature_level', type=str, default='UTTERANCE', help='feature level [FRAME or UTTERANCE]') 36 | parser.add_argument('--gpu', type=str, default='1', help='gpu id') 37 | params = parser.parse_args() 38 | os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu 39 | 40 | print('==> Extracting imagenet embedding...') 41 | face_dir = config.PATH_TO_RAW_FACE[params.dataset] 42 | save_dir = os.path.join(config.PATH_TO_FEATURES[params.dataset], f'imagenet_{params.feature_level[:3]}') 43 | if not os.path.exists(save_dir): os.makedirs(save_dir) 44 | 45 | # load model 46 | model = torchvision.models.resnet18(True) 47 | model = model.cuda() 48 | model = nn.Sequential(*list(model.children())[:-1]) 49 | 50 | # transform 51 | transform = transforms.Compose([transforms.Resize((224, 224)), 52 | transforms.ToTensor(), 53 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 54 | 55 | # extract embedding video by video 56 | vids = os.listdir(face_dir) 57 | EMBEDDING_DIM = -1 58 | print(f'Find total "{len(vids)}" videos.') 59 | for i, vid in enumerate(vids, 1): 60 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 61 | 62 | # forward 63 | dataset = FaceDataset(vid, face_dir, transform=transform) 64 | if len(dataset) == 0: 65 | print("Warning: number of frames of video {} should not be zero.".format(vid)) 66 | embeddings, framenames = [], [] 67 | else: 68 | data_loader = torch.utils.data.DataLoader(dataset, 69 | batch_size=32, 70 | num_workers=4, 71 | pin_memory=True) 72 | embeddings, framenames = extract(data_loader, model) 73 | 74 | # save results 75 | indexes = np.argsort(framenames) 76 | embeddings = embeddings[indexes] 77 | framenames = framenames[indexes] 78 | EMBEDDING_DIM = max(EMBEDDING_DIM, np.shape(embeddings)[-1]) 79 | 80 | csv_file = os.path.join(save_dir, f'{vid}.npy') 81 | if params.feature_level == 'FRAME': 82 | embeddings = np.array(embeddings).squeeze() 83 | if len(embeddings) == 0: 84 | embeddings = np.zeros((1, EMBEDDING_DIM)) 85 | elif len(embeddings.shape) == 1: 86 | embeddings = embeddings[np.newaxis, :] 87 | np.save(csv_file, embeddings) 88 | else: 89 | embeddings = np.array(embeddings).squeeze() 90 | if len(embeddings) == 0: 91 | embeddings = np.zeros((EMBEDDING_DIM, )) 92 | elif len(embeddings.shape) == 2: 93 | embeddings = 
np.mean(embeddings, axis=0) 94 | np.save(csv_file, embeddings) 95 | 96 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/extract_wav2vec_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | """ 3 | wav2vec: https://arxiv.org/abs/1904.05862 4 | official github repo: https://github.com/pytorch/fairseq/tree/master/examples/wav2vec 5 | """ 6 | import os 7 | import time 8 | import glob 9 | import torch 10 | import argparse 11 | import numpy as np 12 | import soundfile as sf 13 | from fairseq.models.wav2vec import Wav2VecModel # Note: use fairseq version of 0.10.1, error occurred when using the newest officical script and version of 0.10.2 (pip install fairseq==0.10.1) 14 | 15 | # import config 16 | import sys 17 | sys.path.append('../../') 18 | import config 19 | 20 | def write_feature_to_npy(feature, feature_level, save_path): 21 | if feature_level == 'UTTERANCE': 22 | feature = np.array(feature).squeeze() # [C,] 23 | if len(feature.shape) != 1: # change [T, C] => [C,] 24 | feature = np.mean(feature, axis=0) 25 | np.save(save_path, feature) 26 | else: 27 | np.save(save_path, feature) 28 | 29 | def extract(audio_files, feature_level, model, save_dir, gpu=None): 30 | start_time = time.time() 31 | device = torch.device(f'cuda:{gpu}') 32 | 33 | # create folders [save two features in 'wav2vec-large'] 34 | dir_name = 'wav2vec-large' 35 | out_dir_z = os.path.join(save_dir, f'{dir_name}-z-{feature_level[:3]}') # features output by feature encoder 36 | out_dir_c = os.path.join(save_dir, f'{dir_name}-c-{feature_level[:3]}') # features output by context network 37 | if not os.path.exists(out_dir_z): os.makedirs(out_dir_z) 38 | if not os.path.exists(out_dir_c): os.makedirs(out_dir_c) 39 | 40 | # iterate audios 41 | for idx, wav_file in enumerate(audio_files, 1): 42 | file_name = os.path.basename(wav_file) 43 | vid = file_name[:-4] 44 | print(f'Processing "{file_name}" ({idx}/{len(audio_files)})...') 45 | # load audio 46 | audio, sampling_rate = sf.read(wav_file) 47 | audio = audio.astype('float32')[np.newaxis, :] 48 | audio = torch.from_numpy(audio) 49 | audio = audio.to(device) 50 | assert sampling_rate == 16000, f'Error: sampling rate ({sampling_rate}) != 16k!' 
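        # NOTE (assumption): the wav2vec_large checkpoint expects 16 kHz mono input, hence the
        # assert above; audios recorded at other sampling rates would need to be resampled
        # beforehand (e.g. with ffmpeg or torchaudio), since this script only rejects them.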
51 | with torch.no_grad(): 52 | z = model.feature_extractor(audio) # (1, C, T), stride: 10ms (100Hz), receptive field: 30ms 53 | c = model.feature_aggregator(z) # (1, C, T), stride: 10ms (100Hz), receptive field: 801ms (for large version) 54 | 55 | # save 56 | z_feature = z.detach().squeeze().t().cpu().numpy() 57 | c_feature = c.detach().squeeze().t().cpu().numpy() 58 | z_npy_file = os.path.join(out_dir_z, f'{vid}.npy') 59 | c_npy_file = os.path.join(out_dir_c, f'{vid}.npy') 60 | write_feature_to_npy(z_feature, feature_level, z_npy_file) 61 | write_feature_to_npy(c_feature, feature_level, c_npy_file) 62 | 63 | end_time = time.time() 64 | print(f'Total time used: {end_time - start_time:.1f}s.') 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser(description='Run.') 68 | parser.add_argument('--gpu', type=int, default=0, help='index of gpu') 69 | parser.add_argument('--feature_level', type=str, default='FRAME', help='name of feature level, FRAME or UTTERANCE') 70 | parser.add_argument('--dataset', type=str, default='MER2023', help='dataset') 71 | args = parser.parse_args() 72 | 73 | # gain paths 74 | audio_dir = config.PATH_TO_RAW_AUDIO[args.dataset] 75 | save_dir = config.PATH_TO_FEATURES[args.dataset] 76 | audio_files = glob.glob(os.path.join(audio_dir, '*.wav')) 77 | print(f'Find total "{len(audio_files)}" audio files.') 78 | 79 | # load model 80 | device = torch.device(f'cuda:{args.gpu}') 81 | model_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, f'wav2vec/wav2vec_large.pt') 82 | cp = torch.load(model_file) 83 | model = Wav2VecModel.build_model(cp['args'], task=None) 84 | model.load_state_dict(cp['model']) 85 | model.to(device) 86 | model.eval() 87 | 88 | # extract features 89 | extract(audio_files, args.feature_level, model, save_dir, args.gpu) 90 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/extract_emonet_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn.parallel 8 | import torch.optim 9 | import torch.utils.data 10 | import torch.utils.data.distributed 11 | import torchvision.transforms as transforms 12 | 13 | from emonet.models.emonet import EmoNet 14 | from dataset import FaceDatasetForEmoNet 15 | from emonet.data_augmentation import DataAugmentor 16 | 17 | # import config 18 | import sys 19 | sys.path.append('../../') 20 | import config 21 | 22 | def extract(data_loader, model): 23 | model.eval() 24 | with torch.no_grad(): 25 | features, timestamps = [], [] 26 | for images, names in data_loader: 27 | images = images.cuda() 28 | embedding = model(images, return_embedding=True) 29 | features.append(embedding.cpu().detach().numpy()) 30 | timestamps.extend(names) 31 | features, timestamps = np.row_stack(features), np.array(timestamps) 32 | return features, timestamps 33 | 34 | if __name__ == '__main__': 35 | parser = argparse.ArgumentParser(description='Run.') 36 | parser.add_argument('--dataset', type=str, default='MER2023', help='input dataset') 37 | parser.add_argument('--feature_level', type=str, default='UTTERANCE', help='feature level [FRAME or UTTERANCE]') 38 | parser.add_argument('--gpu', type=str, default='0', help='gpu id') 39 | params = parser.parse_args() 40 | os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu 41 | 42 | print(f'==> Extracting emonet embedding...') 43 | face_dir = 
config.PATH_TO_RAW_FACE[params.dataset] 44 | save_dir = os.path.join(config.PATH_TO_FEATURES[params.dataset], f'emonet_{params.feature_level[:3]}') 45 | if not os.path.exists(save_dir): os.makedirs(save_dir) 46 | 47 | # load model 48 | model = EmoNet().cuda() 49 | checkpoint_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, 'emonet/emonet_8.pth') 50 | checkpoint = torch.load(checkpoint_file) 51 | pre_trained_dict = {k.replace('module.', ''): v for k,v in checkpoint.items()} 52 | model.load_state_dict(pre_trained_dict) 53 | 54 | # transform 55 | augmentor = DataAugmentor(256, 256) 56 | transform = transforms.Compose([transforms.ToTensor()]) 57 | 58 | # extract embedding video by video 59 | vids = os.listdir(face_dir) 60 | EMBEDDING_DIM = -1 61 | print(f'Find total "{len(vids)}" videos.') 62 | for i, vid in enumerate(vids, 1): 63 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 64 | # csv_file = os.path.join(save_dir, f'{vid}.npy') 65 | # if os.path.exists(csv_file): continue 66 | 67 | # forward 68 | dataset = FaceDatasetForEmoNet(vid, face_dir, transform=transform, augmentor=augmentor) 69 | if len(dataset) == 0: 70 | print("Warning: number of frames of video {} should not be zero.".format(vid)) 71 | embeddings, framenames = [], [] 72 | else: 73 | data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True) 74 | embeddings, framenames = extract(data_loader, model) 75 | 76 | # save results 77 | indexes = np.argsort(framenames) 78 | embeddings = embeddings[indexes] 79 | framenames = framenames[indexes] 80 | EMBEDDING_DIM = max(EMBEDDING_DIM, np.shape(embeddings)[-1]) 81 | 82 | csv_file = os.path.join(save_dir, f'{vid}.npy') 83 | if params.feature_level == 'FRAME': 84 | embeddings = np.array(embeddings).squeeze() 85 | if len(embeddings) == 0: 86 | embeddings = np.zeros((1, EMBEDDING_DIM)) 87 | elif len(embeddings.shape) == 1: 88 | embeddings = embeddings[np.newaxis, :] 89 | np.save(csv_file, embeddings) 90 | else: 91 | embeddings = np.array(embeddings).squeeze() 92 | if len(embeddings) == 0: 93 | embeddings = np.zeros((EMBEDDING_DIM, )) 94 | elif len(embeddings.shape) == 2: 95 | embeddings = np.mean(embeddings, axis=0) 96 | np.save(csv_file, embeddings) -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_postprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Post-process embeddings from VGGish.""" 17 | 18 | import numpy as np 19 | 20 | import vggish_params 21 | 22 | 23 | class Postprocessor(object): 24 | """Post-processes VGGish embeddings. 25 | 26 | The initial release of AudioSet included 128-D VGGish embeddings for each 27 | segment of AudioSet. 
These released embeddings were produced by applying 28 | a PCA transformation (technically, a whitening transform is included as well) 29 | and 8-bit quantization to the raw embedding output from VGGish, in order to 30 | stay compatible with the YouTube-8M project which provides visual embeddings 31 | in the same format for a large set of YouTube videos. This class implements 32 | the same PCA (with whitening) and quantization transformations. 33 | """ 34 | 35 | def __init__(self, pca_params_npz_path): 36 | """Constructs a postprocessor. 37 | 38 | Args: 39 | pca_params_npz_path: Path to a NumPy-format .npz file that 40 | contains the PCA parameters used in postprocessing. 41 | """ 42 | params = np.load(pca_params_npz_path) 43 | self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME] 44 | # Load means into a column vector for easier broadcasting later. 45 | self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1) 46 | assert self._pca_matrix.shape == ( 47 | vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), ( 48 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) 49 | assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), ( 50 | 'Bad PCA means shape: %r' % (self._pca_means.shape,)) 51 | 52 | def postprocess(self, embeddings_batch): 53 | """Applies postprocessing to a batch of embeddings. 54 | 55 | Args: 56 | embeddings_batch: An nparray of shape [batch_size, embedding_size] 57 | containing output from the embedding layer of VGGish. 58 | 59 | Returns: 60 | An nparray of the same shape as the input but of type uint8, 61 | containing the PCA-transformed and quantized version of the input. 62 | """ 63 | assert len(embeddings_batch.shape) == 2, ( 64 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) 65 | assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, ( 66 | 'Bad batch shape: %r' % (embeddings_batch.shape,)) 67 | 68 | # Apply PCA. 69 | # - Embeddings come in as [batch_size, embedding_size]. 70 | # - Transpose to [embedding_size, batch_size]. 71 | # - Subtract pca_means column vector from each column. 72 | # - Premultiply by PCA matrix of shape [output_dims, input_dims] 73 | # where both are are equal to embedding_size in our case. 74 | # - Transpose result back to [batch_size, embedding_size]. 75 | pca_applied = np.dot(self._pca_matrix, 76 | (embeddings_batch.T - self._pca_means)).T 77 | 78 | # Quantize by: 79 | # - clipping to [min, max] range 80 | clipped_embeddings = np.clip( 81 | pca_applied, vggish_params.QUANTIZE_MIN_VAL, 82 | vggish_params.QUANTIZE_MAX_VAL) 83 | # - convert to 8-bit in range [0.0, 255.0] 84 | quantized_embeddings = ( 85 | (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * 86 | (255.0 / 87 | (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) 88 | # - cast 8-bit float to uint8 89 | quantized_embeddings = quantized_embeddings.astype(np.uint8) 90 | 91 | return quantized_embeddings 92 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_smoke_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """A smoke test for VGGish. 17 | 18 | This is a simple smoke test of a local install of VGGish and its associated 19 | downloaded files. We create a synthetic sound, extract log mel spectrogram 20 | features, run them through VGGish, post-process the embedding ouputs, and 21 | check some simple statistics of the results, allowing for variations that 22 | might occur due to platform/version differences in the libraries we use. 23 | 24 | Usage: 25 | - Download the VGGish checkpoint and PCA parameters into the same directory as 26 | the VGGish source code. If you keep them elsewhere, update the checkpoint_path 27 | and pca_params_path variables below. 28 | - Run: 29 | $ python vggish_smoke_test.py 30 | """ 31 | 32 | from __future__ import print_function 33 | 34 | import numpy as np 35 | import tensorflow.compat.v1 as tf 36 | import os 37 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 38 | tf.disable_v2_behavior() 39 | 40 | import vggish_input 41 | import vggish_params 42 | import vggish_postprocess 43 | import vggish_slim 44 | 45 | print('\nTesting your install of VGGish\n') 46 | 47 | # Paths to downloaded VGGish files. 48 | checkpoint_path = 'vggish_model.ckpt' 49 | pca_params_path = 'vggish_pca_params.npz' 50 | 51 | # Relative tolerance of errors in mean and standard deviation of embeddings. 52 | rel_error = 0.1 # Up to 10% 53 | 54 | # Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate 55 | # to test resampling to 16 kHz during feature extraction). 56 | num_secs = 3 57 | freq = 1000 58 | sr = 44100 59 | t = np.linspace(0, num_secs, int(num_secs * sr)) 60 | x = np.sin(2 * np.pi * freq * t) 61 | 62 | # Produce a batch of log mel spectrogram examples. 63 | input_batch = vggish_input.waveform_to_examples(x, sr) 64 | print('Log Mel Spectrogram example: ', input_batch[0]) 65 | np.testing.assert_equal( 66 | input_batch.shape, 67 | [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS]) 68 | 69 | # Define VGGish, load the checkpoint, and run the batch through the model to 70 | # produce embeddings. 71 | with tf.Graph().as_default(), tf.Session() as sess: 72 | vggish_slim.define_vggish_slim() 73 | vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) 74 | 75 | features_tensor = sess.graph.get_tensor_by_name( 76 | vggish_params.INPUT_TENSOR_NAME) 77 | embedding_tensor = sess.graph.get_tensor_by_name( 78 | vggish_params.OUTPUT_TENSOR_NAME) 79 | [embedding_batch] = sess.run([embedding_tensor], 80 | feed_dict={features_tensor: input_batch}) 81 | print('VGGish embedding: ', embedding_batch[0]) 82 | expected_embedding_mean = 0.131 83 | expected_embedding_std = 0.238 84 | np.testing.assert_allclose( 85 | [np.mean(embedding_batch), np.std(embedding_batch)], 86 | [expected_embedding_mean, expected_embedding_std], 87 | rtol=rel_error) 88 | 89 | # Postprocess the results to produce whitened quantized embeddings. 
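# (The Postprocessor applies the released AudioSet PCA/whitening transform followed by
# 8-bit quantization, so the postprocessed embeddings below are uint8 values in [0, 255]
# rather than raw floats, which is why the expected mean/std are much larger here.)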
90 | pproc = vggish_postprocess.Postprocessor(pca_params_path) 91 | postprocessed_batch = pproc.postprocess(embedding_batch) 92 | print('Postprocessed VGGish embedding: ', postprocessed_batch[0]) 93 | expected_postprocessed_mean = 123.0 94 | expected_postprocessed_std = 75.0 95 | np.testing.assert_allclose( 96 | [np.mean(postprocessed_batch), np.std(postprocessed_batch)], 97 | [expected_postprocessed_mean, expected_postprocessed_std], 98 | rtol=rel_error) 99 | 100 | print('\nLooks Good To Me!\n') 101 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/cmumosi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pickle 4 | from toolkit.utils.chatgpt import * 5 | from toolkit.utils.functions import * 6 | from toolkit.utils.read_files import * 7 | 8 | 9 | def generate_transcription(label_path, save_path): 10 | ## read pkl file 11 | names, eng_sentences = [], [] 12 | videoIDs, _, _, videoSentences, _, _, _ = pickle.load(open(label_path, "rb"), encoding='latin1') 13 | for vid in videoIDs: 14 | names.extend(videoIDs[vid]) 15 | eng_sentences.extend(videoSentences[vid]) 16 | print (f'whole sample number: {len(names)}') 17 | 18 | # translate eng2chi 19 | chi_sentences = [] 20 | for eng in eng_sentences: 21 | # chi = get_translate_eng2chi(eng, model='gpt-3.5-turbo-16k-0613') 22 | chi = get_translate_eng2chi(eng, model='gpt-4-0613') 23 | chi_sentences.append(chi) 24 | 25 | ## write to csv file 26 | name2key = {} 27 | for ii, name in enumerate(names): 28 | name2key[name] = [chi_sentences[ii], eng_sentences[ii]] 29 | func_write_key_to_csv(save_path, names, name2key, ['chinese', 'english']) 30 | 31 | 32 | def read_train_val_test(label_path, data_type): 33 | names, labels = [], [] 34 | assert data_type in ['train', 'val', 'test'] 35 | videoIDs, videoLabels, _, _, trainVids, valVids, testVids = pickle.load(open(label_path, "rb"), encoding='latin1') 36 | if data_type == 'train': vids = trainVids 37 | if data_type == 'val': vids = valVids 38 | if data_type == 'test': vids = testVids 39 | for vid in vids: 40 | names.extend(videoIDs[vid]) 41 | labels.extend(videoLabels[vid]) 42 | return names, labels 43 | 44 | 45 | def normalize_dataset_format(data_root, save_root): 46 | # gain paths 47 | label_path = os.path.join(save_root, 'CMUMOSI_features_raw_2way.pkl') 48 | assert os.path.exists(label_path), f'must has a pre-processed label file' 49 | video_root = os.path.join(data_root, 'Video/Segmented') 50 | 51 | # gain (names, labels) 52 | train_names, train_labels = read_train_val_test(label_path, 'train') 53 | val_names, val_labels = read_train_val_test(label_path, 'val') 54 | test_names, test_labels = read_train_val_test(label_path, 'test') 55 | print (f'train number: {len(train_names)}') 56 | print (f'val number: {len(val_names)}') 57 | print (f'test number: {len(test_names)}') 58 | 59 | ## output path 60 | save_video = os.path.join(save_root, 'subvideo') 61 | save_label = os.path.join(save_root, 'label.npz') 62 | save_trans = os.path.join(save_root, 'transcription.csv') 63 | if not os.path.exists(save_root): os.makedirs(save_root) 64 | if not os.path.exists(save_video): os.makedirs(save_video) 65 | 66 | ## generate new transcripts 67 | generate_transcription(label_path, save_trans) 68 | 69 | ## generate label path 70 | whole_corpus = {} 71 | for name, videonames, labels in [('train', train_names, train_labels), 72 | ('val', val_names, val_labels ), 73 | ('test', test_names, 
test_labels )]: 74 | whole_corpus[name] = {} 75 | for ii, videoname in enumerate(videonames): 76 | whole_corpus[name][videoname] = {'emo': 0, 'val': labels[ii]} 77 | 78 | # move video 79 | video_path = os.path.join(video_root, videoname+'.mp4') 80 | save_path = os.path.join(save_video, videoname+'.mp4') 81 | shutil.copy(video_path, save_path) 82 | 83 | np.savez_compressed(save_label, 84 | train_corpus=whole_corpus['train'], 85 | val_corpus=whole_corpus['val'], 86 | test_corpus=whole_corpus['test']) 87 | 88 | 89 | if __name__ == '__main__': 90 | 91 | data_root = 'G:\\CMU-MOSI\\Raw' 92 | save_root = 'E:\\Dataset\\cmumosi-process' 93 | normalize_dataset_format(data_root, save_root) 94 | 95 | # data_root = 'H:\\desktop\\Multimedia-Transformer\\chinese-mer-2023\\dataset\\cmumosi-process' 96 | # trans_path = os.path.join(data_root, 'transcription.csv') 97 | # polish_path = os.path.join(data_root, 'transcription-engchi-polish.csv') 98 | # func_translate_transcript_polish_merge(trans_path, polish_path) # 再次检测一下遗漏的部分 99 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/meld.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from toolkit.utils.chatgpt import * 4 | from toolkit.utils.functions import * 5 | from toolkit.utils.read_files import * 6 | 7 | emos = ['anger', 'joy', 'sadness', 'neutral', 'disgust', 'fear', 'surprise'] 8 | emo2idx, idx2emo = {}, {} 9 | for ii, emo in enumerate(emos): 10 | emo2idx[emo] = ii 11 | idx2emo[ii] = emo 12 | 13 | 14 | def read_labels(label_path): 15 | 16 | dia_ids = func_read_key_from_csv(label_path, 'Dialogue_ID') 17 | utt_ids = func_read_key_from_csv(label_path, 'Utterance_ID') 18 | labels = func_read_key_from_csv(label_path, 'Emotion') 19 | engs = func_read_key_from_csv(label_path, 'Utterance') 20 | 21 | names = [] 22 | for ii in range(len(dia_ids)): 23 | names.append(f'dia{dia_ids[ii]}_utt{utt_ids[ii]}') 24 | 25 | labels = [emo2idx[label] for label in labels] 26 | 27 | return names, labels, engs 28 | 29 | 30 | def normalize_dataset_format(data_root, save_root): 31 | 32 | # gain paths 33 | train_label_path = os.path.join(data_root, 'train_sent_emo.csv') 34 | train_video_root = os.path.join(data_root, 'train') 35 | val_label_path = os.path.join(data_root, 'dev_sent_emo.csv') 36 | val_video_root = os.path.join(data_root, 'dev') 37 | test_label_path = os.path.join(data_root, 'test_sent_emo.csv') 38 | test_video_root = os.path.join(data_root, 'test') 39 | 40 | # gain (names, labels) 41 | train_names, train_labels, train_engs = read_labels(train_label_path) 42 | val_names, val_labels, val_engs = read_labels(val_label_path) 43 | test_names, test_labels, test_engs = read_labels(test_label_path) 44 | print (f'train number: {len(train_names)}') 45 | print (f'val number: {len(val_names)}') 46 | print (f'test number: {len(test_names)}') 47 | 48 | ## output path 49 | save_video = os.path.join(save_root, 'subvideo') 50 | save_label = os.path.join(save_root, 'label.npz') 51 | save_trans = os.path.join(save_root, 'transcription.csv') 52 | if not os.path.exists(save_root): os.makedirs(save_root) 53 | if not os.path.exists(save_video): os.makedirs(save_video) 54 | 55 | ## generate label path 56 | name2eng = {} 57 | whole_corpus = {} 58 | for datatype, names, labels, engs, video_root in [('train', train_names, train_labels, train_engs, train_video_root), 59 | ('val', val_names, val_labels, val_engs, val_video_root), 60 | ('test', test_names, 
test_labels, test_engs, test_video_root)]: 61 | whole_corpus[datatype] = {} 62 | for ii, name in enumerate(names): 63 | newname = f'{datatype}_{name}' 64 | whole_corpus[datatype][newname] = {'emo': labels[ii], 'val': -10} # save labels 65 | name2eng[newname] = engs[ii] # save trans 66 | 67 | # move video 68 | video_path = os.path.join(video_root, name+'.mp4') 69 | save_path = os.path.join(save_video, newname+'.mp4') 70 | if os.path.exists(save_path): continue 71 | try: 72 | shutil.copy(video_path, save_path) 73 | except: 74 | print (f'ERROR: {video_path} does not exist!') 75 | 76 | # save labels 77 | np.savez_compressed(save_label, 78 | train_corpus=whole_corpus['train'], 79 | val_corpus=whole_corpus['val'], 80 | test_corpus=whole_corpus['test']) 81 | 82 | # save trans 83 | names = [name for name in name2eng] 84 | name2key = {} 85 | for ii, name in enumerate(names): 86 | name2key[name] = [name2eng[name]] 87 | func_write_key_to_csv(save_trans, names, name2key, ['english']) 88 | 89 | 90 | if __name__ == '__main__': 91 | 92 | data_root = 'E:\\Dataset\\MELD' 93 | save_root = 'E:\\Dataset\\meld-process' 94 | normalize_dataset_format(data_root, save_root) 95 | 96 | # data_root = 'E:\\Dataset\\meld-process' 97 | # trans_path = os.path.join(data_root, 'transcription.csv') 98 | # polish_path = os.path.join(data_root, 'transcription-engchi-polish.csv') 99 | # func_translate_transcript_polish_merge(trans_path, polish_path) # 再次检测一下遗漏的部分 100 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_input.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Compute input examples for VGGish from audio waveform.""" 17 | 18 | import numpy as np 19 | import resampy # verison: 0.2.2, pip install resampy 20 | import math 21 | from vggish import mel_features 22 | from vggish import vggish_params 23 | 24 | try: 25 | import soundfile as sf 26 | 27 | def wav_read(wav_file): 28 | wav_data, sr = sf.read(wav_file, dtype='int16') 29 | return wav_data, sr 30 | 31 | except ImportError: 32 | 33 | def wav_read(wav_file): 34 | raise NotImplementedError('WAV file reading requires soundfile package.') 35 | 36 | 37 | def waveform_to_examples(data, sample_rate, hop_sec): 38 | """Converts audio waveform into an array of examples for VGGish. 39 | 40 | Args: 41 | data: np.array of either one dimension (mono) or two dimensions 42 | (multi-channel, with the outer dimension representing channels). 43 | Each sample is generally expected to lie in the range [-1.0, +1.0], 44 | although this is not required. 45 | sample_rate: Sample rate of data. 
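    hop_sec: Hop between successive examples, in seconds; this argument replaces the
      fixed vggish_params.EXAMPLE_HOP_SECONDS used in the upstream implementation.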
46 | 47 | Returns: 48 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents 49 | a sequence of examples, each of which contains a patch of log mel 50 | spectrogram, covering num_frames frames of audio and num_bands mel frequency 51 | bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. 52 | """ 53 | # Convert to mono. 54 | if len(data.shape) > 1: 55 | data = np.mean(data, axis=1) 56 | # Resample to the rate assumed by VGGish. 57 | if sample_rate != vggish_params.SAMPLE_RATE: 58 | data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) 59 | 60 | # Compute log mel spectrogram features. 61 | log_mel = mel_features.log_mel_spectrogram( 62 | data, 63 | audio_sample_rate=vggish_params.SAMPLE_RATE, 64 | log_offset=vggish_params.LOG_OFFSET, 65 | window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, 66 | hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, 67 | num_mel_bins=vggish_params.NUM_MEL_BINS, 68 | lower_edge_hertz=vggish_params.MEL_MIN_HZ, 69 | upper_edge_hertz=vggish_params.MEL_MAX_HZ) 70 | 71 | # Frame features into examples. 72 | features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS 73 | example_window_length = int(round( 74 | vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) 75 | example_hop_length = int(round( 76 | hop_sec * features_sample_rate)) 77 | # vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) # orginal 78 | log_mel_examples = mel_features.frame( 79 | log_mel, 80 | window_length=example_window_length, 81 | hop_length=example_hop_length) 82 | return log_mel_examples 83 | 84 | 85 | def wavfile_to_examples(wav_file, hop_sec): 86 | """Convenience wrapper around waveform_to_examples() for a common WAV format. 87 | 88 | Args: 89 | wav_file: String path to a file, or a file-like object. The file 90 | is assumed to contain WAV audio data with signed 16-bit PCM samples. 91 | 92 | Returns: 93 | See waveform_to_examples. 
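    Note: clips shorter than one second are tiled up to at least 1 s before feature
    extraction, and hop_sec is forwarded unchanged to waveform_to_examples().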
94 | """ 95 | wav_data, sr = wav_read(wav_file) 96 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 97 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] 98 | 99 | ### process for samples < 1000ms, pad to longer than 1000ms 100 | if len(samples) < sr: 101 | samples = samples.tolist() 102 | samples = samples * math.ceil(sr/len(samples)) 103 | samples = np.array(samples) 104 | 105 | return waveform_to_examples(samples, sr, hop_sec) 106 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/chatgpt.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import cv2 4 | import glob 5 | import base64 6 | import numpy as np 7 | 8 | import openai 9 | 10 | # avoid RPD errors 11 | global_index = 1 12 | candidate_keys = ["sk-xxxx", "sk-xxxx", "sk-xxxx"] # Please use your own APIs, we support multiple APIs 13 | openai.api_key = candidate_keys[global_index] 14 | 15 | # 单次调用 16 | def func_get_completion(prompt, model="gpt-3.5-turbo-16k-0613"): 17 | try: 18 | messages = [{"role": "user", "content": prompt}] 19 | response = openai.ChatCompletion.create( 20 | model=model, 21 | messages=messages, 22 | temperature=0, # this is the degree of randomness 23 | max_tokens=1000, 24 | ) 25 | return response['choices'][0]['message']['content'] 26 | except Exception as e: 27 | print ('发生错误:', e) # change key to avoid RPD 28 | global global_index # 修改全局变量 29 | global_index = (global_index + 1) % 3 30 | print (f'========== key index: {global_index} ==========') 31 | openai.api_key = candidate_keys[global_index] 32 | return '' 33 | 34 | # 多次调用,避免网络异常 35 | def get_completion(prompt, model, maxtry=5): 36 | response = '' 37 | try_number = 0 38 | while len(response) == 0: 39 | try_number += 1 40 | if try_number == maxtry: 41 | print (f'fail for {maxtry} times') 42 | break 43 | response = func_get_completion(prompt, model) 44 | return response 45 | 46 | # chatgpt输出结果后处理 47 | def func_postprocess_chatgpt(response): 48 | response = response.strip() 49 | if response.startswith("输入"): response = response[len("输入"):] 50 | if response.startswith("输出"): response = response[len("输出"):] 51 | if response.startswith("翻译"): response = response[len("翻译"):] 52 | if response.startswith("让我们来翻译一下:"): response = response[len("让我们来翻译一下:"):] 53 | if response.startswith("output"): response = response[len("output"):] 54 | if response.startswith("Output"): response = response[len("Output"):] 55 | response = response.strip() 56 | if response.startswith(":"): response = response[len(":"):] 57 | if response.startswith(":"): response = response[len(":"):] 58 | response = response.strip() 59 | response = response.replace('\n', '') # remove \n 60 | response = response.strip() 61 | return response 62 | 63 | 64 | # --------------------------------------------------------------------- 65 | ## convert image/video into GPT4 support version 66 | def func_image_to_base64(image_path, grey_flag=False): # support more types 67 | image = cv2.imread(image_path) 68 | if grey_flag: 69 | image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 70 | return func_opencv_to_base64(image) 71 | 72 | def func_opencv_to_base64(image): 73 | _, buffer = cv2.imencode('.jpg', image) 74 | base64_image = base64.b64encode(buffer).decode('utf-8') 75 | return base64_image 76 | 77 | # deal with text 78 | def func_nyp_to_text(npy_path): 79 | text = np.load(npy_path).tolist() 80 | text = text.strip() 81 | text = text.replace('\n', '') # remove \n 82 | text = 
text.replace('\t', '') # remove \t 83 | text = text.strip() 84 | return text 85 | 86 | # --------------------------------------------------------------------- 87 | ## Translation 88 | # --------------------------------------------------------------------- 89 | def get_translate_eng2chi(text, model='gpt-3.5-turbo-16k-0613'): 90 | if len(text) == 0: 91 | return "" 92 | 93 | prompt = f""" 94 | 请将以下输入翻译为中文: 95 | 96 | 输入:{text} 97 | 98 | 输出: 99 | """ 100 | response = get_completion(prompt, model) 101 | response = func_postprocess_chatgpt(response) 102 | print (text) 103 | print (response) 104 | return response 105 | 106 | 107 | def get_translate_chi2eng(text, model='gpt-3.5-turbo-16k-0613'): 108 | if len(text)==0: 109 | return "" 110 | 111 | prompt = f""" 112 | 请将以下输入翻译为英文: 113 | 114 | 输入:{text} 115 | 116 | 输出: 117 | """ 118 | response = get_completion(prompt, model) 119 | response = func_postprocess_chatgpt(response) 120 | print (text) 121 | print (response) 122 | return response 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | ## text input [test ok] 128 | text = 'The whether is sooooo good!!' 129 | get_translate_eng2chi(text, model='gpt-3.5-turbo-16k-0613') 130 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/chatgpt.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import cv2 4 | import glob 5 | import base64 6 | import numpy as np 7 | 8 | import openai 9 | 10 | # avoid RPD errors 11 | global_index = 1 12 | candidate_keys = ["sk-xxxx", "sk-xxxx", "sk-xxxx"] # Please use your own APIs, we support multiple APIs 13 | openai.api_key = candidate_keys[global_index] 14 | 15 | # 单次调用 16 | def func_get_completion(prompt, model="gpt-3.5-turbo-16k-0613"): 17 | try: 18 | messages = [{"role": "user", "content": prompt}] 19 | response = openai.ChatCompletion.create( 20 | model=model, 21 | messages=messages, 22 | temperature=0, # this is the degree of randomness 23 | max_tokens=1000, 24 | ) 25 | return response['choices'][0]['message']['content'] 26 | except Exception as e: 27 | print ('发生错误:', e) # change key to avoid RPD 28 | global global_index # 修改全局变量 29 | global_index = (global_index + 1) % 3 30 | print (f'========== key index: {global_index} ==========') 31 | openai.api_key = candidate_keys[global_index] 32 | return '' 33 | 34 | # 多次调用,避免网络异常 35 | def get_completion(prompt, model, maxtry=5): 36 | response = '' 37 | try_number = 0 38 | while len(response) == 0: 39 | try_number += 1 40 | if try_number == maxtry: 41 | print (f'fail for {maxtry} times') 42 | break 43 | response = func_get_completion(prompt, model) 44 | return response 45 | 46 | # chatgpt输出结果后处理 47 | def func_postprocess_chatgpt(response): 48 | response = response.strip() 49 | if response.startswith("输入"): response = response[len("输入"):] 50 | if response.startswith("输出"): response = response[len("输出"):] 51 | if response.startswith("翻译"): response = response[len("翻译"):] 52 | if response.startswith("让我们来翻译一下:"): response = response[len("让我们来翻译一下:"):] 53 | if response.startswith("output"): response = response[len("output"):] 54 | if response.startswith("Output"): response = response[len("Output"):] 55 | response = response.strip() 56 | if response.startswith(":"): response = response[len(":"):] 57 | if response.startswith(":"): response = response[len(":"):] 58 | response = response.strip() 59 | response = response.replace('\n', '') # remove \n 60 | response = response.strip() 61 | return response 62 | 63 
| 64 | # --------------------------------------------------------------------- 65 | ## convert image/video into GPT4 support version 66 | def func_image_to_base64(image_path, grey_flag=False): # support more types 67 | image = cv2.imread(image_path) 68 | if grey_flag: 69 | image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 70 | return func_opencv_to_base64(image) 71 | 72 | def func_opencv_to_base64(image): 73 | _, buffer = cv2.imencode('.jpg', image) 74 | base64_image = base64.b64encode(buffer).decode('utf-8') 75 | return base64_image 76 | 77 | # deal with text 78 | def func_nyp_to_text(npy_path): 79 | text = np.load(npy_path).tolist() 80 | text = text.strip() 81 | text = text.replace('\n', '') # remove \n 82 | text = text.replace('\t', '') # remove \t 83 | text = text.strip() 84 | return text 85 | 86 | # --------------------------------------------------------------------- 87 | ## Translation 88 | # --------------------------------------------------------------------- 89 | def get_translate_eng2chi(text, model='gpt-3.5-turbo-16k-0613'): 90 | if len(text) == 0: 91 | return "" 92 | 93 | prompt = f""" 94 | 请将以下输入翻译为中文: 95 | 96 | 输入:{text} 97 | 98 | 输出: 99 | """ 100 | response = get_completion(prompt, model) 101 | response = func_postprocess_chatgpt(response) 102 | print (text) 103 | print (response) 104 | return response 105 | 106 | 107 | def get_translate_chi2eng(text, model='gpt-3.5-turbo-16k-0613'): 108 | if len(text)==0: 109 | return "" 110 | 111 | prompt = f""" 112 | 请将以下输入翻译为英文: 113 | 114 | 输入:{text} 115 | 116 | 输出: 117 | """ 118 | response = get_completion(prompt, model) 119 | response = func_postprocess_chatgpt(response) 120 | print (text) 121 | print (response) 122 | return response 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | ## text input [test ok] 128 | text = 'The whether is sooooo good!!' 
129 | get_translate_eng2chi(text, model='gpt-3.5-turbo-16k-0613') 130 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/lmf.py: -------------------------------------------------------------------------------- 1 | """ 2 | paper: Efficient Low-rank Multimodal Fusion with Modality-Specific Factors 3 | From: https://github.com/Justin1904/Low-rank-Multimodal-Fusion 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn.init import xavier_normal_ 8 | from torch.nn.parameter import Parameter 9 | from .modules.encoder import MLPEncoder, LSTMEncoder 10 | 11 | class LMF(nn.Module): 12 | 13 | def __init__(self, args): 14 | super(LMF, self).__init__() 15 | 16 | # load input and output dim 17 | text_dim = args.text_dim 18 | audio_dim = args.audio_dim 19 | video_dim = args.video_dim 20 | output_dim1 = args.output_dim1 21 | output_dim2 = args.output_dim2 22 | rank = args.rank 23 | dropout = args.dropout 24 | hidden_dim = args.hidden_dim 25 | self.grad_clip = args.grad_clip 26 | 27 | # define the pre-fusion subnetworks 28 | if args.feat_type in ['utt']: 29 | self.audio_encoder = MLPEncoder(audio_dim, hidden_dim, dropout) 30 | self.text_encoder = MLPEncoder(text_dim, hidden_dim, dropout) 31 | self.video_encoder = MLPEncoder(video_dim, hidden_dim, dropout) 32 | elif args.feat_type in ['frm_align', 'frm_unalign']: 33 | self.audio_encoder = LSTMEncoder(audio_dim, hidden_dim, dropout) 34 | self.text_encoder = LSTMEncoder(text_dim, hidden_dim, dropout) 35 | self.video_encoder = LSTMEncoder(video_dim, hidden_dim, dropout) 36 | 37 | # define the post_fusion layers 38 | self.output_dim = hidden_dim // 2 39 | self.post_fusion_dropout = nn.Dropout(p=dropout) 40 | self.audio_factor = Parameter(torch.Tensor(rank, hidden_dim + 1, self.output_dim)) 41 | self.video_factor = Parameter(torch.Tensor(rank, hidden_dim + 1, self.output_dim)) 42 | self.text_factor = Parameter(torch.Tensor(rank, hidden_dim + 1, self.output_dim)) 43 | self.fusion_weights = Parameter(torch.Tensor(1, rank)) 44 | self.fusion_bias = Parameter(torch.Tensor(1, self.output_dim)) 45 | 46 | # init teh factors 47 | xavier_normal_(self.audio_factor) 48 | xavier_normal_(self.video_factor) 49 | xavier_normal_(self.text_factor) 50 | xavier_normal_(self.fusion_weights) 51 | self.fusion_bias.data.fill_(0) 52 | 53 | self.fc_out_1 = nn.Linear(self.output_dim, output_dim1) 54 | self.fc_out_2 = nn.Linear(self.output_dim, output_dim2) 55 | 56 | 57 | def forward(self, batch): 58 | ''' 59 | Args: 60 | audio_x: tensor of shape (batch_size, audio_in) 61 | video_x: tensor of shape (batch_size, video_in) 62 | text_x: tensor of shape (batch_size, text_in) 63 | ''' 64 | audio_h = self.audio_encoder(batch['audios']) 65 | video_h = self.video_encoder(batch['videos']) 66 | text_h = self.text_encoder(batch['texts']) 67 | batch_size = audio_h.data.shape[0] 68 | 69 | # next we perform low-rank multimodal fusion 70 | # here is a more efficient implementation than the one the paper describes 71 | # basically swapping the order of summation and elementwise product 72 | # next we perform "tensor fusion", which is essentially appending 1s to the tensors and take Kronecker product 73 | add_one = torch.ones(size=[batch_size, 1], requires_grad=False).type_as(audio_h).to(audio_h.device) 74 | _audio_h = torch.cat((add_one, audio_h), dim=1) 75 | _video_h = torch.cat((add_one, video_h), dim=1) 76 | _text_h = torch.cat((add_one, text_h), dim=1) 77 | 78 | # torch.matmul() 处理时会将 [batch, feat+1] -> [rank, batch, feat+1], 
看结果就好像把 [feat+1] 分解为 rank * [hidden] 79 | fusion_audio = torch.matmul(_audio_h, self.audio_factor) # [batch, feat+1] * [rank, feat+1, hidden] = [rank, batch, hidden] 80 | fusion_video = torch.matmul(_video_h, self.video_factor) 81 | fusion_text = torch.matmul(_text_h, self.text_factor ) 82 | fusion_zy = fusion_audio * fusion_video * fusion_text # [rank, batch, hidden] 83 | 84 | # use linear transformation instead of simple summation, more flexibility 85 | output = torch.matmul(self.fusion_weights, fusion_zy.permute(1, 0, 2)).squeeze() + self.fusion_bias # [1, rank] * [batch, rank, hidden] -> [batch, hidden] 86 | features = output.view(-1, self.output_dim) 87 | 88 | emos_out = self.fc_out_1(features) 89 | vals_out = self.fc_out_2(features) 90 | interloss = torch.tensor(0).cuda() 91 | 92 | return features, emos_out, vals_out, interloss 93 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/extract_manet_embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn.parallel 7 | import torch.optim 8 | import torch.utils.data 9 | import torchvision.transforms as transforms 10 | 11 | # import config 12 | import sys 13 | sys.path.append('../../') 14 | import config 15 | from dataset import FaceDataset 16 | from manet.model.manet import manet 17 | 18 | class RecorderMeter(object): 19 | """Computes and stores the minimum loss value and its epoch index""" 20 | 21 | def __init__(self, total_epoch): 22 | self.reset(total_epoch) 23 | 24 | def reset(self, total_epoch): 25 | self.total_epoch = total_epoch 26 | self.current_epoch = 0 27 | self.epoch_losses = np.zeros((self.total_epoch, 2), dtype=np.float32) # [epoch, train/val] 28 | self.epoch_accuracy = np.zeros((self.total_epoch, 2), dtype=np.float32) # [epoch, train/val] 29 | 30 | def extract(data_loader, model): 31 | model.eval() 32 | with torch.no_grad(): 33 | features, timestamps = [], [] 34 | for images, names in data_loader: 35 | images = images.cuda() 36 | embedding = model(images, return_embedding=True) 37 | features.append(embedding.cpu().detach().numpy()) 38 | timestamps.extend(names) 39 | features, timestamps = np.row_stack(features), np.array(timestamps) 40 | return features, timestamps 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser(description='Run.') 44 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset') 45 | parser.add_argument('--feature_level', type=str, default='UTTERANCE', help='feature level [FRAME or UTTERANCE]') 46 | parser.add_argument('--gpu', type=str, default='1', help='gpu id') 47 | params = parser.parse_args() 48 | os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu 49 | 50 | print(f'==> Extracting manet embedding...') 51 | face_dir = config.PATH_TO_RAW_FACE[params.dataset] 52 | save_dir = os.path.join(config.PATH_TO_FEATURES[params.dataset], f'manet_{params.feature_level[:3]}') 53 | if not os.path.exists(save_dir): os.makedirs(save_dir) 54 | 55 | # load model 56 | model = manet(num_classes=7).cuda() 57 | checkpoint_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, 'manet/[02-08]-[21-19]-model_best-acc88.33.pth') 58 | checkpoint = torch.load(checkpoint_file) 59 | pre_trained_dict = {k.replace('module.', ''): v for k,v in checkpoint['state_dict'].items()} 60 | model.load_state_dict(pre_trained_dict) 61 | 62 | # transform 63 | transform = 
transforms.Compose([transforms.Resize((224, 224)), 64 | transforms.ToTensor()]) 65 | 66 | # extract embedding video by video 67 | vids = os.listdir(face_dir) 68 | EMBEDDING_DIM = -1 69 | print(f'Find total "{len(vids)}" videos.') 70 | for i, vid in enumerate(vids, 1): 71 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 72 | 73 | # forward 74 | dataset = FaceDataset(vid, face_dir, transform=transform) 75 | if len(dataset) == 0: 76 | print("Warning: number of frames of video {} should not be zero.".format(vid)) 77 | embeddings, framenames = [], [] 78 | else: 79 | data_loader = torch.utils.data.DataLoader(dataset, 80 | batch_size=32, 81 | num_workers=4, 82 | pin_memory=True) 83 | embeddings, framenames = extract(data_loader, model) 84 | 85 | # save results 86 | indexes = np.argsort(framenames) 87 | embeddings = embeddings[indexes] 88 | framenames = framenames[indexes] 89 | EMBEDDING_DIM = max(EMBEDDING_DIM, np.shape(embeddings)[-1]) 90 | 91 | save_file = os.path.join(save_dir, f'{vid}.npy') 92 | if params.feature_level == 'FRAME': 93 | embeddings = np.array(embeddings).squeeze() 94 | if len(embeddings) == 0: 95 | embeddings = np.zeros((1, EMBEDDING_DIM)) 96 | elif len(embeddings.shape) == 1: 97 | embeddings = embeddings[np.newaxis, :] 98 | np.save(save_file, embeddings) 99 | else: 100 | embeddings = np.array(embeddings).squeeze() 101 | if len(embeddings) == 0: 102 | embeddings = np.zeros((EMBEDDING_DIM, )) 103 | elif len(embeddings.shape) == 2: 104 | embeddings = np.mean(embeddings, axis=0) 105 | np.save(save_file, embeddings) 106 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/model/vgg_vd_face_fer_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Vgg_vd_face_fer_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Vgg_vd_face_fer_dag, self).__init__() 11 | self.meta = {'mean': [129.186279296875, 104.76238250732422, 93.59396362304688], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [224, 224, 3]} 14 | self.conv1_1 = nn.Conv2d(3, 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 15 | self.relu1_1 = nn.ReLU() 16 | self.conv1_2 = nn.Conv2d(64, 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 17 | self.relu1_2 = nn.ReLU() 18 | self.pool1 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 19 | self.conv2_1 = nn.Conv2d(64, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 20 | self.relu2_1 = nn.ReLU() 21 | self.conv2_2 = nn.Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 22 | self.relu2_2 = nn.ReLU() 23 | self.pool2 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 24 | self.conv3_1 = nn.Conv2d(128, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 25 | self.relu3_1 = nn.ReLU() 26 | self.conv3_2 = nn.Conv2d(256, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 27 | self.relu3_2 = nn.ReLU() 28 | self.conv3_3 = nn.Conv2d(256, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 29 | self.relu3_3 = nn.ReLU() 30 | self.pool3 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 31 | self.conv4_1 = nn.Conv2d(256, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 32 | self.relu4_1 = nn.ReLU() 33 | self.conv4_2 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 34 | self.relu4_2 = 
nn.ReLU() 35 | self.conv4_3 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 36 | self.relu4_3 = nn.ReLU() 37 | self.pool4 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 38 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 39 | self.relu5_1 = nn.ReLU() 40 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 41 | self.relu5_2 = nn.ReLU() 42 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 43 | self.relu5_3 = nn.ReLU() 44 | self.pool5 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 45 | self.fc6 = nn.Conv2d(512, 4096, kernel_size=[7, 7], stride=(1, 1)) 46 | self.relu6 = nn.ReLU() 47 | self.fc7 = nn.Linear(in_features=4096, out_features=4096, bias=True) 48 | self.relu7 = nn.ReLU() 49 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 50 | 51 | def forward(self, data): 52 | x1 = self.conv1_1(data) 53 | x2 = self.relu1_1(x1) 54 | x3 = self.conv1_2(x2) 55 | x4 = self.relu1_2(x3) 56 | x5 = self.pool1(x4) 57 | x6 = self.conv2_1(x5) 58 | x7 = self.relu2_1(x6) 59 | x8 = self.conv2_2(x7) 60 | x9 = self.relu2_2(x8) 61 | x10 = self.pool2(x9) 62 | x11 = self.conv3_1(x10) 63 | x12 = self.relu3_1(x11) 64 | x13 = self.conv3_2(x12) 65 | x14 = self.relu3_2(x13) 66 | x15 = self.conv3_3(x14) 67 | x16 = self.relu3_3(x15) 68 | x17 = self.pool3(x16) 69 | x18 = self.conv4_1(x17) 70 | x19 = self.relu4_1(x18) 71 | x20 = self.conv4_2(x19) 72 | x21 = self.relu4_2(x20) 73 | x22 = self.conv4_3(x21) 74 | x23 = self.relu4_3(x22) 75 | x24 = self.pool4(x23) 76 | x25 = self.conv5_1(x24) 77 | x26 = self.relu5_1(x25) 78 | x27 = self.conv5_2(x26) 79 | x28 = self.relu5_2(x27) 80 | x29 = self.conv5_3(x28) 81 | x30 = self.relu5_3(x29) 82 | x31 = self.pool5(x30) 83 | x32 = self.fc6(x31) 84 | x33_preflatten = self.relu6(x32) 85 | x33 = x33_preflatten.view(x33_preflatten.size(0), -1) 86 | x34 = self.fc7(x33) 87 | x35 = self.relu7(x34) 88 | prediction = self.fc8(x35) 89 | return prediction 90 | 91 | def vgg_vd_face_fer_dag(weights_path=None, **kwargs): 92 | """ 93 | load imported model instance 94 | 95 | Args: 96 | weights_path (str): If set, loads model weights from the given path 97 | """ 98 | model = Vgg_vd_face_fer_dag() 99 | if weights_path: 100 | state_dict = torch.load(weights_path) 101 | model.load_state_dict(state_dict) 102 | return model -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/fer2013/fer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Fer2013 benchmark 3 | 4 | The module evaluates the performance of a pytorch model on the FER2013 5 | benchmark. 
6 | """ 7 | 8 | from __future__ import division 9 | 10 | import os 11 | import time 12 | 13 | import torch 14 | import numpy as np 15 | import torch.utils.data 16 | import torch.backends.cudnn as cudnn 17 | from fer2013.fer_loader import Fer2013Dataset, Fer2013PlusDataset 18 | from utils.benchmark_helpers import compose_transforms 19 | 20 | def fer2013_benchmark(model, data_dir, res_cache, refresh_cache, 21 | batch_size=256, num_workers=2, fer_plus=False): 22 | if not refresh_cache: # load result from cache, if available 23 | if os.path.isfile(res_cache): 24 | res = torch.load(res_cache) 25 | prec1_val, prec1_test = res['prec1_val'], res['prec1_test'] 26 | print("=> loaded results from '{}'".format(res_cache)) 27 | info = (prec1_val, prec1_test, res['speed']) 28 | msg = 'val acc: {:.2f}, test acc: {:.2f}, Speed: {:.1f}Hz' 29 | print(msg.format(*info)) 30 | return 31 | 32 | meta = model.meta 33 | cudnn.benchmark = True 34 | model = torch.nn.DataParallel(model).cuda() 35 | preproc_transforms = compose_transforms(meta, center_crop=False) 36 | if fer_plus: 37 | dataset = Fer2013PlusDataset 38 | else: 39 | dataset = Fer2013Dataset 40 | speeds = [] 41 | res = {} 42 | for mode in 'val', 'test': 43 | loader = torch.utils.data.DataLoader( 44 | dataset(data_dir, mode=mode, transform=preproc_transforms), 45 | batch_size=batch_size, shuffle=False, 46 | num_workers=num_workers, pin_memory=True) 47 | prec1, speed = validate(loader, model, mode) 48 | res['prec1_{}'.format(mode)] = prec1 49 | speeds.append(speed) 50 | res['speed'] = np.mean(speed) 51 | torch.save(res, res_cache) 52 | 53 | def validate(val_loader, model, mode): 54 | model.eval() 55 | top1 = AverageMeter() 56 | speed = WarmupAverageMeter() 57 | end = time.time() 58 | with torch.no_grad(): 59 | for ii, (ims, target) in enumerate(val_loader): 60 | # target = target.cuda(async=True) 61 | target = target.cuda() 62 | output = model(ims) # compute output 63 | prec1, = accuracy(output.data, target, topk=(1,)) 64 | top1.update(prec1[0], ims.size(0)) 65 | speed.update(time.time() - end, ims.size(0)) 66 | end = time.time() 67 | if ii % 10 == 0: 68 | msg = ('{0}: [{1}/{2}]\tSpeed {speed.current:.1f}Hz\t' 69 | '({speed.avg:.1f})Hz\tPrec@1 {top1.avg:.3f}') 70 | print(msg.format(mode, ii, len(val_loader), 71 | speed=speed, top1=top1)) 72 | print(' * Accuracy {0:.3f}'.format(top1.avg)) 73 | return top1.avg, speed.avg 74 | 75 | class WarmupAverageMeter(object): 76 | """Computes and stores the average and current value, after a fixed 77 | warmup period (useful for approximate benchmarking) 78 | 79 | Args: 80 | warmup (int) [3]: The number of updates to be ignored before the 81 | average starts to be computed. 
82 | """ 83 | def __init__(self, warmup=3): 84 | self.reset() 85 | self.warmup = warmup 86 | 87 | def reset(self): 88 | self.avg = 0 89 | self.current = 0 90 | self.delta_sum = 0 91 | self.count = 0 92 | self.warmup_count = 0 93 | 94 | def update(self, delta, n): 95 | self.warmup_count = self.warmup_count + 1 96 | if self.warmup_count >= self.warmup: 97 | self.current = n / delta 98 | self.delta_sum += delta 99 | self.count += n 100 | self.avg = self.count / self.delta_sum 101 | 102 | class AverageMeter(object): 103 | """Computes and stores the average and current value""" 104 | def __init__(self): 105 | self.reset() 106 | 107 | def reset(self): 108 | self.val = 0 109 | self.avg = 0 110 | self.sum = 0 111 | self.count = 0 112 | 113 | def update(self, val, n=1): 114 | self.val = val 115 | self.sum += val * n 116 | self.count += n 117 | self.avg = self.sum / self.count 118 | 119 | def accuracy(output, target, topk=(1,)): 120 | """Computes the precision@k for the specified values of k""" 121 | maxk = max(topk) 122 | batch_size = target.size(0) 123 | output = output.squeeze(-1).squeeze(-1) 124 | _, pred = output.topk(maxk, 1, True, True) 125 | pred = pred.t() 126 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 127 | 128 | res = [] 129 | for k in topk: 130 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 131 | res.append(correct_k.mul_(100.0 / batch_size)) 132 | return res 133 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/sims.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from toolkit.utils.functions import * 4 | from toolkit.utils.read_files import * 5 | 6 | def func_convert_name_to_newname(video_id, clip_id): 7 | newname = video_id + '_%04d' %(clip_id) 8 | return newname 9 | 10 | def func_merge_id_to_path(video_id, clip_id, video_root): 11 | video_path = os.path.join(video_root, video_id, '%04d.mp4' %(clip_id)) 12 | return video_path 13 | 14 | # label_path -> (video_paths, labels) 15 | def read_labels(label_path, video_root): 16 | video_ids = func_read_key_from_csv(label_path, 'video_id') 17 | clip_ids = func_read_key_from_csv(label_path, 'clip_id') 18 | labels = func_read_key_from_csv(label_path, 'label') 19 | print (f'label range -> min:{min(labels)} max:{max(labels)}') 20 | print (f'whole sample number: {len(labels)}') 21 | 22 | video_paths = [] 23 | for ii in range(len(video_ids)): 24 | video_path = func_merge_id_to_path(video_ids[ii], clip_ids[ii], video_root) 25 | video_paths.append(video_path) 26 | 27 | return video_paths, labels 28 | 29 | # 只读取 idx_path 对应的 items并返回 30 | def gain_sub_items(video_paths, labels, idx_path): 31 | indexes = func_read_key_from_csv(idx_path, 'index') 32 | video_paths = np.array(video_paths)[indexes] 33 | labels = np.array(labels)[indexes] 34 | print (f'subset sample number: {len(labels)}') 35 | return video_paths, labels 36 | 37 | # 转化为 newname 对应的 trans 38 | def update_transcription(trans_path, save_path): 39 | video_ids = func_read_key_from_csv(trans_path, 'video_id') 40 | clip_ids = func_read_key_from_csv(trans_path, 'clip_id') 41 | chi_subtitles = func_read_key_from_csv(trans_path, 'Chinese') 42 | eng_subtitles = func_read_key_from_csv(trans_path, 'English') 43 | print (f'whole sample number: {len(video_ids)}') 44 | 45 | newnames = [] 46 | for ii in range(len(video_ids)): 47 | newname = func_convert_name_to_newname(video_ids[ii], clip_ids[ii]) 48 | newnames.append(newname) 49 | 50 | 
name2key = {} 51 | for ii, name in enumerate(newnames): 52 | name2key[name] = [chi_subtitles[ii], eng_subtitles[ii]] 53 | func_write_key_to_csv(save_path, newnames, name2key, ['chinese', 'english']) 54 | 55 | 56 | # ------------------- main process ------------------- 57 | def normalize_dataset_format(data_root, save_root): 58 | # gain paths 59 | video_root = os.path.join(data_root, 'Raw') 60 | label_path = os.path.join(data_root, 'metadata/sentiment/label_M.csv') 61 | train_idx_path = os.path.join(data_root, 'metadata/train_index.csv') 62 | val_idx_path = os.path.join(data_root, 'metadata/val_index.csv') 63 | test_idx_path = os.path.join(data_root, 'metadata/test_index.csv') 64 | trans_path = os.path.join(data_root, 'metadata/Translation.csv') 65 | 66 | # read all items 67 | video_paths, labels = read_labels(label_path, video_root) 68 | train_video, train_label = gain_sub_items(video_paths, labels, train_idx_path) 69 | val_video, val_label = gain_sub_items(video_paths, labels, val_idx_path) 70 | test_video, test_label = gain_sub_items(video_paths, labels, test_idx_path) 71 | 72 | ## output path 73 | save_video = os.path.join(save_root, 'video') 74 | save_label = os.path.join(save_root, 'label.npz') 75 | save_trans = os.path.join(save_root, 'transcription.csv') 76 | if not os.path.exists(save_root): os.makedirs(save_root) 77 | if not os.path.exists(save_video): os.makedirs(save_video) 78 | 79 | ## generate new transcripts 80 | update_transcription(trans_path, save_trans) 81 | 82 | ## generate label path 83 | whole_corpus = {} 84 | for name, video_paths, labels in [('train', train_video, train_label), 85 | ('val', val_video, val_label ), 86 | ('test', test_video, test_label )]: 87 | whole_corpus[name] = {} 88 | print (f'{name}: sample number: {len(video_paths)}') 89 | for ii, video_path in enumerate(video_paths): 90 | video_name = video_path.split('/')[-2] 91 | clip_name = video_path.split('/')[-1] 92 | save_path = os.path.join(save_video, f'{video_name}_{clip_name}') 93 | shutil.copy(video_path, save_path) 94 | 95 | save_name = os.path.basename(save_path)[:-4] 96 | whole_corpus[name][save_name] = {'emo': 0, 'val': labels[ii]} 97 | 98 | np.savez_compressed(save_label, 99 | train_corpus=whole_corpus['train'], 100 | val_corpus=whole_corpus['val'], 101 | test_corpus=whole_corpus['test']) 102 | 103 | if __name__ == '__main__': 104 | data_root = '/data/lianzheng/chinese-mer-2023/CH-SIMS' 105 | save_root = '/data/lianzheng/chinese-mer-2023/CH-SIMS-process' 106 | normalize_dataset_format(data_root, save_root) 107 | 108 | # data_root = 'H:\\desktop\\Multimedia-Transformer\\chinese-mer-2023\\dataset\\sims-process' 109 | # trans_path = os.path.join(data_root, 'transcription.csv') 110 | # polish_path = os.path.join(data_root, 'transcription-engchi-polish.csv') 111 | # func_translate_transcript_polish_merge(trans_path, polish_path) 112 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/imagenet/imagenet.py.bak: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Imagenet validation set benchmark 3 | 4 | The module evaluates the performance of a pytorch model on the ILSVRC 2012 5 | validation set. 
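A minimal usage sketch (paths are placeholders; `model` is assumed to expose the
`meta` attribute that compose_transforms consumes):

    imagenet_benchmark(model, data_dir='data/ILSVRC2012',
                       res_cache='res_cache/imagenet.pth', refresh_cache=True)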
6 | 7 | Based on PyTorch imagenet example: 8 | https://github.com/pytorch/examples/tree/master/imagenet 9 | """ 10 | 11 | from __future__ import division 12 | 13 | import os 14 | import time 15 | 16 | from PIL import ImageFile 17 | import torch 18 | import torch.nn.parallel 19 | import torch.utils.data 20 | import torch.backends.cudnn as cudnn 21 | import torchvision.datasets as datasets 22 | from utils.benchmark_helpers import compose_transforms 23 | 24 | ImageFile.LOAD_TRUNCATED_IMAGES = True 25 | 26 | def imagenet_benchmark(model, data_dir, res_cache, refresh_cache, 27 | batch_size=256, num_workers=20, 28 | remove_blacklist=False, center_crop=True): 29 | if not refresh_cache: # load result from cache, if available 30 | if os.path.isfile(res_cache): 31 | res = torch.load(res_cache) 32 | prec1, prec5, speed = res['prec1'], res['prec5'], res['speed'] 33 | print("=> loaded results from '{}'".format(res_cache)) 34 | info = (100 - prec1, 100 - prec5, speed) 35 | msg = 'Top 1 err: {:.2f}, Top 5 err: {:.2f}, Speed: {:.1f}Hz' 36 | print(msg.format(*info)) 37 | return 38 | 39 | meta = model.meta 40 | cudnn.benchmark = True 41 | model = torch.nn.DataParallel(model).cuda() 42 | if remove_blacklist: 43 | subset = 'val_blacklisted' 44 | else: 45 | subset = 'val' 46 | valdir = os.path.join(data_dir, subset) 47 | preproc_transforms = compose_transforms(meta, center_crop=center_crop) 48 | val_loader = torch.utils.data.DataLoader( 49 | datasets.ImageFolder(valdir, preproc_transforms), 50 | batch_size=batch_size, shuffle=False, 51 | num_workers=num_workers, pin_memory=True) 52 | prec1, prec5, speed = validate(val_loader, model) 53 | torch.save({'prec1': prec1, 'prec5': prec5, 'speed': speed}, res_cache) 54 | 55 | def validate(val_loader, model): 56 | model.eval() 57 | top1 = AverageMeter() 58 | top5 = AverageMeter() 59 | speed = WarmupAverageMeter() 60 | end = time.time() 61 | with torch.no_grad(): 62 | for ii, (ims, target) in enumerate(val_loader): 63 | target = target.cuda(async=True) 64 | # ims_var = torch.autograd.Variable(ims, volatile=True) 65 | output = model(ims) # compute output 66 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 67 | top1.update(prec1[0], ims.size(0)) 68 | top5.update(prec5[0], ims.size(0)) 69 | speed.update(time.time() - end, ims.size(0)) 70 | end = time.time() 71 | if ii % 10 == 0: 72 | msg = ('Test: [{0}/{1}]\tSpeed {speed.current:.1f}Hz\t' 73 | '({speed.avg:.1f})Hz\tPrec@1 {top1.avg:.3f} ' 74 | '{top5.avg:.3f}') 75 | print(msg.format(ii, len(val_loader), speed=speed, 76 | top1=top1, top5=top5)) 77 | top1_err, top5_err = 100 - top1.avg, 100 - top5.avg 78 | print(' * Err@1 {0:.3f} Err@5 {1:.3f}'.format(top1_err, top5_err)) 79 | 80 | return top1.avg, top5.avg, speed.avg 81 | 82 | class WarmupAverageMeter(object): 83 | """Computes and stores the average and current value, after a fixed 84 | warmup period (useful for approximate benchmarking) 85 | 86 | Args: 87 | warmup (int) [3]: The number of updates to be ignored before the 88 | average starts to be computed. 
89 | """ 90 | def __init__(self, warmup=3): 91 | self.reset() 92 | self.warmup = warmup 93 | 94 | def reset(self): 95 | self.avg = 0 96 | self.current = 0 97 | self.delta_sum = 0 98 | self.count = 0 99 | self.warmup_count = 0 100 | 101 | def update(self, delta, n): 102 | self.warmup_count = self.warmup_count + 1 103 | if self.warmup_count >= self.warmup: 104 | self.current = n / delta 105 | self.delta_sum += delta 106 | self.count += n 107 | self.avg = self.count / self.delta_sum 108 | 109 | class AverageMeter(object): 110 | """Computes and stores the average and current value""" 111 | def __init__(self): 112 | self.reset() 113 | 114 | def reset(self): 115 | self.val = 0 116 | self.avg = 0 117 | self.sum = 0 118 | self.count = 0 119 | 120 | def update(self, val, n=1): 121 | self.val = val 122 | self.sum += val * n 123 | self.count += n 124 | self.avg = self.sum / self.count 125 | 126 | def accuracy(output, target, topk=(1,)): 127 | """Computes the precision@k for the specified values of k""" 128 | maxk = max(topk) 129 | batch_size = target.size(0) 130 | 131 | _, pred = output.topk(maxk, 1, True, True) 132 | pred = pred.t() 133 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 134 | 135 | res = [] 136 | for k in topk: 137 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 138 | res.append(correct_k.mul_(100.0 / batch_size)) 139 | return res 140 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/read_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import tqdm 4 | import math 5 | import pickle 6 | import numpy as np 7 | import multiprocessing 8 | 9 | from ..globals import * 10 | from .functions import * 11 | from .read_files import * 12 | 13 | ############################################################ 14 | # ------ for feat: feature_root+name -> (seqlen, featdim) ------ 15 | def func_read_one_feat(argv=None, feature_root=None, name=None, processor=None, model_name=None): 16 | feature_root, name, processor, model_name = argv 17 | 18 | # 路径可能的两个选项 19 | feature_path = os.path.join(feature_root, name+'.npy') 20 | feature_dir = os.path.join(feature_root, name) 21 | 22 | feature = [] 23 | if os.path.exists(feature_path): # audio/text => belong to speaker 24 | single_feature = np.load(feature_path) 25 | single_feature = single_feature.squeeze() # [Dim, ] or [Time, Dim] 26 | feature.append(single_feature) 27 | elif os.path.isdir(feature_dir): 28 | facenames = os.listdir(feature_dir) # 如果是文件夹,则依次读取文件夹内所有信息 29 | for facename in sorted(facenames): 30 | facefeat = np.load(os.path.join(feature_dir, facename)) 31 | feature.append(facefeat) 32 | else: 33 | raise Exception('feature path or dir do not exist!') 34 | 35 | # feature -> (seqlen, featdim) 36 | single_feature = np.array(feature).squeeze() 37 | if len(single_feature) == 0: 38 | print ('feature has errors!!') 39 | elif len(single_feature.shape) == 1: 40 | single_feature = single_feature[np.newaxis, :] 41 | return single_feature 42 | 43 | 44 | # model_name:表示用的哪个预训练模型 45 | # read multiple data [different datasets need different processors] 46 | def func_read_multiprocess(feature_root, names, processor=None, read_type='feat', model_name=None): 47 | ## names => features 48 | params = [] 49 | for name in names: 50 | params.append((feature_root, name, processor, model_name)) 51 | 52 | # ------ debug ------ 53 | # func_read_one_feat(params[0]) 54 | # func_read_one_e2e_video(params[0]) 55 | # 
func_read_one_e2e_audio(params[0]) 56 | 57 | features = [] 58 | with multiprocessing.Pool(processes=8) as pool: 59 | if read_type == 'feat': 60 | features = list(tqdm.tqdm(pool.imap(func_read_one_feat, params), total=len(params))) 61 | 62 | ## save (names, features) 63 | feature_shape = np.array(features[0]).shape 64 | feature_name = os.path.basename(feature_root) 65 | print (f'Input feature {feature_name} ===> dim is {feature_shape}') 66 | assert len(names) == len(features), f'Error: len(names) != len(features)' 67 | return features, feature_shape[-1] 68 | 69 | 70 | ############################################################ 71 | # (seqlen, featdim) -> (dst_len, featdim) 72 | def func_mapping_feature(feature, dst_len): 73 | featlen, featdim = feature.shape 74 | if featlen == dst_len: 75 | return feature 76 | elif featlen < dst_len: 77 | pad_feature = np.zeros((dst_len-featlen, featdim)) 78 | feature = np.concatenate((pad_feature, feature), axis=0) 79 | else: 80 | if featlen // dst_len == featlen / dst_len: 81 | pad_len = 0 82 | pool_size = featlen // dst_len 83 | else: 84 | pad_len = dst_len - featlen % dst_len 85 | pool_size = featlen // dst_len + 1 86 | pad_feature = np.zeros((pad_len, featdim)) 87 | feature = np.concatenate([pad_feature, feature]).reshape(dst_len, pool_size, featdim) # 相邻时刻特征取平均 88 | feature = np.mean(feature, axis=1) 89 | return feature 90 | 91 | # sample-level 92 | def align_to_utt(audios, texts, videos): 93 | for ii in range(len(audios)): 94 | audios[ii] = np.mean(audios[ii], axis=0) 95 | texts[ii] = np.mean(texts[ii], axis=0) 96 | videos[ii] = np.mean(videos[ii], axis=0) 97 | return audios, texts, videos 98 | 99 | # sample-level: 每个模态的特征长度压缩到原来的scale倍 100 | def feature_scale_compress(audios, texts, videos, scale_factor=1): 101 | for ii in range(len(audios)): 102 | audios[ii] = func_mapping_feature(audios[ii], math.ceil(len(audios[ii]) / scale_factor)) 103 | texts[ii] = func_mapping_feature(texts[ii], math.ceil(len(texts[ii]) / scale_factor)) 104 | videos[ii] = func_mapping_feature(videos[ii], math.ceil(len(videos[ii]) / scale_factor)) 105 | return audios, texts, videos 106 | 107 | # sample-level: 三种模态压缩到文本长度 108 | def align_to_text(audios, texts, videos): 109 | for ii in range(len(audios)): 110 | dst_len = len(texts[ii]) 111 | audios[ii] = func_mapping_feature(audios[ii], dst_len) 112 | texts[ii] = func_mapping_feature(texts[ii], dst_len) 113 | videos[ii] = func_mapping_feature(videos[ii], dst_len) 114 | return audios, texts, videos 115 | 116 | # batch-level: generate batch 117 | def pad_to_maxlen_pre_modality(audios, texts, videos): 118 | audio_maxlen = max([len(feature) for feature in audios]) 119 | text_maxlen = max([len(feature) for feature in texts ]) 120 | video_maxlen = max([len(feature) for feature in videos]) 121 | for ii in range(len(audios)): 122 | audios[ii] = func_mapping_feature(audios[ii], audio_maxlen) 123 | texts[ii] = func_mapping_feature(texts[ii], text_maxlen) 124 | videos[ii] = func_mapping_feature(videos[ii], video_maxlen) 125 | return audios, texts, videos 126 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/read_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import tqdm 4 | import math 5 | import pickle 6 | import numpy as np 7 | import multiprocessing 8 | 9 | from ..globals import * 10 | from .functions import * 11 | from .read_files import * 12 | 13 | 
############################################################ 14 | # ------ for feat: feature_root+name -> (seqlen, featdim) ------ 15 | def func_read_one_feat(argv=None, feature_root=None, name=None, processor=None, model_name=None): 16 | feature_root, name, processor, model_name = argv 17 | 18 | # 路径可能的两个选项 19 | feature_path = os.path.join(feature_root, name+'.npy') 20 | feature_dir = os.path.join(feature_root, name) 21 | 22 | feature = [] 23 | if os.path.exists(feature_path): # audio/text => belong to speaker 24 | single_feature = np.load(feature_path) 25 | single_feature = single_feature.squeeze() # [Dim, ] or [Time, Dim] 26 | feature.append(single_feature) 27 | elif os.path.isdir(feature_dir): 28 | facenames = os.listdir(feature_dir) # 如果是文件夹,则依次读取文件夹内所有信息 29 | for facename in sorted(facenames): 30 | facefeat = np.load(os.path.join(feature_dir, facename)) 31 | feature.append(facefeat) 32 | else: 33 | raise Exception('feature path or dir do not exist!') 34 | 35 | # feature -> (seqlen, featdim) 36 | single_feature = np.array(feature).squeeze() 37 | if len(single_feature) == 0: 38 | print ('feature has errors!!') 39 | elif len(single_feature.shape) == 1: 40 | single_feature = single_feature[np.newaxis, :] 41 | return single_feature 42 | 43 | 44 | # model_name:表示用的哪个预训练模型 45 | # read multiple data [different datasets need different processors] 46 | def func_read_multiprocess(feature_root, names, processor=None, read_type='feat', model_name=None): 47 | ## names => features 48 | params = [] 49 | for name in names: 50 | params.append((feature_root, name, processor, model_name)) 51 | 52 | # ------ debug ------ 53 | # func_read_one_feat(params[0]) 54 | # func_read_one_e2e_video(params[0]) 55 | # func_read_one_e2e_audio(params[0]) 56 | 57 | features = [] 58 | with multiprocessing.Pool(processes=8) as pool: 59 | if read_type == 'feat': 60 | features = list(tqdm.tqdm(pool.imap(func_read_one_feat, params), total=len(params))) 61 | 62 | ## save (names, features) 63 | feature_shape = np.array(features[0]).shape 64 | feature_name = os.path.basename(feature_root) 65 | print (f'Input feature {feature_name} ===> dim is {feature_shape}') 66 | assert len(names) == len(features), f'Error: len(names) != len(features)' 67 | return features, feature_shape[-1] 68 | 69 | 70 | ############################################################ 71 | # (seqlen, featdim) -> (dst_len, featdim) 72 | def func_mapping_feature(feature, dst_len): 73 | featlen, featdim = feature.shape 74 | if featlen == dst_len: 75 | return feature 76 | elif featlen < dst_len: 77 | pad_feature = np.zeros((dst_len-featlen, featdim)) 78 | feature = np.concatenate((pad_feature, feature), axis=0) 79 | else: 80 | if featlen // dst_len == featlen / dst_len: 81 | pad_len = 0 82 | pool_size = featlen // dst_len 83 | else: 84 | pad_len = dst_len - featlen % dst_len 85 | pool_size = featlen // dst_len + 1 86 | pad_feature = np.zeros((pad_len, featdim)) 87 | feature = np.concatenate([pad_feature, feature]).reshape(dst_len, pool_size, featdim) # 相邻时刻特征取平均 88 | feature = np.mean(feature, axis=1) 89 | return feature 90 | 91 | # sample-level 92 | def align_to_utt(audios, texts, videos): 93 | for ii in range(len(audios)): 94 | audios[ii] = np.mean(audios[ii], axis=0) 95 | texts[ii] = np.mean(texts[ii], axis=0) 96 | videos[ii] = np.mean(videos[ii], axis=0) 97 | return audios, texts, videos 98 | 99 | # sample-level: 每个模态的特征长度压缩到原来的scale倍 100 | def feature_scale_compress(audios, texts, videos, scale_factor=1): 101 | for ii in range(len(audios)): 102 | audios[ii] = 
func_mapping_feature(audios[ii], math.ceil(len(audios[ii]) / scale_factor)) 103 | texts[ii] = func_mapping_feature(texts[ii], math.ceil(len(texts[ii]) / scale_factor)) 104 | videos[ii] = func_mapping_feature(videos[ii], math.ceil(len(videos[ii]) / scale_factor)) 105 | return audios, texts, videos 106 | 107 | # sample-level: 三种模态压缩到文本长度 108 | def align_to_text(audios, texts, videos): 109 | for ii in range(len(audios)): 110 | dst_len = len(texts[ii]) 111 | audios[ii] = func_mapping_feature(audios[ii], dst_len) 112 | texts[ii] = func_mapping_feature(texts[ii], dst_len) 113 | videos[ii] = func_mapping_feature(videos[ii], dst_len) 114 | return audios, texts, videos 115 | 116 | # batch-level: generate batch 117 | def pad_to_maxlen_pre_modality(audios, texts, videos): 118 | audio_maxlen = max([len(feature) for feature in audios]) 119 | text_maxlen = max([len(feature) for feature in texts ]) 120 | video_maxlen = max([len(feature) for feature in videos]) 121 | for ii in range(len(audios)): 122 | audios[ii] = func_mapping_feature(audios[ii], audio_maxlen) 123 | texts[ii] = func_mapping_feature(texts[ii], text_maxlen) 124 | videos[ii] = func_mapping_feature(videos[ii], video_maxlen) 125 | return audios, texts, videos 126 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/imagenet/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Imagenet validation set benchmark 3 | 4 | The module evaluates the performance of a pytorch model on the ILSVRC 2012 5 | validation set. 6 | 7 | Based on PyTorch imagenet example: 8 | https://github.com/pytorch/examples/tree/master/imagenet 9 | """ 10 | 11 | from __future__ import division 12 | 13 | import os 14 | import time 15 | 16 | from PIL import ImageFile 17 | import torch 18 | import torch.nn.parallel 19 | import torch.utils.data 20 | import torch.backends.cudnn as cudnn 21 | import torchvision.datasets as datasets 22 | from utils.benchmark_helpers import compose_transforms 23 | 24 | ImageFile.LOAD_TRUNCATED_IMAGES = True 25 | 26 | 27 | def imagenet_benchmark(model, data_dir, res_cache, refresh_cache, batch_size=256, 28 | num_workers=20, remove_blacklist=False, center_crop=True, 29 | override_meta_imsize=False): 30 | if not refresh_cache: # load result from cache, if available 31 | if os.path.isfile(res_cache): 32 | res = torch.load(res_cache) 33 | prec1, prec5, speed = res['prec1'], res['prec5'], res['speed'] 34 | print("=> loaded results from '{}'".format(res_cache)) 35 | info = (100 - prec1, 100 - prec5, speed) 36 | msg = 'Top 1 err: {:.2f}, Top 5 err: {:.2f}, Speed: {:.1f}Hz' 37 | print(msg.format(*info)) 38 | return 39 | 40 | meta = model.meta 41 | cudnn.benchmark = True 42 | 43 | if override_meta_imsize: # NOTE REMOVE THIS LATER! 
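# When the meta-specified image size is overridden, replace the model's
# `features_8` module with adaptive average pooling so its output collapses
# to a 1x1 feature map regardless of the input resolution.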
44 | import torch.nn as nn 45 | model.features_8 = nn.AdaptiveAvgPool2d(1) 46 | 47 | model = torch.nn.DataParallel(model).cuda() 48 | if remove_blacklist: 49 | subset = 'val_blacklisted' 50 | else: 51 | subset = 'val' 52 | valdir = os.path.join(data_dir, subset) 53 | preproc_transforms = compose_transforms(meta, resize=256, center_crop=center_crop, 54 | override_meta_imsize=override_meta_imsize) 55 | val_loader = torch.utils.data.DataLoader( 56 | datasets.ImageFolder(valdir, preproc_transforms), batch_size=batch_size, 57 | shuffle=False, num_workers=num_workers, pin_memory=True) 58 | prec1, prec5, speed = validate(val_loader, model) 59 | torch.save({'prec1': prec1, 'prec5': prec5, 'speed': speed}, res_cache) 60 | 61 | 62 | def validate(val_loader, model): 63 | model.eval() 64 | top1 = AverageMeter() 65 | top5 = AverageMeter() 66 | speed = WarmupAverageMeter() 67 | end = time.time() 68 | with torch.no_grad(): 69 | for ii, (ims, target) in enumerate(val_loader): 70 | target = target.cuda() 71 | # ims_var = torch.autograd.Variable(ims, volatile=True) 72 | output = model(ims) # compute output 73 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 74 | top1.update(prec1[0], ims.size(0)) 75 | top5.update(prec5[0], ims.size(0)) 76 | speed.update(time.time() - end, ims.size(0)) 77 | end = time.time() 78 | if ii % 10 == 0: 79 | msg = ('Test: [{0}/{1}]\tSpeed {speed.current:.1f}Hz\t' 80 | '({speed.avg:.1f})Hz\tPrec@1 {top1.avg:.3f} ' 81 | '{top5.avg:.3f}') 82 | print(msg.format(ii, len(val_loader), speed=speed, top1=top1, 83 | top5=top5)) 84 | top1_err, top5_err = 100 - top1.avg, 100 - top5.avg 85 | print(' * Err@1 {0:.3f} Err@5 {1:.3f}'.format(top1_err, top5_err)) 86 | 87 | return top1.avg, top5.avg, speed.avg 88 | 89 | 90 | class WarmupAverageMeter(object): 91 | """Computes and stores the average and current value, after a fixed 92 | warmup period (useful for approximate benchmarking) 93 | 94 | Args: 95 | warmup (int) [3]: The number of updates to be ignored before the 96 | average starts to be computed. 
97 | """ 98 | 99 | def __init__(self, warmup=3): 100 | self.reset() 101 | self.warmup = warmup 102 | 103 | def reset(self): 104 | self.avg = 0 105 | self.current = 0 106 | self.delta_sum = 0 107 | self.count = 0 108 | self.warmup_count = 0 109 | 110 | def update(self, delta, n): 111 | self.warmup_count = self.warmup_count + 1 112 | if self.warmup_count >= self.warmup: 113 | self.current = n / delta 114 | self.delta_sum += delta 115 | self.count += n 116 | self.avg = self.count / self.delta_sum 117 | 118 | 119 | class AverageMeter(object): 120 | """Computes and stores the average and current value""" 121 | 122 | def __init__(self): 123 | self.reset() 124 | 125 | def reset(self): 126 | self.val = 0 127 | self.avg = 0 128 | self.sum = 0 129 | self.count = 0 130 | 131 | def update(self, val, n=1): 132 | self.val = val 133 | self.sum += val * n 134 | self.count += n 135 | self.avg = self.sum / self.count 136 | 137 | 138 | def accuracy(output, target, topk=(1, )): 139 | """Computes the precision@k for the specified values of k""" 140 | maxk = max(topk) 141 | batch_size = target.size(0) 142 | 143 | _, pred = output.topk(maxk, 1, True, True) 144 | pred = pred.t() 145 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 146 | 147 | res = [] 148 | for k in topk: 149 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 150 | res.append(correct_k.mul_(100.0 / batch_size)) 151 | return res 152 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_slim.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Defines the 'VGGish' model used to generate AudioSet embedding features. 17 | 18 | The public AudioSet release (https://research.google.com/audioset/download.html) 19 | includes 128-D features extracted from the embedding layer of a VGG-like model 20 | that was trained on a large Google-internal YouTube dataset. Here we provide 21 | a TF-Slim definition of the same model, without any dependences on libraries 22 | internal to Google. We call it 'VGGish'. 23 | 24 | Note that we only define the model up to the embedding layer, which is the 25 | penultimate layer before the final classifier layer. We also provide various 26 | hyperparameter values (in vggish_params.py) that were used to train this model 27 | internally. 
28 | 29 | For comparison, here is TF-Slim's VGG definition: 30 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py 31 | """ 32 | 33 | import tensorflow.compat.v1 as tf 34 | tf.disable_v2_behavior() 35 | import tf_slim as slim # version: 1.1.0, pip install tf-slim 36 | 37 | from vggish import vggish_params as params 38 | 39 | 40 | def define_vggish_slim(training=False): 41 | """Defines the VGGish TensorFlow model. 42 | 43 | All ops are created in the current default graph, under the scope 'vggish/'. 44 | 45 | The input is a placeholder named 'vggish/input_features' of type float32 and 46 | shape [batch_size, num_frames, num_bands] where batch_size is variable and 47 | num_frames and num_bands are constants, and [num_frames, num_bands] represents 48 | a log-mel-scale spectrogram patch covering num_bands frequency bands and 49 | num_frames time frames (where each frame step is usually 10ms). This is 50 | produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET). 51 | The output is an op named 'vggish/embedding' which produces the activations of 52 | a 128-D embedding layer, which is usually the penultimate layer when used as 53 | part of a full model with a final classifier layer. 54 | 55 | Args: 56 | training: If true, all parameters are marked trainable. 57 | 58 | Returns: 59 | The op 'vggish/embeddings'. 60 | """ 61 | # Defaults: 62 | # - All weights are initialized to N(0, INIT_STDDEV). 63 | # - All biases are initialized to 0. 64 | # - All activations are ReLU. 65 | # - All convolutions are 3x3 with stride 1 and SAME padding. 66 | # - All max-pools are 2x2 with stride 2 and SAME padding. 67 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 68 | weights_initializer=tf.truncated_normal_initializer( 69 | stddev=params.INIT_STDDEV), 70 | biases_initializer=tf.zeros_initializer(), 71 | activation_fn=tf.nn.relu, 72 | trainable=training), \ 73 | slim.arg_scope([slim.conv2d], 74 | kernel_size=[3, 3], stride=1, padding='SAME'), \ 75 | slim.arg_scope([slim.max_pool2d], 76 | kernel_size=[2, 2], stride=2, padding='SAME'), \ 77 | tf.variable_scope('vggish'): 78 | # Input: a batch of 2-D log-mel-spectrogram patches. 79 | features = tf.placeholder( 80 | tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS), 81 | name='input_features') 82 | # Reshape to 4-D so that we can convolve a batch with conv2d(). 83 | net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1]) 84 | 85 | # The VGG stack of alternating convolutions and max-pools. 86 | net = slim.conv2d(net, 64, scope='conv1') 87 | net = slim.max_pool2d(net, scope='pool1') 88 | net = slim.conv2d(net, 128, scope='conv2') 89 | net = slim.max_pool2d(net, scope='pool2') 90 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3') 91 | net = slim.max_pool2d(net, scope='pool3') 92 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4') 93 | net = slim.max_pool2d(net, scope='pool4') 94 | 95 | # Flatten before entering fully-connected layers 96 | net = slim.flatten(net) 97 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1') 98 | # The embedding layer. 99 | net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2') 100 | return tf.identity(net, name='embedding') 101 | 102 | 103 | def load_vggish_slim_checkpoint(session, checkpoint_path): 104 | """Loads a pre-trained VGGish-compatible checkpoint. 
105 | 106 | This function can be used as an initialization function (referred to as 107 | init_fn in TensorFlow documentation) which is called in a Session after 108 | initializating all variables. When used as an init_fn, this will load 109 | a pre-trained checkpoint that is compatible with the VGGish model 110 | definition. Only variables defined by VGGish will be loaded. 111 | 112 | Args: 113 | session: an active TensorFlow session. 114 | checkpoint_path: path to a file containing a checkpoint that is 115 | compatible with the VGGish model definition. 116 | """ 117 | # Get the list of names of all VGGish variables that exist in 118 | # the checkpoint (i.e., all inference-mode VGGish variables). 119 | with tf.Graph().as_default(): 120 | define_vggish_slim(training=False) 121 | vggish_var_names = [v.name for v in tf.global_variables()] 122 | 123 | # Get the list of all currently existing variables that match 124 | # the list of variable names we just computed. 125 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names] 126 | 127 | # Use a Saver to restore just the variables selected above. 128 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained', 129 | write_version=1) 130 | saver.restore(session, checkpoint_path) 131 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/mfn.py: -------------------------------------------------------------------------------- 1 | """ 2 | paper: Memory Fusion Network for Multi-View Sequential Learning 3 | From: https://github.com/pliang279/MFN 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | class MFN(nn.Module): 10 | def __init__(self, args): 11 | super(MFN, self).__init__() 12 | 13 | # params: analyze args 14 | audio_dim = args.audio_dim 15 | text_dim = args.text_dim 16 | video_dim = args.video_dim 17 | output_dim1 = args.output_dim1 18 | output_dim2 = args.output_dim2 19 | dropout = args.dropout 20 | self.mem_dim = args.mem_dim 21 | self.hidden_dim = args.hidden_dim 22 | self.grad_clip = args.grad_clip 23 | 24 | # params: intermedia 25 | total_h_dim = self.hidden_dim * 3 26 | attInShape = total_h_dim * args.window_dim 27 | gammaInShape = attInShape + self.mem_dim 28 | final_out = total_h_dim + self.mem_dim 29 | output_dim = self.hidden_dim // 2 30 | 31 | # each modality has one lstm cell 32 | self.lstm_l = nn.LSTMCell(text_dim, self.hidden_dim) 33 | self.lstm_a = nn.LSTMCell(audio_dim, self.hidden_dim) 34 | self.lstm_v = nn.LSTMCell(video_dim, self.hidden_dim) 35 | 36 | self.att1_fc1 = nn.Linear(attInShape, self.hidden_dim) 37 | self.att1_fc2 = nn.Linear(self.hidden_dim, attInShape) 38 | self.att1_dropout = nn.Dropout(dropout) 39 | 40 | self.att2_fc1 = nn.Linear(attInShape, self.hidden_dim) 41 | self.att2_fc2 = nn.Linear(self.hidden_dim, self.mem_dim) 42 | self.att2_dropout = nn.Dropout(dropout) 43 | 44 | self.gamma1_fc1 = nn.Linear(gammaInShape, self.hidden_dim) 45 | self.gamma1_fc2 = nn.Linear(self.hidden_dim, self.mem_dim) 46 | self.gamma1_dropout = nn.Dropout(dropout) 47 | 48 | self.gamma2_fc1 = nn.Linear(gammaInShape, self.hidden_dim) 49 | self.gamma2_fc2 = nn.Linear(self.hidden_dim, self.mem_dim) 50 | self.gamma2_dropout = nn.Dropout(dropout) 51 | 52 | self.out_fc1 = nn.Linear(final_out, self.hidden_dim) 53 | self.out_fc2 = nn.Linear(self.hidden_dim, output_dim) 54 | self.out_dropout = nn.Dropout(dropout) 55 | 56 | # output results 57 | self.fc_out_1 = nn.Linear(output_dim, output_dim1) 58 | self.fc_out_2 = 
nn.Linear(output_dim, output_dim2)
59 | 
60 | 
61 | # MFN needs aligned multimodal features
62 | def forward(self, batch):
63 | 
64 | '''
65 | simulating word-align network (for seq_len_T == seq_len_A == seq_len_V)
66 | audio_x: tensor of shape (batch, seqlen, audio_in)
67 | video_x: tensor of shape (batch, seqlen, video_in)
68 | text_x: tensor of shape (batch, seqlen, text_in)
69 | '''
70 | assert batch['audios'].size()[1] == batch['videos'].size()[1]
71 | assert batch['audios'].size()[1] == batch['texts'].size()[1]
72 | 
73 | text_x = batch['texts'].permute(1,0,2) # [seqlen, batch, dim]
74 | audio_x = batch['audios'].permute(1,0,2) # [seqlen, batch, dim]
75 | video_x = batch['videos'].permute(1,0,2) # [seqlen, batch, dim]
76 | 
77 | # x is t x n x d
78 | n = text_x.size()[1] # n = batch
79 | t = text_x.size()[0] # t = seqlen
80 | self.h_l = torch.zeros(n, self.hidden_dim).cuda()
81 | self.h_a = torch.zeros(n, self.hidden_dim).cuda()
82 | self.h_v = torch.zeros(n, self.hidden_dim).cuda()
83 | self.c_l = torch.zeros(n, self.hidden_dim).cuda()
84 | self.c_a = torch.zeros(n, self.hidden_dim).cuda()
85 | self.c_v = torch.zeros(n, self.hidden_dim).cuda()
86 | self.mem = torch.zeros(n, self.mem_dim).cuda()
87 | all_h_ls = []
88 | all_h_as = []
89 | all_h_vs = []
90 | all_c_ls = []
91 | all_c_as = []
92 | all_c_vs = []
93 | all_mems = []
94 | for i in range(t): # process each LSTM time step separately
95 | 
96 | # prev time step [here c refers to the LSTM cell state]
97 | prev_c_l = self.c_l
98 | prev_c_a = self.c_a
99 | prev_c_v = self.c_v
100 | 
101 | # curr time step
102 | new_h_l, new_c_l = self.lstm_l(text_x[i], (self.h_l, self.c_l))
103 | new_h_a, new_c_a = self.lstm_a(audio_x[i], (self.h_a, self.c_a))
104 | new_h_v, new_c_v = self.lstm_v(video_x[i], (self.h_v, self.c_v))
105 | 
106 | # concatenate and attention
107 | prev_cs = torch.cat([prev_c_l,prev_c_a,prev_c_v], dim=1)
108 | new_cs = torch.cat([new_c_l, new_c_a, new_c_v], dim=1)
109 | cStar = torch.cat([prev_cs, new_cs], dim=1)
110 | attention = F.softmax(self.att1_fc2(self.att1_dropout(F.relu(self.att1_fc1(cStar)))),dim=1)
111 | attended = attention * cStar
112 | cHat = torch.tanh(self.att2_fc2(self.att2_dropout(F.relu(self.att2_fc1(attended)))))
113 | both = torch.cat([attended, self.mem], dim=1)
114 | gamma1 = torch.sigmoid(self.gamma1_fc2(self.gamma1_dropout(F.relu(self.gamma1_fc1(both)))))
115 | gamma2 = torch.sigmoid(self.gamma2_fc2(self.gamma2_dropout(F.relu(self.gamma2_fc1(both)))))
116 | self.mem = gamma1*self.mem + gamma2*cHat
117 | all_mems.append(self.mem)
118 | 
119 | # update (hidden, cell) in lstm
120 | self.h_l, self.c_l = new_h_l, new_c_l
121 | self.h_a, self.c_a = new_h_a, new_c_a
122 | self.h_v, self.c_v = new_h_v, new_c_v
123 | 
124 | all_h_ls.append(self.h_l)
125 | all_h_as.append(self.h_a)
126 | all_h_vs.append(self.h_v)
127 | all_c_ls.append(self.c_l)
128 | all_c_as.append(self.c_a)
129 | all_c_vs.append(self.c_v)
130 | 
131 | # last hidden layer last_hs is n x h [the result of the step-by-step interaction above]
132 | last_h_l = all_h_ls[-1]
133 | last_h_a = all_h_as[-1]
134 | last_h_v = all_h_vs[-1]
135 | last_mem = all_mems[-1]
136 | last_hs = torch.cat([last_h_l, last_h_a, last_h_v, last_mem], dim=1)
137 | features = self.out_fc2(self.out_dropout(F.relu(self.out_fc1(last_hs))))
138 | self.last_hs = last_hs # for outside loading
139 | 
140 | emos_out = self.fc_out_1(features)
141 | vals_out = self.fc_out_2(features)
142 | interloss = torch.tensor(0).cuda()
143 | 
144 | return features, emos_out, vals_out, interloss
145 | 
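# Usage sketch (illustrative; the `args` namespace is hypothetical and must provide the
# dimensions/hyper-parameters read in __init__, and the three modalities must already be
# aligned to a common sequence length, e.g. via align_to_text() in toolkit/utils/read_data.py):
#
#   model = MFN(args).cuda()
#   batch = {'audios': torch.randn(8, 20, args.audio_dim).cuda(),
#            'texts':  torch.randn(8, 20, args.text_dim).cuda(),
#            'videos': torch.randn(8, 20, args.video_dim).cuda()}
#   features, emos_out, vals_out, interloss = model(batch)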
-------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/data/affecnet.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pickle 3 | import numpy as np 4 | import torch 5 | import math 6 | from torch.utils.data import Dataset 7 | from skimage import io 8 | 9 | class AffectNet(Dataset): 10 | _expressions = {0: 'neutral', 1:'happy', 2:'sad', 3:'surprise', 4:'fear', 5:'disgust', 6:'anger', 7:'contempt', 8:'none'} 11 | _expressions_indices = {8: [0, 1, 2, 3, 4, 5, 6, 7], 12 | 5: [0, 1, 2, 3, 6]} 13 | 14 | def __init__(self, root_path, subset='test', 15 | transform_image_shape=None, transform_image=None, 16 | n_expression=5, verbose=1, cleaned_set=True): 17 | self.root_path = Path(root_path).expanduser() 18 | self.subset = subset 19 | self.image_path = self.root_path.joinpath(subset) 20 | self.transform_image_shape = transform_image_shape 21 | self.transform_image = transform_image 22 | self.verbose = verbose 23 | 24 | #if cleaned_set and (subset not in ['test', 'val']): 25 | # raise ValueError('cleaned_set can only be set to True for the val or test set, train has not been cleaned') 26 | self.cleaned_set = cleaned_set 27 | 28 | if n_expression not in [5, 8]: 29 | raise ValueError(f'n_expression should be either 5 or 8, but got n_expression={n_expression}') 30 | self.n_expression = n_expression 31 | 32 | self.pickle_path = self.root_path.joinpath(f'{subset}_fullpath.pkl') 33 | with open(self.pickle_path, 'br') as f: 34 | data = pickle.load(f) 35 | self.data = data 36 | 37 | # the keys are the image names (name.ext) 38 | self.keys = [] 39 | self.skipped = {'other':[], 'pt_pt_error':[], 'expression':[], 'cleaned':[]} 40 | # List of each expression to generate weights 41 | expressions = [] 42 | for key, value in data.items(): 43 | if key == 'folder': 44 | continue 45 | if (int(value['expression']) not in self._expressions_indices[self.n_expression]): 46 | self.skipped['expression'].append(key) 47 | continue 48 | if self.cleaned_set and (not value['expression_correct']): 49 | self.skipped['cleaned'].append(key) 50 | continue 51 | 52 | expression = int(value['expression']) 53 | if self.cleaned_set: 54 | #Automatic cleaning : expression has to match the valence and arousal values 55 | valence = float(value['valence']) 56 | arousal = float(value['arousal']) 57 | intensity = math.sqrt(valence**2+arousal**2) 58 | 59 | if expression == 0 and intensity>=0.2: 60 | self.skipped['other'].append(key) 61 | continue 62 | elif expression == 1 and (valence<=0 or intensity<=0.2): 63 | self.skipped['other'].append(key) 64 | continue 65 | elif expression == 2 and (valence>=0 or intensity<=0.2): 66 | self.skipped['other'].append(key) 67 | continue 68 | elif expression == 3 and (arousal<=0 or intensity<=0.2): 69 | self.skipped['other'].append(key) 70 | continue 71 | elif expression == 4 and (not(arousal>=0 and valence<=0) or intensity<=0.2): 72 | self.skipped['other'].append(key) 73 | continue 74 | elif expression == 5 and (valence>=0 or intensity<=0.3): 75 | self.skipped['other'].append(key) 76 | continue 77 | elif expression == 6 and (arousal<=0 or intensity<=0.2): 78 | self.skipped['other'].append(key) 79 | continue 80 | elif expression == 7 and (valence>=0 or intensity<=0.2): 81 | self.skipped['other'].append(key) 82 | continue 83 | 84 | if self.n_expression == 5 and expression == 6: 85 | expression = 4 86 | expressions.append(expression) 87 | self.keys.append(key) 88 | 89 
| expressions = np.array(expressions) 90 | self.sample_per_class = {label:np.sum(expressions == label) for label in np.unique(expressions)} 91 | self.expression_weights = np.array([1./self.sample_per_class[e] for e in expressions]) 92 | self.average_per_class = int(np.mean(list(self.sample_per_class.values()))) 93 | 94 | if self.verbose: 95 | skipped = sum([len(self.skipped[key]) for key in self.skipped]) 96 | msg = f' -- {len(self.keys)} images, skipped {len(self.skipped)} images ({len(self.skipped["pt_pt_error"])} with large errors).' 97 | print(msg) 98 | print(f'Samples per class : {self.sample_per_class}') 99 | 100 | def __len__(self): 101 | return len(self.keys) 102 | 103 | def __getitem__(self, index): 104 | key = self.keys[index] 105 | sample_data = self.data[key] 106 | 107 | image_file = self.image_path.joinpath(key).as_posix() 108 | 109 | valence = torch.tensor([float(sample_data['valence'])], dtype=torch.float32) 110 | arousal = torch.tensor([float(sample_data['arousal'])], dtype=torch.float32) 111 | expression = int(sample_data['expression']) 112 | 113 | if self.n_expression == 5 and expression == 6: 114 | expression = 4 115 | 116 | landmarks = sample_data['landmarks_fan'] 117 | 118 | if isinstance(landmarks, list): 119 | landmarks = np.array(landmarks) 120 | image = io.imread(image_file) 121 | 122 | if self.transform_image_shape is not None: 123 | bounding_box = [landmarks.min(axis=0)[0], landmarks.min(axis=0)[1], 124 | landmarks.max(axis=0)[0], landmarks.max(axis=0)[1]] 125 | #image, landmarks = self.transform_image_shape(image, shape=landmarks) 126 | image, landmarks = self.transform_image_shape(image, bb=bounding_box) 127 | # Fix for PyTorch currently not supporting negative stric 128 | image = np.ascontiguousarray(image) 129 | 130 | if self.transform_image is not None: 131 | image = self.transform_image(image) 132 | 133 | return dict(valence=valence, arousal=arousal, expression=expression, image=image, au=[]) 134 | 135 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ############ For LINUX ############## 4 | DATA_DIR = { 5 | 'MER2023': '/share/home/lianzheng/chinese-mer-2023/dataset/mer2023-dataset-process', 6 | 'IEMOCAPFour': '/mnt/real_sda/sunhaoqin_space/code/MERTools-master/MERBench/dataset/iemocap-process', 7 | 'IEMOCAPSix': '/mnt/real_sda/sunhaoqin_space/code/MERTools-master/MERBench/dataset/iemocap-process', 8 | 'CMUMOSI': '/share/home/lianzheng/chinese-mer-2023/dataset/cmumosi-process', 9 | 'CMUMOSEI': '/share/home/lianzheng/chinese-mer-2023/dataset/cmumosei-process', 10 | 'SIMS': '/share/home/lianzheng/chinese-mer-2023/dataset/sims-process', 11 | 'MELD': '/share/home/lianzheng/chinese-mer-2023/dataset/meld-process', 12 | 'SIMSv2': '/share/home/lianzheng/chinese-mer-2023/dataset/simsv2-process', 13 | } 14 | PATH_TO_RAW_AUDIO = { 15 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'audio'), 16 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'subaudio'), 17 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'subaudio'), 18 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'subaudio'), 19 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'subaudio'), 20 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'audio'), 21 | 'MELD': os.path.join(DATA_DIR['MELD'], 'subaudio'), 22 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'audio'), 23 | } 24 | PATH_TO_RAW_VIDEO = { 25 | 'MER2023': 
os.path.join(DATA_DIR['MER2023'], 'video'), 26 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'subvideo-tgt'), 27 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'subvideo-tgt'), 28 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'subvideo'), 29 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'subvideo'), 30 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'video'), 31 | 'MELD': os.path.join(DATA_DIR['MELD'], 'subvideo'), 32 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'video'), 33 | } 34 | PATH_TO_RAW_FACE = { 35 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'openface_face'), 36 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'openface_face'), 37 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'openface_face'), 38 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'openface_face'), 39 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'openface_face'), 40 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'openface_face'), 41 | 'MELD': os.path.join(DATA_DIR['MELD'], 'openface_face'), 42 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'openface_face'), 43 | } 44 | PATH_TO_TRANSCRIPTIONS = { 45 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'transcription-engchi-polish.csv'), 46 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'transcription-engchi-polish.csv'), 47 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'transcription-engchi-polish.csv'), 48 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'transcription-engchi-polish.csv'), 49 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'transcription-engchi-polish.csv'), 50 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'transcription-engchi-polish.csv'), 51 | 'MELD': os.path.join(DATA_DIR['MELD'], 'transcription-engchi-polish.csv'), 52 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'transcription-engchi-polish.csv'), 53 | } 54 | PATH_TO_FEATURES = { 55 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'features'), 56 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'features'), 57 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'features'), 58 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'features'), 59 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'features'), 60 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'features'), 61 | 'MELD': os.path.join(DATA_DIR['MELD'], 'features'), 62 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'features'), 63 | } 64 | PATH_TO_LABEL = { 65 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'label-6way.npz'), 66 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'label_4way.npz'), 67 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'label_6way.npz'), 68 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'label.npz'), 69 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'label.npz'), 70 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'label.npz'), 71 | 'MELD': os.path.join(DATA_DIR['MELD'], 'label.npz'), 72 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'label.npz'), 73 | } 74 | 75 | # pre-trained models, including supervised and unsupervised 76 | PATH_TO_PRETRAINED_MODELS = './tools' 77 | PATH_TO_OPENSMILE = './tools/opensmile-2.3.0/' 78 | PATH_TO_FFMPEG = '/mnt/real_sda/sunhaoqin_space/code/ffmpeg-4.4.1-i686-static/ffmpeg' 79 | 80 | # dir 81 | SAVED_ROOT = os.path.join('./saved') 82 | MODEL_DIR = os.path.join(SAVED_ROOT, 'model') 83 | LOG_DIR = os.path.join(SAVED_ROOT, 'log') 84 | PREDICTION_DIR = os.path.join(SAVED_ROOT, 'prediction') 85 | FUSION_DIR = os.path.join(SAVED_ROOT, 'fusion') 86 | SUBMISSION_DIR = os.path.join(SAVED_ROOT, 'submission') 87 | 88 | 89 | ############ For Windows [OpenFace to extract 
face] ##############
90 | DATA_DIR_Win = {
91 | 'CMUMOSI': 'E:\\Dataset\\CMU-MOSI\\Raw',
92 | 'CMUMOSEI': 'E:\\Dataset\\CMU-MOSEI',
93 | 'MER2023': 'H:\\desktop\\Multimedia-Transformer\\chinese-mer-2023\\mer2023-dataset-process',
94 | 'IEMOCAP': 'E:\\Dataset\\iemocap-process',
95 | 'MELD': 'E:\\Dataset\\meld-process',
96 | 'SIMS': 'F:\\CH-SIMS-process',
97 | 'SIMSv2': 'E:\\Dataset\\simsv2-process',
98 | }
99 | 
100 | PATH_TO_RAW_FACE_Win = {
101 | 'CMUMOSI': os.path.join(DATA_DIR_Win['CMUMOSI'], 'Video\\Segmented'),
102 | 'CMUMOSEI': os.path.join(DATA_DIR_Win['CMUMOSEI'], 'video'),
103 | 'MER2023': os.path.join(DATA_DIR_Win['MER2023'], 'video'),
104 | 'SIMS': os.path.join(DATA_DIR_Win['SIMS'], 'video'),
105 | 'IEMOCAP': os.path.join(DATA_DIR_Win['IEMOCAP'], 'subvideo-tgt'),
106 | 'MELD': os.path.join(DATA_DIR_Win['MELD'], 'subvideo'),
107 | 'SIMSv2': os.path.join(DATA_DIR_Win['SIMSv2'], 'video'),
108 | }
109 | 
110 | PATH_TO_FEATURES_Win = {
111 | 'CMUMOSI': os.path.join(DATA_DIR_Win['CMUMOSI'], 'features'),
112 | 'CMUMOSEI': os.path.join(DATA_DIR_Win['CMUMOSEI'], 'features'),
113 | 'MER2023': os.path.join(DATA_DIR_Win['MER2023'], 'features'),
114 | 'SIMS': os.path.join(DATA_DIR_Win['SIMS'], 'features'),
115 | 'IEMOCAP': os.path.join(DATA_DIR_Win['IEMOCAP'], 'features'),
116 | 'MELD': os.path.join(DATA_DIR_Win['MELD'], 'features'),
117 | 'SIMSv2': os.path.join(DATA_DIR_Win['SIMSv2'], 'features'),
118 | }
119 | 
120 | PATH_TO_OPENFACE_Win = "H:\\desktop\\Multimedia-Transformer\\MERBench-master\\tools\\openface_win_x64"
121 | --------------------------------------------------------------------------------
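config.py keeps parallel path tables for Linux and for the Windows OpenFace pipeline, all keyed by dataset name, so downstream code can resolve every resource for a dataset from a single key. A minimal sketch (assuming config.py is importable from the working directory; the key must be one of the datasets defined above):

    import config
    dataset = 'MER2023'
    label_path   = config.PATH_TO_LABEL[dataset]
    feature_root = config.PATH_TO_FEATURES[dataset]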