├── EmotionTalk ├── feature_extraction │ ├── visual │ │ ├── pytorch-benchmarks │ │ │ ├── fer2013 │ │ │ │ ├── __init__.py │ │ │ │ └── fer.py │ │ │ ├── imagenet │ │ │ │ ├── __init__.py │ │ │ │ ├── imagenet.py.bak │ │ │ │ └── evaluation.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ └── benchmark_helpers.py │ │ │ ├── .gitignore │ │ │ ├── LICENSE.md │ │ │ ├── README.md │ │ │ ├── run_fer_benchmarks.py │ │ │ └── model │ │ │ │ ├── vgg_m_face_bn_fer_dag.py │ │ │ │ ├── alexnet_face_fer_bn_dag.py │ │ │ │ └── vgg_vd_face_fer_dag.py │ │ ├── emonet │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ └── affecnet.py │ │ │ ├── models │ │ │ │ └── __init__.py │ │ │ └── metrics.py │ │ ├── manet │ │ │ ├── log │ │ │ │ ├── SFEW.png │ │ │ │ ├── CAER-S.png │ │ │ │ ├── FED-RO.png │ │ │ │ ├── RAF-DB.png │ │ │ │ ├── AffectNet7.png │ │ │ │ ├── AffectNet8.png │ │ │ │ ├── [02-08]-[16-22]-cnn.png │ │ │ │ ├── [02-08]-[19-12]-cnn.png │ │ │ │ ├── [02-08]-[21-19]-cnn.png │ │ │ │ ├── [02-08]-[22-55]-cnn.png │ │ │ │ ├── [02-12]-[19-11]-cnn.png │ │ │ │ ├── [02-12]-[22-21]-cnn.png │ │ │ │ └── [05-28]-[13-07]-cnn.png │ │ │ ├── model │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── manet.cpython-39.pyc │ │ │ │ │ └── attention.cpython-39.pyc │ │ │ │ └── attention.py │ │ │ ├── reorganize_rafdb.py │ │ │ ├── LICENSE │ │ │ └── README.md │ │ ├── dataset.py │ │ ├── util.py │ │ ├── extract_imagenet_embedding.py │ │ ├── extract_emonet_embedding.py │ │ └── extract_manet_embedding.py │ └── audio │ │ ├── vggish │ │ ├── vggish_pca_params.npz │ │ ├── vggish_params.py │ │ ├── vggish_postprocess.py │ │ ├── vggish_smoke_test.py │ │ ├── vggish_input.py │ │ └── vggish_slim.py │ │ ├── extract_vggish_embedding.py │ │ └── extract_wav2vec_embedding.py ├── dataset │ └── mm-process │ │ ├── mm_label.npz │ │ ├── mm_label4.npz │ │ ├── txt_label.npz │ │ ├── audio_label.npz │ │ ├── txt_label4.npz │ │ ├── video_label.npz │ │ ├── audio_label4.npz │ │ └── video_label4.npz ├── toolkit │ ├── models │ │ ├── __pycache__ │ │ │ ├── lmf.cpython-38.pyc │ │ │ ├── lmf.cpython-39.pyc │ │ │ ├── mctn.cpython-38.pyc │ │ │ ├── mctn.cpython-39.pyc │ │ │ ├── mfm.cpython-38.pyc │ │ │ ├── mfm.cpython-39.pyc │ │ │ ├── mfn.cpython-38.pyc │ │ │ ├── mfn.cpython-39.pyc │ │ │ ├── misa.cpython-38.pyc │ │ │ ├── misa.cpython-39.pyc │ │ │ ├── mmim.cpython-38.pyc │ │ │ ├── mmim.cpython-39.pyc │ │ │ ├── mult.cpython-38.pyc │ │ │ ├── mult.cpython-39.pyc │ │ │ ├── tfn.cpython-38.pyc │ │ │ ├── tfn.cpython-39.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── attention.cpython-38.pyc │ │ │ ├── attention.cpython-39.pyc │ │ │ ├── graph_mfn.cpython-38.pyc │ │ │ └── graph_mfn.cpython-39.pyc │ │ ├── modules │ │ │ ├── __pycache__ │ │ │ │ ├── encoder.cpython-38.pyc │ │ │ │ └── encoder.cpython-39.pyc │ │ │ ├── transformers_encoder │ │ │ │ ├── __pycache__ │ │ │ │ │ ├── transformer.cpython-38.pyc │ │ │ │ │ ├── transformer.cpython-39.pyc │ │ │ │ │ ├── position_embedding.cpython-38.pyc │ │ │ │ │ ├── position_embedding.cpython-39.pyc │ │ │ │ │ ├── multihead_attention.cpython-38.pyc │ │ │ │ │ └── multihead_attention.cpython-39.pyc │ │ │ │ └── position_embedding.py │ │ │ └── encoder.py │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── tfn.py │ │ ├── lmf.py │ │ └── mfn.py │ ├── utils │ │ ├── __pycache__ │ │ │ ├── loss.cpython-38.pyc │ │ │ ├── loss.cpython-39.pyc │ │ │ ├── metric.cpython-38.pyc │ │ │ ├── metric.cpython-39.pyc │ │ │ ├── chatgpt.cpython-38.pyc │ │ │ ├── chatgpt.cpython-39.pyc │ │ │ ├── functions.cpython-38.pyc │ │ │ ├── functions.cpython-39.pyc │ │ │ ├── 
read_data.cpython-38.pyc │ │ │ ├── read_data.cpython-39.pyc │ │ │ ├── read_files.cpython-38.pyc │ │ │ └── read_files.cpython-39.pyc │ │ ├── loss.py │ │ ├── metric.py │ │ ├── chatgpt.py │ │ └── read_data.py │ ├── preprocess │ │ ├── __pycache__ │ │ │ ├── config.cpython-39.pyc │ │ │ └── globals.cpython-39.pyc │ │ ├── utils │ │ │ ├── __pycache__ │ │ │ │ ├── chatgpt.cpython-39.pyc │ │ │ │ ├── functions.cpython-39.pyc │ │ │ │ └── read_files.cpython-39.pyc │ │ │ ├── loss.py │ │ │ ├── metric.py │ │ │ ├── chatgpt.py │ │ │ └── read_data.py │ │ ├── mer2023.py │ │ ├── simsv2.py │ │ ├── cmumosi.py │ │ ├── meld.py │ │ ├── sims.py │ │ └── config.py │ ├── data │ │ ├── __init__.py │ │ └── feat_data.py │ ├── dataloader │ │ ├── __init__.py │ │ └── mm.py │ └── model-tune.yaml └── config.py ├── environment.yml └── README.md /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/fer2013/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/imagenet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .affecnet import AffectNet 2 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .emonet import EmoNet 2 | 3 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | .nfs* 4 | scratch 5 | res_cache 6 | -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/mm_label.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/mm_label.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/mm_label4.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/mm_label4.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/txt_label.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/txt_label.npz 
-------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/audio_label.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/audio_label.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/txt_label4.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/txt_label4.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/video_label.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/video_label.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/audio_label4.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/audio_label4.npz -------------------------------------------------------------------------------- /EmotionTalk/dataset/mm-process/video_label4.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/dataset/mm-process/video_label4.npz -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/SFEW.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/SFEW.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/CAER-S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/CAER-S.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/FED-RO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/FED-RO.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/RAF-DB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/RAF-DB.png -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/lmf.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/lmf.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/lmf.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/lmf.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mctn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mctn.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mctn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mctn.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mfm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mfm.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mfm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mfm.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mfn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mfn.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mfn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mfn.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/misa.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/misa.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/misa.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/misa.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mmim.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mmim.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mmim.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mmim.cpython-39.pyc 
-------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mult.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mult.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/mult.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/mult.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/tfn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/tfn.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/tfn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/tfn.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/loss.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/metric.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/metric.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/metric.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/metric.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/chatgpt.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/chatgpt.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/chatgpt.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/chatgpt.cpython-39.pyc -------------------------------------------------------------------------------- 
/EmotionTalk/feature_extraction/visual/manet/log/AffectNet7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/AffectNet7.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/AffectNet8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/AffectNet8.png -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/attention.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/attention.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/graph_mfn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/graph_mfn.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__pycache__/graph_mfn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/__pycache__/graph_mfn.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/__pycache__/config.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/__pycache__/config.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/functions.cpython-38.pyc -------------------------------------------------------------------------------- 
/EmotionTalk/toolkit/utils/__pycache__/functions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/functions.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/read_data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/read_data.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/read_data.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/read_data.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/read_files.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/read_files.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/__pycache__/read_files.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/utils/__pycache__/read_files.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_pca_params.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/audio/vggish/vggish_pca_params.npz -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/__pycache__/globals.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/__pycache__/globals.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/__pycache__/encoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/__pycache__/encoder.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/__pycache__/encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/__pycache__/encoder.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[16-22]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[16-22]-cnn.png -------------------------------------------------------------------------------- 
/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[19-12]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[19-12]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[21-19]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[21-19]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[22-55]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-08]-[22-55]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-12]-[19-11]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-12]-[19-11]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[02-12]-[22-21]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[02-12]-[22-21]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/log/[05-28]-[13-07]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/log/[05-28]-[13-07]-cnn.png -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/__pycache__/chatgpt.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/utils/__pycache__/chatgpt.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/__pycache__/functions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/utils/__pycache__/functions.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/__pycache__/read_files.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/preprocess/utils/__pycache__/read_files.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/model/__pycache__/manet.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/model/__pycache__/manet.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/model/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/feature_extraction/visual/manet/model/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/position_embedding.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/position_embedding.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/position_embedding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/position_embedding.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/multihead_attention.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/multihead_attention.cpython-38.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/multihead_attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NKU-HLT/EmotionTalk/HEAD/EmotionTalk/toolkit/models/modules/transformers_encoder/__pycache__/multihead_attention.cpython-39.pyc -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # classification loss 5 | class CELoss(nn.Module): 6 | 7 | def __init__(self): 8 | super(CELoss, self).__init__() 9 | self.loss = nn.NLLLoss(reduction='sum') 10 | 11 | def forward(self, pred, target): 12 | 
pred = F.log_softmax(pred, 1) # [n_samples, n_classes] 13 | target = target.long() # [n_samples] 14 | loss = self.loss(pred, target) / len(pred) 15 | return loss 16 | 17 | # regression loss 18 | class MSELoss(nn.Module): 19 | 20 | def __init__(self): 21 | super(MSELoss, self).__init__() 22 | self.loss = nn.MSELoss(reduction='sum') 23 | 24 | def forward(self, pred, target): 25 | pred = pred.view(-1,1) 26 | target = target.view(-1,1) 27 | loss = self.loss(pred, target) / len(pred) 28 | return loss 29 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # classification loss 5 | class CELoss(nn.Module): 6 | 7 | def __init__(self): 8 | super(CELoss, self).__init__() 9 | self.loss = nn.NLLLoss(reduction='sum') 10 | 11 | def forward(self, pred, target): 12 | pred = F.log_softmax(pred, 1) # [n_samples, n_classes] 13 | target = target.long() # [n_samples] 14 | loss = self.loss(pred, target) / len(pred) 15 | return loss 16 | 17 | # regression loss 18 | class MSELoss(nn.Module): 19 | 20 | def __init__(self): 21 | super(MSELoss, self).__init__() 22 | self.loss = nn.MSELoss(reduction='sum') 23 | 24 | def forward(self, pred, target): 25 | pred = pred.view(-1,1) 26 | target = target.view(-1,1) 27 | loss = self.loss(pred, target) / len(pred) 28 | return loss 29 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Samuel Albanie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/reorganize_rafdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pandas as pd 4 | import shutil 5 | 6 | 7 | rafdb_path = '/data1/sunlicai/Affective Computing/Dataset/RAF-DB/basic' 8 | src_path = os.path.join(rafdb_path, 'Image/aligned') 9 | tgt_path = os.path.join(rafdb_path, 'Image/aligned_c') # split/class_id/img_file 10 | label_file = os.path.join(rafdb_path, 'EmoLabel/list_patition_label.txt') 11 | df = pd.read_csv(label_file, header=None, delimiter=' ') 12 | file_names, label_ids = df[0].values, df[1].values 13 | print(f'Number of images: {len(df)}.') 14 | name_to_label = dict(zip(file_names, label_ids)) 15 | img_files = glob.glob(os.path.join(src_path, '*.jpg')) 16 | 17 | for src_file in img_files: 18 | img_name = os.path.basename(src_file).replace('_aligned', '') 19 | label = name_to_label[img_name] 20 | split = img_name.split('_')[0] 21 | saved_path = os.path.join(tgt_path, split, str(label)) 22 | if not os.path.exists(saved_path): 23 | os.makedirs(saved_path) 24 | tgt_file = os.path.join(saved_path, img_name) 25 | shutil.copyfile(src_file, tgt_file) 26 | print(f'Copy "{src_file}" to "{tgt_file}".') -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Zengqun Zhao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/data/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | 3 | from .feat_data import Data_Feat 4 | 5 | # 目标:输入 (names, labels, data_type),得到所有特征与标签 6 | class get_datasets(Dataset): 7 | 8 | def __init__(self, args, names, labels): 9 | 10 | MODEL_DATASET_MAP = { 11 | 12 | # 解析特征 13 | 'attention': Data_Feat, 14 | 'lf_dnn': Data_Feat, 15 | 'lmf': Data_Feat, 16 | 'misa': Data_Feat, 17 | 'mmim': Data_Feat, 18 | 'tfn': Data_Feat, 19 | 'mfn': Data_Feat, 20 | 'graph_mfn': Data_Feat, 21 | 'ef_lstm': Data_Feat, 22 | 'mfm': Data_Feat, 23 | 'mctn': Data_Feat, 24 | 'mult': Data_Feat, 25 | 26 | } 27 | 28 | self.dataset_class = MODEL_DATASET_MAP[args.model] 29 | self.dataset = self.dataset_class(args, names, labels) 30 | 31 | def __len__(self): 32 | return self.dataset.__len__() 33 | 34 | def __getitem__(self, index): 35 | return self.dataset.__getitem__(index) 36 | 37 | def collater(self, instances): 38 | return self.dataset.collater(instances) 39 | 40 | def get_featdim(self): 41 | return self.dataset.get_featdim() -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | get_models: get models and load default configs; 3 | link: https://github.com/thuiar/MMSA-FET/tree/master 4 | """ 5 | import torch 6 | 7 | from .tfn import TFN 8 | from .lmf import LMF 9 | from .mfn import MFN 10 | from .mfm import MFM 11 | from .mult import MULT 12 | from .misa import MISA 13 | from .mctn import MCTN 14 | from .mmim import MMIM 15 | from .graph_mfn import Graph_MFN 16 | from .attention import Attention 17 | 18 | class get_models(torch.nn.Module): 19 | def __init__(self, args): 20 | super(get_models, self).__init__() 21 | # misa/mmim在有些参数配置下会存在梯度爆炸的风险 22 | # tfn 显存占比比较高 23 | 24 | MODEL_MAP = { 25 | 26 | # 特征压缩到句子级再处理,所以支持 utt/align/unalign 27 | 'attention': Attention, 28 | 'lmf': LMF, 29 | 'misa': MISA, 30 | 'mmim': MMIM, 31 | 'tfn': TFN, 32 | 33 | # 只支持align 34 | 'mfn': MFN, # slow 35 | 'graph_mfn': Graph_MFN, # slow 36 | 'mfm': MFM, # slow 37 | 'mctn': MCTN, # slow 38 | 39 | # 支持align/unalign 40 | 'mult': MULT, # slow 41 | 42 | } 43 | self.model = MODEL_MAP[args.model](args) 44 | 45 | def forward(self, batch): 46 | return self.model(batch) 47 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/README.md: -------------------------------------------------------------------------------- 1 | # MA-Net 2 | 3 | PyTorch implementation of the paper *“Learning Deep Global Multi-scale and Local Attention Features 4 | for Facial Expression Recognition in the Wild”*, This work is under submission. 5 | 6 | ## Requirements 7 | - Python $\geq$3.6 8 | - PyTorch $\geq$1.2 9 | - torchvision $\geq$0.4.0 10 | - numpy 11 | - matplotlib 12 | - datetime 13 | - shutil 14 | - time 15 | - argparse 16 | - os 17 | 18 | ## Training 19 | 20 | - Step 1: download basic emotions dataset of [RAF-DB](http://www.whdeng.cn/raf/model1.html), and make sure it have the structure like following: 21 | 22 | ``` 23 | ./RAF-DB/ 24 | train/ 25 | 0/ 26 | train_09748.jpg 27 | ... 28 | train_12271.jpg 29 | 1/ 30 | ... 31 | 6/ 32 | test/ 33 | 0/ 34 | ... 
35 | 6/ 36 | 37 | [Note] 0: Neutral; 1: Happiness; 2: Sadness; 3: Surprise; 4: Fear; 5: Disgust; 6: Anger 38 | ``` 39 | 40 | - Step 2: download pre-trained model from 41 | [Google Drive](https://drive.google.com/file/d/1tro_RCovLKNACt4MKYp3dmIvvxiOC2pi/view?usp=sharing), 42 | and put it into ***./checkpoint***. 43 | 44 | - Step 3: change the ***project_path*** and ***data_path*** in *main.py* to your path 45 | 46 | - Step 4: run ```python main.py ``` 47 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/README.md: -------------------------------------------------------------------------------- 1 | ### pytorch-benchmark 2 | 3 | Some scripts for validating models on common benchmarks. Assumes at least Python3 and PyTorch 4.0. 4 | 5 | 6 | ### Supported datasets: 7 | 8 | * **ImageNet** (this is essentially just a cut-down version of the [official example](https://github.com/pytorch/examples/tree/master/imagenet)) 9 | * **Fer2013** - A dataset of greyscale faces labelled with emotions. 10 | 11 | 12 | 13 | ### References 14 | 15 | **ImageNet**: [paper](https://arxiv.org/abs/1409.0575) 16 | 17 | ``` 18 | @article{ILSVRC15, 19 | Author = {Olga Russakovsky and Jia Deng and Hao Su and Jonathan Krause and Sanjeev Satheesh and Sean Ma and Zhiheng Huang and Andrej Karpathy and Aditya Khosla and Michael Bernstein and Alexander C. Berg and Li Fei-Fei}, 20 | Title = {{ImageNet Large Scale Visual Recognition Challenge}}, 21 | Year = {2015}, 22 | journal = {International Journal of Computer Vision (IJCV)}, 23 | doi = {10.1007/s11263-015-0816-y}, 24 | volume={115}, 25 | number={3}, 26 | pages={211-252} 27 | } 28 | ``` 29 | 30 | **FER2013**: [paper](https://arxiv.org/abs/1307.0414) 31 | 32 | ``` 33 | @inproceedings{goodfellow2013challenges, 34 | title={Challenges in representation learning: A report on three machine learning contests}, 35 | author={Goodfellow, Ian J and Erhan, Dumitru and Carrier, Pierre Luc and Courville, Aaron and Mirza, Mehdi and Hamner, Ben and Cukierski, Will and Tang, Yichuan and Thaler, David and Lee, Dong-Hyun and others}, 36 | booktitle={International Conference on Neural Information Processing}, 37 | pages={117--124}, 38 | year={2013}, 39 | organization={Springer} 40 | } 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: mertools 2 | channels: 3 | - pytorch 4 | - defaults 5 | - anaconda 6 | dependencies: 7 | - python=3.9 8 | - cudatoolkit 9 | - pip 10 | - pytorch=1.12.1 11 | - pytorch-mutex=1.0=cuda 12 | - torchaudio=0.12.1 13 | - torchvision=0.13.1 14 | 15 | - pip: 16 | - accelerate==0.16.0 17 | - aiohttp==3.8.4 18 | - aiosignal==1.3.1 19 | - async-timeout==4.0.2 20 | - attrs==22.2.0 21 | - bitsandbytes==0.37.0 22 | - cchardet==2.1.7 23 | - chardet==5.1.0 24 | - contourpy==1.0.7 25 | - cycler==0.11.0 26 | - filelock==3.9.0 27 | - fonttools==4.38.0 28 | - frozenlist==1.3.3 29 | - huggingface-hub==0.13.4 30 | - importlib-resources==5.12.0 31 | - kiwisolver==1.4.4 32 | - matplotlib==3.7.0 33 | - multidict==6.0.4 34 | - openai==0.27.0 35 | - packaging==23.0 36 | - psutil==5.9.4 37 | - pycocotools==2.0.6 38 | - pyparsing==3.0.9 39 | - python-dateutil==2.8.2 40 | - pyyaml==6.0 41 | - regex==2022.10.31 42 | - tokenizers==0.13.2 43 | - tqdm==4.64.1 44 | - transformers==4.28.0 45 | - timm==0.6.13 46 | - spacy==3.5.1 47 | - webdataset==0.2.48 48 | 
- scikit-learn==1.2.2 49 | - scipy==1.10.1 50 | - yarl==1.8.2 51 | - zipp==3.14.0 52 | - omegaconf==2.3.0 53 | - opencv-python==4.7.0.72 54 | - iopath==0.1.10 55 | - decord==0.6.0 56 | - tenacity==8.2.2 57 | - peft 58 | - pycocoevalcap 59 | - sentence-transformers 60 | - umap-learn 61 | - notebook 62 | - gradio==3.24.1 63 | - gradio-client==0.0.8 64 | - wandb 65 | - einops 66 | - SentencePiece 67 | - ftfy 68 | - thop 69 | - pytorchvideo==0.1.5 70 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | from .iemocap import IEMOCAP 2 | from .cmudata import CMUDATA 3 | from .mer2023 import MER2023 4 | from .sims import SIMS 5 | from .meld import MELD 6 | from .simsv2 import SIMSv2 7 | from .crossdim import CROSSDIM 8 | from .crossdis import CROSSDIS 9 | from .mm import mm 10 | from .MM import MM 11 | 12 | DIM_DATASET = ['CMUMOSI', 'CMUMOSEI', 'SIMS', 'SIMSv2','MM'] 13 | DIS_DATASET = ['IEMOCAPFour', 'IEMOCAPSix', 'MER2023', 'MELD','mm4','mm7'] 14 | 15 | # 输入数据库名称,得到 dataloaders 16 | class get_dataloaders: 17 | 18 | def __init__(self, args): 19 | 20 | if args.train_dataset is None: 21 | DATALOADER_MAP = { 22 | 23 | 'IEMOCAPFour': IEMOCAP, 24 | 'mm4': mm, 25 | 'mm7': mm, 26 | 'MM': MM, 27 | 'IEMOCAPSix': IEMOCAP, 28 | 'CMUMOSI': CMUDATA, 29 | 'CMUMOSEI': CMUDATA, 30 | 'MER2023': MER2023, 31 | 'SIMS': SIMS, 32 | 'SIMSv2': SIMSv2, 33 | 'MELD': MELD, 34 | } 35 | self.dataloader = DATALOADER_MAP[args.dataset](args) 36 | elif args.train_dataset in DIM_DATASET: 37 | assert args.test_dataset in DIM_DATASET 38 | self.dataloader = CROSSDIM(args) 39 | elif args.train_dataset in DIS_DATASET: 40 | assert args.test_dataset in DIS_DATASET 41 | self.dataloader = CROSSDIS(args) 42 | 43 | def get_loaders(self): 44 | return self.dataloader.get_loaders() 45 | 46 | def calculate_results(self, emo_probs=[], emo_labels=[], val_preds=[], val_labels=[]): 47 | return self.dataloader.calculate_results(emo_probs, emo_labels, val_preds, val_labels) 48 | 49 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/model-tune.yaml: -------------------------------------------------------------------------------- 1 | tfn: 2 | hidden_dim: [64, 128] 3 | dropout: [0.2, 0.3, 0.4, 0.5] 4 | grad_clip: [-1.0] 5 | lr: [1e-3, 1e-4] 6 | 7 | lmf: 8 | hidden_dim: [32, 64, 128, 256] 9 | dropout: [0.2, 0.3, 0.4, 0.5] 10 | rank: [3, 4, 5, 6] 11 | grad_clip: [-1.0] 12 | lr: [1e-3, 1e-4] 13 | 14 | mmim: 15 | hidden_dim: [64, 128, 256] 16 | dropout: [0.0, 0.1, 0.2, 0.3] 17 | cpc_layers: [1, 2, 3, 4] 18 | alpha: [0.0, 0.1, 0.2] 19 | beta: [0.0, 0.1, 0.2] 20 | grad_clip: [0.6, 0.8, 1.0] 21 | lr: [1e-3, 1e-4] 22 | 23 | misa: 24 | dropout: [0.2, 0.3, 0.4, 0.5] 25 | hidden_dim: [64, 128, 256] 26 | sim_weight: [0.0, 0.1, 0.2] 27 | diff_weight: [0.0, 0.1, 0.2] 28 | recon_weight: [0.0, 0.1, 0.2] 29 | grad_clip: [-1.0, 0.8, 1.0] 30 | lr: [1e-4, 1e-5] 31 | 32 | mfn: 33 | hidden_dim: [128, 256] 34 | mem_dim: [128] 35 | dropout: [0.0, 0.3, 0.5, 0.7] 36 | window_dim: [2] 37 | grad_clip: [-1.0] 38 | lr: [1e-3, 1e-4] 39 | # lr: [1e-3, 1e-4] 40 | 41 | graph_mfn: 42 | hidden_dim: [128, 256] 43 | mem_dim: [128] 44 | dropout: [0.0, 0.3, 0.5, 0.7] 45 | grad_clip: [-1.0] 46 | lr: [1e-3, 1e-4] 47 | # lr: [1e-3, 1e-4] 48 | 49 | mfm: 50 | hidden_dim: [128, 256] 51 | mem_dim: [128] 52 | dropout: [0.0, 0.3, 0.5, 0.7] 53 | window_dim: [2] 54 | lda_xl: [0.01, 0.1, 0.5, 
1.0] 55 | lda_xa: [0.01, 0.1, 0.5, 1.0] 56 | lda_xv: [0.01, 0.1, 0.5, 1.0] 57 | lda_mmd: [10, 50, 100] 58 | grad_clip: [-1.0] 59 | lr: [1e-3, 1e-4] 60 | 61 | mult: 62 | layers: [2, 4, 6] 63 | dropout: [0.0, 0.1, 0.2, 0.3] 64 | num_heads: [8] 65 | hidden_dim: [64, 128, 256] 66 | conv1d_kernel_size: [1, 3] 67 | grad_clip: [0.6, 0.8, 1.0] 68 | lr: [1e-3, 1e-4] 69 | # lr: [1e-3, 1e-4] 70 | 71 | mctn: 72 | hidden_dim: [64, 128, 256] 73 | dropout: [0.0, 0.1, 0.2, 0.3] 74 | teacher_forcing_ratio: [0.3, 0.5] 75 | loss_weight: [0.1, 0.3, 0.5, 0.8, 1.0] 76 | grad_clip: [0.6, 0.8, 1.0] 77 | lr: [1e-3, 1e-4] 78 | 79 | attention: 80 | hidden_dim: [64, 128, 256] 81 | dropout: [0.2, 0.3, 0.4, 0.5] 82 | grad_clip: [-1.0] 83 | lr: [1e-3, 1e-4] 84 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ACC(ground_truth, predictions): 5 | """Evaluates the mean accuracy 6 | """ 7 | return np.mean(ground_truth.astype(int) == predictions.astype(int)) 8 | 9 | def RMSE(ground_truth, predictions): 10 | """ 11 | Evaluates the RMSE between estimate and ground truth. 12 | """ 13 | return np.sqrt(np.mean((ground_truth-predictions)**2)) 14 | 15 | 16 | def SAGR(ground_truth, predictions): 17 | """ 18 | Evaluates the SAGR between estimate and ground truth. 19 | """ 20 | return np.mean(np.sign(ground_truth) == np.sign(predictions)) 21 | 22 | 23 | def PCC(ground_truth, predictions): 24 | """ 25 | Evaluates the Pearson Correlation Coefficient. 26 | Inputs are numpy arrays. 27 | Corr = Cov(GT, Est)/(std(GT)std(Est)) 28 | """ 29 | return np.corrcoef(ground_truth, predictions)[0,1] 30 | 31 | 32 | def CCC(ground_truth, predictions): 33 | """ 34 | Evaluates the Concordance Correlation Coefficient. 35 | Inputs are numpy arrays. 36 | """ 37 | mean_pred = np.mean(predictions) 38 | mean_gt = np.mean(ground_truth) 39 | 40 | std_pred= np.std(predictions) 41 | std_gt = np.std(ground_truth) 42 | 43 | pearson = PCC(ground_truth, predictions) 44 | return 2.0*pearson*std_pred*std_gt/(std_pred**2+std_gt**2+(mean_pred-mean_gt)**2) 45 | 46 | def ICC(labels, predictions): 47 | """Evaluates the ICC(3, 1) 48 | """ 49 | naus = predictions.shape[1] 50 | icc = np.zeros(naus) 51 | 52 | n = predictions.shape[0] 53 | 54 | for i in range(0,naus): 55 | a = np.asmatrix(labels[:,i]).transpose() 56 | b = np.asmatrix(predictions[:,i]).transpose() 57 | dat = np.hstack((a, b)) 58 | mpt = np.mean(dat, axis=1) 59 | mpr = np.mean(dat, axis=0) 60 | tm = np.mean(mpt, axis=0) 61 | BSS = np.sum(np.square(mpt-tm))*2 62 | BMS = BSS/(n-1) 63 | RSS = np.sum(np.square(mpr-tm))*n 64 | tmp = np.square(dat - np.hstack((mpt,mpt))) 65 | WSS = np.sum(np.sum(tmp, axis=1)) 66 | ESS = WSS - RSS 67 | EMS = ESS/(n-1) 68 | icc[i] = (BMS - EMS)/(BMS + EMS) 69 | 70 | return icc 71 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Global parameters for the VGGish model. 17 | 18 | See vggish_slim.py for more information. 19 | """ 20 | 21 | # Architectural constants. 22 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. 23 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. 24 | EMBEDDING_SIZE = 128 # Size of embedding layer. 25 | 26 | # Hyperparameters used in feature and example generation. 27 | SAMPLE_RATE = 16000 28 | STFT_WINDOW_LENGTH_SECONDS = 0.025 29 | STFT_HOP_LENGTH_SECONDS = 0.010 30 | NUM_MEL_BINS = NUM_BANDS 31 | MEL_MIN_HZ = 125 32 | MEL_MAX_HZ = 7500 33 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 34 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 35 | # Note: original value for EXAMPLE_HOP_SECONDS is 0.96, i.e. no overlapping between adjacent examples 36 | # EXAMPLE_HOP_SECONDS = 0.25 # with zero overlap. 37 | 38 | # Parameters used for embedding postprocessing. 39 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' 40 | PCA_MEANS_NAME = 'pca_means' 41 | QUANTIZE_MIN_VAL = -2.0 42 | QUANTIZE_MAX_VAL = +2.0 43 | 44 | # Hyperparameters used in training. 45 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. 46 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. 47 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. 48 | 49 | # Names of ops, tensors, and features. 50 | INPUT_OP_NAME = 'vggish/input_features' 51 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' 52 | OUTPUT_OP_NAME = 'vggish/embedding' 53 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' 54 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | 3 | 4 | # EmotionTalk 5 | 6 | > An Interactive Chinese Multimodal Emotion Dataset With Rich Annotations. 7 | 8 | [![Python](https://img.shields.io/badge/python-3.9+-blue.svg)]() 9 | [![Project Page](https://img.shields.io/badge/Project-Website-blue.svg)](https://github.com/NKU-HLT/EmotionTalk) 10 | 11 | --- 12 | 13 | 14 | ## 📖 Overview 15 | 16 | We propose EmotionTalk, an interactive Chinese multimodal emotion dataset with rich annotations. This dataset provides multimodal information from 19 actors participating in dyadic conversational settings, incorporating acoustic, visual, and textual modalities. It includes 23.6 hours of speech (19,250 utterances), annotations for 7 utterance-level emotion categories (happy, surprise, sad, disgust, anger, fear, and neutral), 5-dimensional sentiment labels (negative, weakly negative, neutral, weakly positive, and positive) and 4-dimensional speech captions (speaker, speaking style, emotion and overall). The dataset is well-suited for research on unimodal and multimodal emotion recognition, missing modality challenges, and speech captioning tasks. 
To our knowledge, it represents the first high-quality and versatile Chinese dialogue multimodal emotion dataset, which is a valuable contribution to research on cross-cultural emotion analysis and recognition. Additionally, we conduct experiments on EmotionTalk to demonstrate the effectiveness and quality of the dataset. It will be open-source and freely available for all academic purposes. The dataset and codes will be made available at [EmotionTalk](https://github.com/NKU-HLT/EmotionTalk). 17 | 18 | ## 🚀 Getting Started 19 | ### Environment 20 | 21 | ```shell 22 | conda env create -f environment.yml 23 | ``` 24 | 25 | ## 🤗 Dataset Download 26 | 27 | You can access the Emotiontalk dataset on HuggingFace Datasets: 28 | 29 | [https://huggingface.co/datasets/BAAI/Emotiontalk](https://huggingface.co/datasets/BAAI/Emotiontalk) 30 | 31 | ### Tool 32 | openface_win_x64 (https://drive.google.com/file/d/1-O8epcTDYCrRUU_mtXgjrS3OWA4HTp0-/view?usp=share_link -> tools/openface_win_x64) 33 | 34 | You need to follow the steps to run in EmotionTalk/run.sh. 35 | 36 | Please refer to run.sh for more details. 37 | 38 | ## 🙏 Acknowledgements 39 | 40 | This project builds upon prior work from the [zeroQiaoba/MERTools](https://github.com/zeroQiaoba/MERTools) repository. We thank them for their contributions! 41 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ref paper: Tensor Fusion Network for Multimodal Sentiment Analysis 3 | Ref url: https://github.com/Justin1904/TensorFusionNetworks 4 | """ 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | ## 这两个模块都是用在 TFN 中的 (video|audio) 9 | class MLPEncoder(nn.Module): 10 | ''' 11 | The subnetwork that is used in TFN for video and audio in the pre-fusion stage 12 | ''' 13 | 14 | def __init__(self, in_size, hidden_size, dropout): 15 | ''' 16 | Args: 17 | in_size: input dimension 18 | hidden_size: hidden layer dimension 19 | dropout: dropout probability 20 | Output: 21 | (return value in forward) a tensor of shape (batch_size, hidden_size) 22 | ''' 23 | super(MLPEncoder, self).__init__() 24 | # self.norm = nn.BatchNorm1d(in_size) 25 | self.drop = nn.Dropout(p=dropout) 26 | self.linear_1 = nn.Linear(in_size, hidden_size) 27 | self.linear_2 = nn.Linear(hidden_size, hidden_size) 28 | self.linear_3 = nn.Linear(hidden_size, hidden_size) 29 | 30 | def forward(self, x): 31 | ''' 32 | Args: 33 | x: tensor of shape (batch_size, in_size) 34 | ''' 35 | # normed = self.norm(x) 36 | dropped = self.drop(x) 37 | y_1 = F.relu(self.linear_1(dropped)) 38 | y_2 = F.relu(self.linear_2(y_1)) 39 | y_3 = F.relu(self.linear_3(y_2)) 40 | 41 | return y_3 42 | 43 | 44 | # TFN 中的文本编码,额外需要lstm 操作 [感觉是audio|video] 45 | class LSTMEncoder(nn.Module): 46 | ''' 47 | The LSTM-based subnetwork that is used in TFN for text 48 | ''' 49 | 50 | def __init__(self, in_size, hidden_size, dropout, num_layers=1, bidirectional=False): 51 | 52 | super(LSTMEncoder, self).__init__() 53 | 54 | if num_layers == 1: 55 | rnn_dropout = 0.0 56 | else: 57 | rnn_dropout = dropout 58 | 59 | self.rnn = nn.LSTM(in_size, hidden_size, num_layers=num_layers, dropout=rnn_dropout, bidirectional=bidirectional, batch_first=True) 60 | self.dropout = nn.Dropout(dropout) 61 | self.linear_1 = nn.Linear(hidden_size, hidden_size) 62 | 63 | def forward(self, x): 64 | ''' 65 | Args: 66 | x: tensor of shape (batch_size, sequence_len, in_size) 67 | 
因为用的是 final_states ,所以特征的 padding 是放在前面的 68 | ''' 69 | _, final_states = self.rnn(x) 70 | h = self.dropout(final_states[0].squeeze(0)) 71 | y_1 = self.linear_1(h) 72 | return y_1 73 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/attention.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Description: unimodal encoder + concat + attention fusion 3 | ''' 4 | import torch 5 | import torch.nn as nn 6 | from .modules.encoder import MLPEncoder, LSTMEncoder 7 | 8 | class Attention(nn.Module): 9 | def __init__(self, args): 10 | super(Attention, self).__init__() 11 | 12 | text_dim = args.text_dim 13 | audio_dim = args.audio_dim 14 | video_dim = args.video_dim 15 | output_dim1 = args.output_dim1 16 | output_dim2 = args.output_dim2 17 | dropout = args.dropout 18 | hidden_dim = args.hidden_dim 19 | self.grad_clip = args.grad_clip 20 | 21 | if args.feat_type in ['utt']: 22 | self.audio_encoder = MLPEncoder(audio_dim, hidden_dim, dropout) 23 | self.text_encoder = MLPEncoder(text_dim, hidden_dim, dropout) 24 | self.video_encoder = MLPEncoder(video_dim, hidden_dim, dropout) 25 | elif args.feat_type in ['frm_align', 'frm_unalign']: 26 | self.audio_encoder = LSTMEncoder(audio_dim, hidden_dim, dropout) 27 | self.text_encoder = LSTMEncoder(text_dim, hidden_dim, dropout) 28 | self.video_encoder = LSTMEncoder(video_dim, hidden_dim, dropout) 29 | 30 | self.attention_mlp = MLPEncoder(hidden_dim * 3, hidden_dim, dropout) 31 | 32 | self.fc_att = nn.Linear(hidden_dim, 3) 33 | self.fc_out_1 = nn.Linear(hidden_dim, output_dim1) 34 | self.fc_out_2 = nn.Linear(hidden_dim, output_dim2) 35 | 36 | def forward(self, batch): 37 | ''' 38 | support feat_type: utt | frm-align | frm-unalign 39 | ''' 40 | audio_hidden = self.audio_encoder(batch['audios']) # [32, 128] 41 | text_hidden = self.text_encoder(batch['texts']) # [32, 128] 42 | video_hidden = self.video_encoder(batch['videos']) # [32, 128] 43 | 44 | multi_hidden1 = torch.cat([audio_hidden, text_hidden, video_hidden], dim=1) # [32, 384] 45 | attention = self.attention_mlp(multi_hidden1) 46 | attention = self.fc_att(attention) 47 | attention = torch.unsqueeze(attention, 2) # [32, 3, 1] 48 | 49 | multi_hidden2 = torch.stack([audio_hidden, text_hidden, video_hidden], dim=2) # [32, 128, 3] 50 | fused_feat = torch.matmul(multi_hidden2, attention) # [32, 128, 3] * [32, 3, 1] = [32, 128, 1] 51 | 52 | features = fused_feat.squeeze(axis=2) # [32, 128] => 解决batch=1报错的问题 53 | emos_out = self.fc_out_1(features) 54 | vals_out = self.fc_out_2(features) 55 | interloss = torch.tensor(0).cuda() 56 | 57 | return features, emos_out, vals_out, interloss 58 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/mer2023.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | from toolkit.globals import * 5 | from toolkit.utils.read_files import * 6 | from toolkit.utils.functions import * 7 | 8 | def normalize_dataset_format(data_root, save_root): 9 | ## input path 10 | train_video, train_label = os.path.join(data_root, 'train'), os.path.join(data_root, 'train-label.csv') 11 | test1_video, test1_label = os.path.join(data_root, 'test1'), os.path.join(data_root, 'test1-label.csv') 12 | test2_video, test2_label = os.path.join(data_root, 'test2'), os.path.join(data_root, 'test2-label.csv') 13 | test3_video, test3_label = os.path.join(data_root, 
'test3'), os.path.join(data_root, 'test3-label.csv') 14 | 15 | ## output path 16 | save_video = os.path.join(save_root, 'video') 17 | save_label = os.path.join(save_root, 'label-6way.npz') 18 | if not os.path.exists(save_root): os.makedirs(save_root) 19 | if not os.path.exists(save_video): os.makedirs(save_video) 20 | 21 | ## generate label path 22 | whole_corpus = {} 23 | for name, video_root, label_path in [('train', train_video, train_label), 24 | ('test1', test1_video, test1_label), 25 | ('test2', test2_video, test2_label), 26 | ('test3', test3_video, test3_label)]: 27 | 28 | whole_corpus[name] = {} 29 | names = func_read_key_from_csv(label_path, 'name') 30 | emos = func_read_key_from_csv(label_path, 'discrete') 31 | vals = func_read_key_from_csv(label_path, 'valence') 32 | # process for test3 [test3 do not have vals] 33 | if name == 'test3': vals = [-10] * len(names) 34 | print (f'{name}: sample number: {len(names)}') 35 | for ii in range(len(names)): 36 | whole_corpus[name][names[ii]] = {'emo': emos[ii], 'val': vals[ii]} 37 | # copy video 38 | video_path = glob.glob(os.path.join(video_root, f'{names[ii]}*'))[0] 39 | video_name = os.path.basename(video_path) 40 | new_path = os.path.join(save_video, video_name) 41 | shutil.copy(video_path, new_path) 42 | 43 | np.savez_compressed(save_label, 44 | train_corpus=whole_corpus['train'], 45 | test1_corpus=whole_corpus['test1'], 46 | test2_corpus=whole_corpus['test2'], 47 | test3_corpus=whole_corpus['test3']) 48 | 49 | if __name__ == '__main__': 50 | data_root = '/data/lianzheng/chinese-mer-2023/mer2023-dataset' 51 | save_root = '/data/lianzheng/chinese-mer-2023/mer2023-dataset-process' 52 | normalize_dataset_format(data_root, save_root) 53 | -------------------------------------------------------------------------------- /EmotionTalk/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ############ For LINUX ############## 4 | DATA_DIR = { 5 | 'mm4': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm-process', 6 | 'mm7': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm-process', 7 | } 8 | PATH_TO_RAW_AUDIO = { 9 | 'mm4': os.path.join(DATA_DIR['mm4'], 'subaudio'), 10 | 'mm7': os.path.join(DATA_DIR['mm4'], 'subaudio'), 11 | } 12 | PATH_TO_RAW_VIDEO = { 13 | 'mm4': os.path.join(DATA_DIR['mm4'], 'subvideo-tgt'), 14 | 'mm7': os.path.join(DATA_DIR['mm4'], 'subvideo-tgt'), 15 | } 16 | PATH_TO_RAW_FACE = { 17 | 18 | 'mm4': os.path.join(DATA_DIR['mm4'], 'openface_face'), 19 | 'mm7': os.path.join(DATA_DIR['mm4'], 'openface_face'), 20 | } 21 | PATH_TO_TRANSCRIPTIONS = { 22 | 'mm4': os.path.join(DATA_DIR['mm4'], 'transcription.csv'), 23 | 'mm7': os.path.join(DATA_DIR['mm4'], 'transcription.csv'), 24 | } 25 | PATH_TO_FEATURES = { 26 | 'mm4': os.path.join(DATA_DIR['mm4'], 'features'), 27 | 'mm7': os.path.join(DATA_DIR['mm4'], 'features'), 28 | 'MM': os.path.join(DATA_DIR['mm4'], 'features'), 29 | } 30 | PATH_TO_LABEL = { 31 | 'mm4': os.path.join(DATA_DIR['mm4'], 'mm_label4.npz'), 32 | 'mm7': os.path.join(DATA_DIR['mm7'], 'mm_label.npz'), 33 | 'MM': os.path.join(DATA_DIR['mm7'], 'mm_label.npz'), 34 | } 35 | 36 | # pre-trained models, including supervised and unsupervised 37 | # PATH_TO_PRETRAINED_MODELS = '/mnt/real_sda/wangxuechen_space/code/MERBench/models/' 38 | PATH_TO_PRETRAINED_MODELS = '/mnt/real_sda/MERTools-master/MERBench/feature_extraction/checkpoint' 39 | PATH_TO_OPENSMILE = './tools/opensmile-2.3.0/' 40 | PATH_TO_FFMPEG = '/mnt/real_sda/ffmpeg-4.4.1-i686-static/ffmpeg' 41 | 42 | # 
dir 43 | SAVED_ROOT = os.path.join('./saved') 44 | MODEL_DIR = os.path.join(SAVED_ROOT, 'model') 45 | LOG_DIR = os.path.join(SAVED_ROOT, 'log') 46 | PREDICTION_DIR = os.path.join(SAVED_ROOT, 'prediction') 47 | FUSION_DIR = os.path.join(SAVED_ROOT, 'fusion') 48 | SUBMISSION_DIR = os.path.join(SAVED_ROOT, 'submission') 49 | 50 | 51 | ############ For Windows [OpenFace to extract face] ############## 52 | DATA_DIR_Win = { 53 | 'mm4': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm4-process', 54 | 'mm7': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm4-process', 55 | 'MM': '/mnt/real_sda/MERTools-master/MERBench/dataset/mm4-process', 56 | } 57 | 58 | PATH_TO_RAW_FACE_Win = { 59 | 'mm4': os.path.join(DATA_DIR_Win['mm4'], 'video'), 60 | 'mm7': os.path.join(DATA_DIR_Win['mm4'], 'video'), 61 | } 62 | 63 | PATH_TO_FEATURES_Win = { 64 | 'mm4': os.path.join(DATA_DIR_Win['mm4'], 'features'), 65 | 'mm7': os.path.join(DATA_DIR_Win['mm4'], 'features'), 66 | 'MM': os.path.join(DATA_DIR_Win['MM'], 'features'), 67 | } 68 | 69 | PATH_TO_OPENFACE_Win = "H:\\desktop\\Multimedia-Transformer\\MERBench-master\\tools\\openface_win_x64" 70 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/dataset.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import cv2 4 | import glob 5 | import numpy as np 6 | from PIL import Image 7 | from skimage import io 8 | from skimage import img_as_float 9 | import torch.utils.data as data 10 | 11 | 12 | class FaceDataset(data.Dataset): 13 | def __init__(self, vid, face_dir, transform=None): 14 | super(FaceDataset, self).__init__() 15 | self.vid = vid 16 | self.path = os.path.join(face_dir, vid) 17 | self.transform = transform 18 | self.frames = self.get_frames() 19 | 20 | def get_frames(self): 21 | ## image format 22 | # frames = glob.glob(os.path.join(self.path, '*')) 23 | 24 | ## npy format 25 | npypath = os.path.join(self.path, f'{self.vid}.npy') 26 | assert os.path.exists(npypath), f'Error: {self.vid} does not exist frames.npy' 27 | frames = np.load(npypath) 28 | 29 | return frames 30 | 31 | def __len__(self): 32 | return len(self.frames) 33 | 34 | def __getitem__(self, index): 35 | ## image format 36 | # path = self.frames[index] 37 | # img = Image.open(path) 38 | # name = os.path.basename(path)[:-4] 39 | 40 | ## npy format [cv2 -> Image] 41 | img = self.frames[index] 42 | img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) 43 | name = '%08d' %(index) 44 | 45 | if self.transform is not None: 46 | img = self.transform(img) 47 | return img, name 48 | 49 | 50 | class FaceDatasetForEmoNet(data.Dataset): 51 | def __init__(self, vid, face_dir, transform=None, augmentor=None): 52 | super(FaceDatasetForEmoNet, self).__init__() 53 | self.vid = vid 54 | self.path = os.path.join(face_dir, vid) 55 | self.augmentor = augmentor 56 | self.transform = transform 57 | self.frames = self.get_frames() 58 | 59 | def get_frames(self): 60 | ## image format 61 | # frames = glob.glob(os.path.join(self.path, '*')) 62 | 63 | ## npy format 64 | npypath = os.path.join(self.path, f'{self.vid}.npy') 65 | assert os.path.exists(npypath), f'error video: {self.vid}' 66 | frames = np.load(npypath) 67 | return frames 68 | 69 | def __len__(self): 70 | return len(self.frames) 71 | 72 | def __getitem__(self, index): 73 | ## image format 74 | # path = self.frames[index] 75 | # img = io.imread(path) 76 | # name = os.path.basename(path)[:-4] 77 | 78 | ## npy 
format [cv2 -> skimage] 79 | img = self.frames[index] 80 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 81 | name = '%08d' %(index) 82 | 83 | if self.augmentor is not None: 84 | img = self.augmentor(img)[0] 85 | if self.transform is not None: 86 | img = self.transform(img) 87 | 88 | return img, name -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/utils/benchmark_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utilties shared among the benchmarking protocols 3 | """ 4 | import os 5 | import sys 6 | import six 7 | 8 | import torchvision.transforms as transforms 9 | 10 | 11 | def compose_transforms(meta, resize=256, center_crop=True, 12 | override_meta_imsize=False): 13 | """Compose preprocessing transforms for model 14 | 15 | The imported models use a range of different preprocessing options, 16 | depending on how they were originally trained. Models trained in MatConvNet 17 | typically require input images that have been scaled to [0,255], rather 18 | than the [0,1] range favoured by PyTorch. 19 | 20 | Args: 21 | meta (dict): model preprocessing requirements 22 | resize (int) [256]: resize the input image to this size 23 | center_crop (bool) [True]: whether to center crop the image 24 | override_meta_imsize (bool) [False]: if true, use the value of `resize` 25 | to select the image input size, rather than the properties contained 26 | in meta (this option only applies when center cropping is not used. 27 | 28 | Return: 29 | (transforms.Compose): Composition of preprocessing transforms 30 | """ 31 | normalize = transforms.Normalize(mean=meta['mean'], std=meta['std']) 32 | im_size = meta['imageSize'] 33 | assert im_size[0] == im_size[1], 'expected square image size' 34 | if center_crop: 35 | transform_list = [transforms.Resize(resize), 36 | transforms.CenterCrop(size=(im_size[0], im_size[1]))] 37 | else: 38 | if override_meta_imsize: 39 | im_size = (resize, resize) 40 | transform_list = [transforms.Resize(size=(im_size[0], im_size[1]))] 41 | transform_list += [transforms.ToTensor()] 42 | if meta['std'] == [1, 1, 1]: # common amongst mcn models 43 | transform_list += [lambda x: x * 255.0] 44 | transform_list.append(normalize) 45 | return transforms.Compose(transform_list) 46 | 47 | 48 | def load_module_2or3(model_name, model_def_path): 49 | """Load model definition module in a manner that is compatible with 50 | both Python2 and Python3 51 | 52 | Args: 53 | model_name: The name of the model to be loaded 54 | model_def_path: The filepath of the module containing the definition 55 | 56 | Return: 57 | The loaded python module.""" 58 | if six.PY3: 59 | import importlib.util 60 | spec = importlib.util.spec_from_file_location(model_name, model_def_path) 61 | mod = importlib.util.module_from_spec(spec) 62 | spec.loader.exec_module(mod) 63 | else: 64 | import importlib 65 | dirname = os.path.dirname(model_def_path) 66 | sys.path.insert(0, dirname) 67 | module_name = os.path.splitext(os.path.basename(model_def_path))[0] 68 | mod = importlib.import_module(module_name) 69 | return mod 70 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/run_fer_benchmarks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """This module evaluates imported PyTorch models on fer2013 3 | """ 4 | 5 | 
import os 6 | import argparse 7 | from os.path import join as pjoin 8 | from fer2013.fer import fer2013_benchmark 9 | from utils.benchmark_helpers import load_module_2or3 10 | 11 | # MODEL_DIR = os.path.expanduser('~/data/models/pytorch/mcn_imports') 12 | # FER_DIR = os.path.expanduser('~/data/datasets/fer2013+') 13 | MODEL_DIR = './pretrained/' 14 | FER_DIR = os.path.expanduser('~/Affective Computing/Dataset/FERPlus') 15 | 16 | CACHE_DIR = 'res_cache/fer2013+' 17 | 18 | def load_model(model_name): 19 | """Load imoprted PyTorch model by name 20 | 21 | Args: 22 | model_name (str): the name of the model to be loaded 23 | 24 | Return: 25 | nn.Module: the loaded network 26 | """ 27 | model_def_path = pjoin('model', model_name + '.py') 28 | weights_path = pjoin(MODEL_DIR, model_name + '.pth') 29 | mod = load_module_2or3(model_name, model_def_path) 30 | func = getattr(mod, model_name) 31 | net = func(weights_path=weights_path) 32 | return net 33 | 34 | def run_benchmarks(gpus, refresh, fer_plus): 35 | """Run bencmarks for imported models 36 | 37 | Args: 38 | gpus (str): comma separated gpu device identifiers 39 | refresh (bool): whether to overwrite the results of existing runs 40 | fer_plus (bool): whether to evaluate on the ferplus benchmark, 41 | rather than the standard fer benchmark. 42 | """ 43 | 44 | # Select models (and their batch sizes) to include in the benchmark. 45 | if fer_plus: 46 | model_list = [ 47 | ('resnet50_ferplus_dag', 32), 48 | ('senet50_ferplus_dag', 32), 49 | ] 50 | else: 51 | model_list = [ 52 | ('alexnet_face_fer_bn_dag', 32), 53 | ('vgg_m_face_bn_fer_dag', 32), 54 | ('vgg_vd_face_fer_dag', 32), 55 | ] 56 | 57 | if not os.path.exists(CACHE_DIR): 58 | os.makedirs(CACHE_DIR) 59 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 60 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpus) 61 | 62 | opts = {'data_dir': FER_DIR, 'refresh_cache': refresh} 63 | 64 | for model_name, batch_size in model_list: 65 | cache_name = model_name 66 | if fer_plus: 67 | cache_name = cache_name + 'fer_plus' 68 | opts['res_cache'] = '{}/{}.pth'.format(CACHE_DIR, cache_name) 69 | opts['fer_plus'] = fer_plus 70 | model = load_model(model_name) 71 | print('benchmarking {}'.format(model_name)) 72 | fer2013_benchmark(model, batch_size=batch_size, **opts) 73 | 74 | parser = argparse.ArgumentParser(description='Run PyTorch benchmarks.') 75 | parser.add_argument('--gpus', nargs='?', dest='gpus', 76 | help='select gpu device id') 77 | parser.add_argument('--refresh', dest='refresh', action='store_true', 78 | help='refresh results cache') 79 | parser.add_argument('--ferplus', dest='ferplus', action='store_true', 80 | help='run ferplus (rather than fer) benchmarks') 81 | parser.set_defaults(gpus=None) 82 | parser.set_defaults(refresh=False) 83 | parsed = parser.parse_args() 84 | 85 | if __name__ == '__main__': 86 | run_benchmarks(parsed.gpus, parsed.refresh, parsed.ferplus) 87 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.metrics import mean_squared_error 4 | from sklearn.metrics import f1_score, accuracy_score 5 | 6 | from ..globals import * 7 | 8 | # 综合维度和离散的评价指标 9 | def overall_metric(emo_fscore, val_mse): 10 | final_score = emo_fscore - val_mse * 0.25 11 | return final_score 12 | 13 | 14 | # 只返回 metric 值,用于模型筛选 15 | def gain_metric_from_results(eval_results, metric_name='emoval'): 16 | 17 | 
if metric_name == 'emoval': 18 | fscore = eval_results['emofscore'] 19 | valmse = eval_results['valmse'] 20 | overall = overall_metric(fscore, valmse) 21 | sort_metric = overall 22 | elif metric_name == 'emo': 23 | fscore = eval_results['emofscore'] 24 | sort_metric = fscore 25 | elif metric_name == 'val': 26 | valmse = eval_results['valmse'] 27 | sort_metric = -valmse 28 | elif metric_name == 'loss': 29 | loss = eval_results['loss'] 30 | sort_metric = -loss 31 | 32 | return sort_metric 33 | 34 | 35 | def gain_cv_results(folder_save): 36 | 37 | # find all keys 38 | whole_keys = list(folder_save[0].keys()) 39 | 40 | cv_acc, cv_fscore, cv_valmse = -100, -100, -100 41 | if 'eval_emoacc' in whole_keys: 42 | cv_acc = np.mean([epoch_save['eval_emoacc'] for epoch_save in folder_save]) 43 | if 'eval_emofscore' in whole_keys: 44 | cv_fscore = np.mean([epoch_save['eval_emofscore'] for epoch_save in folder_save]) 45 | if 'eval_valmse' in whole_keys: 46 | cv_valmse = np.mean([epoch_save['eval_valmse'] for epoch_save in folder_save]) 47 | 48 | # 只显示存在的部分信息 [与test输出是一致的] 49 | outputs = [] 50 | if cv_fscore != -100: outputs.append(f'f1:{cv_fscore:.4f}') 51 | if cv_acc != -100: outputs.append(f'acc:{cv_acc:.4f}') 52 | if cv_valmse != -100: outputs.append(f'val:{cv_valmse:.4f}') 53 | outputs = "_".join(outputs) 54 | return outputs 55 | 56 | 57 | def average_folder_for_emos(folder_save, testname): 58 | 59 | try: 60 | # 因为所有test set的 shuffle都是false的,因此不同folder的结果是对应的 61 | labels = folder_save[0][f'{testname}_emolabels'] 62 | except: 63 | return [], [] 64 | 65 | num_samples = len(labels) 66 | num_folders = len(folder_save) 67 | 68 | whole_probs = [] 69 | for ii in range(num_folders): 70 | emoprobs = folder_save[ii][f'{testname}_emoprobs'] 71 | whole_probs.append(emoprobs) 72 | whole_probs = np.array(whole_probs) 73 | 74 | avg_preds = [] 75 | for ii in range(num_samples): 76 | per_probs = whole_probs[:, ii, :] 77 | avg_emoprob = np.mean(per_probs, axis=0) 78 | avg_preds.append(avg_emoprob) 79 | 80 | return labels, avg_preds 81 | 82 | # 计算 name -> val 83 | def average_folder_for_vals(folder_save, testname): 84 | 85 | try: 86 | labels = folder_save[0][f'{testname}_vallabels'] 87 | except: 88 | return [], [] 89 | 90 | num_folders = len(folder_save) 91 | 92 | whole_preds = [] 93 | for ii in range(num_folders): 94 | valpreds = folder_save[ii][f'{testname}_valpreds'] 95 | whole_preds.append(valpreds) 96 | whole_preds = np.array(whole_preds) 97 | 98 | avg_preds = np.mean(whole_preds, axis=0) 99 | return labels, avg_preds 100 | 101 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.metrics import mean_squared_error 4 | from sklearn.metrics import f1_score, accuracy_score 5 | 6 | from ..globals import * 7 | 8 | # 综合维度和离散的评价指标 9 | def overall_metric(emo_fscore, val_mse): 10 | final_score = emo_fscore - val_mse * 0.25 11 | return final_score 12 | 13 | 14 | # 只返回 metric 值,用于模型筛选 15 | def gain_metric_from_results(eval_results, metric_name='emoval'): 16 | 17 | if metric_name == 'emoval': 18 | fscore = eval_results['emofscore'] 19 | valmse = eval_results['valmse'] 20 | overall = overall_metric(fscore, valmse) 21 | sort_metric = overall 22 | elif metric_name == 'emo': 23 | fscore = eval_results['emofscore'] 24 | sort_metric = fscore 25 | elif metric_name == 'val': 26 | valmse = eval_results['valmse'] 27 
| sort_metric = -valmse 28 | elif metric_name == 'loss': 29 | loss = eval_results['loss'] 30 | sort_metric = -loss 31 | 32 | return sort_metric 33 | 34 | 35 | def gain_cv_results(folder_save): 36 | 37 | # find all keys 38 | whole_keys = list(folder_save[0].keys()) 39 | 40 | cv_acc, cv_fscore, cv_valmse = -100, -100, -100 41 | if 'eval_emoacc' in whole_keys: 42 | cv_acc = np.mean([epoch_save['eval_emoacc'] for epoch_save in folder_save]) 43 | if 'eval_emofscore' in whole_keys: 44 | cv_fscore = np.mean([epoch_save['eval_emofscore'] for epoch_save in folder_save]) 45 | if 'eval_valmse' in whole_keys: 46 | cv_valmse = np.mean([epoch_save['eval_valmse'] for epoch_save in folder_save]) 47 | 48 | # 只显示存在的部分信息 [与test输出是一致的] 49 | outputs = [] 50 | if cv_fscore != -100: outputs.append(f'f1:{cv_fscore:.4f}') 51 | if cv_acc != -100: outputs.append(f'acc:{cv_acc:.4f}') 52 | if cv_valmse != -100: outputs.append(f'val:{cv_valmse:.4f}') 53 | outputs = "_".join(outputs) 54 | return outputs 55 | 56 | 57 | def average_folder_for_emos(folder_save, testname): 58 | 59 | try: 60 | # 因为所有test set的 shuffle都是false的,因此不同folder的结果是对应的 61 | labels = folder_save[0][f'{testname}_emolabels'] 62 | except: 63 | return [], [] 64 | 65 | num_samples = len(labels) 66 | num_folders = len(folder_save) 67 | 68 | whole_probs = [] 69 | for ii in range(num_folders): 70 | emoprobs = folder_save[ii][f'{testname}_emoprobs'] 71 | whole_probs.append(emoprobs) 72 | whole_probs = np.array(whole_probs) 73 | 74 | avg_preds = [] 75 | for ii in range(num_samples): 76 | per_probs = whole_probs[:, ii, :] 77 | avg_emoprob = np.mean(per_probs, axis=0) 78 | avg_preds.append(avg_emoprob) 79 | 80 | return labels, avg_preds 81 | 82 | # 计算 name -> val 83 | def average_folder_for_vals(folder_save, testname): 84 | 85 | try: 86 | labels = folder_save[0][f'{testname}_vallabels'] 87 | except: 88 | return [], [] 89 | 90 | num_folders = len(folder_save) 91 | 92 | whole_preds = [] 93 | for ii in range(num_folders): 94 | valpreds = folder_save[ii][f'{testname}_valpreds'] 95 | whole_preds.append(valpreds) 96 | whole_preds = np.array(whole_preds) 97 | 98 | avg_preds = np.mean(whole_preds, axis=0) 99 | return labels, avg_preds 100 | 101 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/manet/model/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class BasicConv(nn.Module): 7 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 8 | super(BasicConv, self).__init__() 9 | self.out_channels = out_planes 10 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 12 | self.relu = nn.ReLU() if relu else None 13 | 14 | def forward(self, x): 15 | x = self.conv(x) 16 | if self.bn is not None: 17 | x = self.bn(x) 18 | if self.relu is not None: 19 | x = self.relu(x) 20 | return x 21 | 22 | 23 | class Flatten(nn.Module): 24 | def forward(self, x): 25 | return x.view(x.size(0), -1) 26 | 27 | 28 | class ChannelGate(nn.Module): 29 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): 30 | super(ChannelGate, self).__init__() 31 | self.gate_channels = gate_channels 32 | self.mlp = 
nn.Sequential(Flatten(), 33 | nn.Linear(gate_channels, gate_channels // reduction_ratio), 34 | nn.ReLU(), 35 | nn.Linear(gate_channels // reduction_ratio, gate_channels)) 36 | self.pool_types = pool_types 37 | 38 | def forward(self, x): 39 | channel_att_sum = None 40 | for pool_type in self.pool_types: 41 | if pool_type == 'avg': 42 | avg_pool = F.avg_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 43 | channel_att_raw = self.mlp(avg_pool ) 44 | elif pool_type == 'max': 45 | max_pool = F.max_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 46 | channel_att_raw = self.mlp(max_pool) 47 | if channel_att_sum is None: 48 | channel_att_sum = channel_att_raw 49 | else: 50 | channel_att_sum = channel_att_sum + channel_att_raw 51 | 52 | scale = torch.sigmoid(channel_att_sum).unsqueeze(2).unsqueeze(3).expand_as(x) 53 | return x * scale 54 | 55 | 56 | class ChannelPool(nn.Module): 57 | def forward(self, x): 58 | return torch.cat((torch.max(x, 1)[0].unsqueeze(1), torch.mean(x, 1).unsqueeze(1)), dim=1) 59 | 60 | 61 | class SpatialGate(nn.Module): 62 | def __init__(self): 63 | super(SpatialGate, self).__init__() 64 | kernel_size = 7 65 | self.compress = ChannelPool() 66 | self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size-1) // 2, relu=False) 67 | 68 | def forward(self, x): 69 | x_compress = self.compress(x) 70 | x_out = self.spatial(x_compress) 71 | scale = torch.sigmoid(x_out) 72 | return x * scale 73 | 74 | 75 | class CBAM(nn.Module): 76 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): 77 | super(CBAM, self).__init__() 78 | self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types) 79 | self.SpatialGate = SpatialGate() 80 | 81 | def forward(self, x): 82 | x_out = self.ChannelGate(x) 83 | x_out = self.SpatialGate(x_out) 84 | 85 | return x_out 86 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/data/feat_data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.data import Dataset 4 | from toolkit.utils.read_data import * 5 | 6 | class Data_Feat(Dataset): 7 | def __init__(self, args, names, labels): 8 | 9 | # analyze path 10 | self.names = names 11 | self.labels = labels 12 | feat_root = config.PATH_TO_FEATURES[args.dataset] 13 | audio_root = os.path.join(feat_root, args.audio_feature) 14 | text_root = os.path.join(feat_root, args.text_feature ) 15 | video_root = os.path.join(feat_root, args.video_feature) 16 | print (f'audio feature root: {audio_root}') 17 | 18 | # --------------- temporal test --------------- 19 | # for name in names: assert os.path.exists(os.path.join(audio_root, name+'.npy')) 20 | 21 | # analyze params 22 | self.feat_type = args.feat_type 23 | self.feat_scale = args.feat_scale # 特征预压缩 24 | assert self.feat_scale >= 1 25 | assert self.feat_type in ['utt', 'frm_align', 'frm_unalign'] 26 | 27 | # read datas (reduce __getitem__ durations) 28 | audios, self.adim = func_read_multiprocess(audio_root, self.names, read_type='feat') 29 | texts, self.tdim = func_read_multiprocess(text_root, self.names, read_type='feat') 30 | videos, self.vdim = func_read_multiprocess(video_root, self.names, read_type='feat') 31 | 32 | ## read batch (reduce collater durations) 33 | # step1: pre-compress features 34 | audios, texts, videos = feature_scale_compress(audios, texts, videos, self.feat_scale) 35 | # step2: align to batch 36 | if self.feat_type == 
'utt': # -> 每个样本每个模态的特征压缩到句子级别 37 | audios, texts, videos = align_to_utt(audios, texts, videos) 38 | elif self.feat_type == 'frm_align': 39 | audios, texts, videos = align_to_text(audios, texts, videos) # 模态级别对齐 40 | audios, texts, videos = pad_to_maxlen_pre_modality(audios, texts, videos) # 样本级别对齐 41 | elif self.feat_type == 'frm_unalign': 42 | audios, texts, videos = pad_to_maxlen_pre_modality(audios, texts, videos) # 样本级别对齐 43 | self.audios, self.texts, self.videos = audios, texts, videos 44 | 45 | 46 | def __len__(self): 47 | return len(self.names) 48 | 49 | 50 | def __getitem__(self, index): 51 | instance = dict( 52 | audio = self.audios[index], 53 | text = self.texts[index], 54 | video = self.videos[index], 55 | emo = self.labels[index]['emo'], 56 | val = self.labels[index]['val'], 57 | name = self.names[index], 58 | ) 59 | return instance 60 | 61 | 62 | def collater(self, instances): 63 | audios = [instance['audio'] for instance in instances] 64 | texts = [instance['text'] for instance in instances] 65 | videos = [instance['video'] for instance in instances] 66 | 67 | batch = dict( 68 | audios = torch.FloatTensor(np.array(audios)), 69 | texts = torch.FloatTensor(np.array(texts)), 70 | videos = torch.FloatTensor(np.array(videos)), 71 | ) 72 | 73 | emos = torch.LongTensor([instance['emo'] for instance in instances]) 74 | vals = torch.FloatTensor([instance['val'] for instance in instances]) 75 | names = [instance['name'] for instance in instances] 76 | 77 | return batch, emos, vals, names 78 | 79 | 80 | def get_featdim(self): 81 | print (f'audio dimension: {self.adim}; text dimension: {self.tdim}; video dimension: {self.vdim}') 82 | return self.adim, self.tdim, self.vdim 83 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/simsv2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from toolkit.utils.functions import * 4 | from toolkit.utils.read_files import * 5 | 6 | def func_merge_id_to_path(video_id, clip_id, video_root): 7 | video_path = os.path.join(video_root, video_id, clip_id+'.mp4') 8 | return video_path 9 | 10 | def func_convert_path_to_newname(video_id, clip_id): 11 | newname = f'{video_id}_{clip_id}' 12 | return newname 13 | 14 | # label_path -> (video_paths, labels) 15 | def read_labels(label_path, video_root): 16 | 17 | # read all items 18 | video_ids = func_read_key_from_csv(label_path, 'video_id') 19 | clip_ids = func_read_key_from_csv(label_path, 'clip_id') 20 | chis = func_read_key_from_csv(label_path, 'text') 21 | labels = func_read_key_from_csv(label_path, 'label') 22 | modes = func_read_key_from_csv(label_path, 'mode') 23 | 24 | print (f'label range -> min:{min(labels)} max:{max(labels)}') 25 | print (f'whole sample number: {len(labels)}') 26 | print ('modes: ', set(modes)) 27 | 28 | newnames, videopaths = [], [] 29 | for ii in range(len(video_ids)): 30 | newname = func_convert_path_to_newname(video_ids[ii], clip_ids[ii]) 31 | videopath = func_merge_id_to_path(video_ids[ii], clip_ids[ii], video_root) 32 | newnames.append(newname) 33 | videopaths.append(videopath) 34 | print (f'whole sample number: {len(set(newnames))}') 35 | return chis, labels, modes, videopaths, newnames 36 | 37 | 38 | # ------------------- main process ------------------- 39 | def normalize_dataset_format(data_root, save_root): 40 | # gain paths 41 | video_root = os.path.join(data_root, 'Raw') 42 | label_path = os.path.join(data_root, 'meta.csv') 43 | 44 | # read 
all items 45 | chis, labels, modes, videopaths, newnames = read_labels(label_path, video_root) 46 | 47 | ## output path 48 | save_video = os.path.join(save_root, 'video') 49 | save_label = os.path.join(save_root, 'label.npz') 50 | save_trans = os.path.join(save_root, 'transcription.csv') 51 | if not os.path.exists(save_root): os.makedirs(save_root) 52 | if not os.path.exists(save_video): os.makedirs(save_video) 53 | 54 | ## generate new transcripts 55 | name2key = {} 56 | for ii, newname in enumerate(newnames): 57 | name2key[newname] = [chis[ii]] 58 | func_write_key_to_csv(save_trans, newnames, name2key, ['chinese']) 59 | 60 | ## copy videos 61 | for ii, videopath in enumerate(videopaths): 62 | assert videopath.endswith('.mp4') 63 | savepath = os.path.join(save_video, newnames[ii]+'.mp4') 64 | shutil.copy(videopath, savepath) 65 | 66 | ## generate label path 67 | whole_corpus = {} 68 | for ii, newname in enumerate(newnames): 69 | mode = modes[ii] # [train, valid, test] 70 | if mode not in whole_corpus: 71 | whole_corpus[mode] = {} 72 | whole_corpus[mode][newname] = {'emo': 0, 'val': labels[ii]} 73 | 74 | np.savez_compressed(save_label, 75 | train_corpus=whole_corpus['train'], 76 | val_corpus=whole_corpus['valid'], 77 | test_corpus=whole_corpus['test']) 78 | 79 | if __name__ == '__main__': 80 | data_root = 'I:\\CH-SIMS-v2\\zip\\supervised\\ch-simsv2s' 81 | save_root = 'E:\\Dataset\\simsv2-process' 82 | normalize_dataset_format(data_root, save_root) 83 | 84 | # data_root = 'E:\\Dataset\\simsv2-process' 85 | # trans_path = os.path.join(data_root, 'transcription.csv') 86 | # polish_path = os.path.join(data_root, 'transcription-engchi-polish.csv') 87 | # func_translate_transcript_polish_merge(trans_path, polish_path) 88 | # func_translate_transcript_polish_merge(polish_path, '') 89 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/util.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import re 4 | import pandas as pd 5 | import numpy as np 6 | import struct 7 | 8 | ## for OPENFACE 9 | ## reference: https://gist.github.com/btlorch/6d259bfe6b753a7a88490c0607f07ff8 10 | def read_hog(filename, batch_size=5000): 11 | """ 12 | Read HoG features file created by OpenFace. 13 | For each frame, OpenFace extracts 12 * 12 * 31 HoG features, i.e., num_features = 4464. These features are stored in row-major order. 
14 | :param filename: path to .hog file created by OpenFace 15 | :param batch_size: how many rows to read at a time 16 | :return: is_valid, hog_features 17 | is_valid: ndarray of shape [num_frames] 18 | hog_features: ndarray of shape [num_frames, num_features] 19 | """ 20 | all_feature_vectors = [] 21 | with open(filename, "rb") as f: 22 | num_cols, = struct.unpack("i", f.read(4)) # 12 23 | num_rows, = struct.unpack("i", f.read(4)) # 12 24 | num_channels, = struct.unpack("i", f.read(4)) # 31 25 | 26 | # The first four bytes encode a boolean value whether the frame is valid 27 | num_features = 1 + num_rows * num_cols * num_channels 28 | feature_vector = struct.unpack("{}f".format(num_features), f.read(num_features * 4)) 29 | feature_vector = np.array(feature_vector).reshape((1, num_features)) # [1, 4464+1] 30 | all_feature_vectors.append(feature_vector) 31 | 32 | # Every frame contains a header of four float values: num_cols, num_rows, num_channels, is_valid 33 | num_floats_per_feature_vector = 4 + num_rows * num_cols * num_channels 34 | # Read in batches of given batch_size 35 | num_floats_to_read = num_floats_per_feature_vector * batch_size 36 | # Multiply by 4 because of float32 37 | num_bytes_to_read = num_floats_to_read * 4 38 | 39 | while True: 40 | bytes = f.read(num_bytes_to_read) 41 | # For comparison how many bytes were actually read 42 | num_bytes_read = len(bytes) 43 | assert num_bytes_read % 4 == 0, "Number of bytes read does not match with float size" 44 | num_floats_read = num_bytes_read // 4 45 | assert num_floats_read % num_floats_per_feature_vector == 0, "Number of bytes read does not match with feature vector size" 46 | num_feature_vectors_read = num_floats_read // num_floats_per_feature_vector 47 | 48 | feature_vectors = struct.unpack("{}f".format(num_floats_read), bytes) 49 | # Convert to array 50 | feature_vectors = np.array(feature_vectors).reshape((num_feature_vectors_read, num_floats_per_feature_vector)) 51 | # Discard the first three values in each row (num_cols, num_rows, num_channels) 52 | feature_vectors = feature_vectors[:, 3:] 53 | # Append to list of all feature vectors that have been read so far 54 | all_feature_vectors.append(feature_vectors) 55 | 56 | if num_bytes_read < num_bytes_to_read: 57 | break 58 | 59 | # Concatenate batches 60 | all_feature_vectors = np.concatenate(all_feature_vectors, axis=0) 61 | 62 | # Split into is-valid and feature vectors 63 | is_valid = all_feature_vectors[:, 0] 64 | feature_vectors = all_feature_vectors[:, 1:] 65 | 66 | return is_valid, feature_vectors 67 | 68 | 69 | ## for OPENFACE 70 | def read_csv(filename, startIdx): 71 | data = pd.read_csv(filename) 72 | all_feature_vectors = [] 73 | for index in data.index: 74 | features = np.array(data.iloc[index][startIdx:]) 75 | all_feature_vectors.append(features) 76 | all_feature_vectors = np.array(all_feature_vectors) 77 | return all_feature_vectors 78 | 79 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/dataloader/mm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import DataLoader 3 | from sklearn.metrics import f1_score, accuracy_score 4 | 5 | from ..globals import * 6 | from toolkit.data import get_datasets 7 | 8 | class mm: 9 | 10 | def __init__(self, args): 11 | self.args = args 12 | self.debug = args.debug 13 | self.batch_size = args.batch_size 14 | self.num_workers = args.num_workers 15 | self.label_path = 
config.PATH_TO_LABEL[args.dataset] 16 | 17 | self.dataset = args.dataset 18 | assert self.dataset in ['mm4','mm7'] 19 | 20 | # update args 21 | if self.dataset == 'mm4': 22 | args.output_dim1 = 4 23 | args.output_dim2 = 0 24 | args.metric_name = 'emo' 25 | elif self.dataset == 'mm7': 26 | args.output_dim1 = 7 27 | args.output_dim2 = 0 28 | args.metric_name = 'emo' 29 | 30 | def get_loaders(self): 31 | dataloaders = [] 32 | for data_type in ['train', 'val', 'test']: 33 | names, labels = self.read_names_labels(self.label_path, data_type, debug=self.debug) 34 | print (f'{data_type}: sample number {len(names)}') 35 | dataset = get_datasets(self.args, names, labels) 36 | 37 | if data_type in ['train']: 38 | dataloader = DataLoader(dataset, 39 | batch_size=self.batch_size, 40 | num_workers=self.num_workers, 41 | collate_fn=dataset.collater, 42 | pin_memory=True) 43 | else: 44 | dataloader = DataLoader(dataset, 45 | batch_size=self.batch_size, 46 | num_workers=self.num_workers, 47 | collate_fn=dataset.collater, 48 | shuffle=False, 49 | pin_memory=True) 50 | dataloaders.append(dataloader) 51 | train_loaders = [dataloaders[0]] 52 | eval_loaders = [dataloaders[1]] 53 | test_loaders = [dataloaders[2]] 54 | 55 | return train_loaders, eval_loaders, test_loaders 56 | 57 | 58 | def read_names_labels(self, label_path, data_type, debug=False): 59 | names, labels = [], [] 60 | if data_type == 'train': corpus = np.load(label_path, allow_pickle=True)['train_corpus'].tolist() 61 | if data_type == 'val': corpus = np.load(label_path, allow_pickle=True)['val_corpus'].tolist() 62 | if data_type == 'test': corpus = np.load(label_path, allow_pickle=True)['test_corpus'].tolist() 63 | for name in corpus: 64 | names.append(name) 65 | labels.append(corpus[name]) 66 | # for debug 67 | if debug: 68 | names = names[:100] 69 | labels = labels[:100] 70 | return names, labels 71 | 72 | 73 | # MELD 测试 7-emo classification performance 74 | def calculate_results(self, emo_probs=[], emo_labels=[], val_preds=[], val_labels=[]): 75 | 76 | emo_preds = np.argmax(emo_probs, 1) 77 | emo_accuracy = accuracy_score(emo_labels, emo_preds) 78 | emo_fscore = f1_score(emo_labels, emo_preds, average='weighted') 79 | 80 | results = { 81 | 'emoprobs': emo_probs, 82 | 'emolabels': emo_labels, 83 | 'emoacc': emo_accuracy, 84 | 'emofscore': emo_fscore 85 | } 86 | outputs = f'f1:{emo_fscore:.4f}_acc:{emo_accuracy:.4f}' 87 | 88 | return results, outputs 89 | 90 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/model/vgg_m_face_bn_fer_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Vgg_m_face_bn_fer_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Vgg_m_face_bn_fer_dag, self).__init__() 11 | self.meta = {'mean': [131.45376586914062, 103.98748016357422, 91.46234893798828], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [224, 224, 3]} 14 | self.conv1 = nn.Conv2d(3, 96, kernel_size=[7, 7], stride=(2, 2)) 15 | self.bn49 = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 16 | self.relu1 = nn.ReLU() 17 | self.pool1 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 18 | self.conv2 = nn.Conv2d(96, 256, kernel_size=[5, 5], stride=(2, 2), padding=(1, 1)) 19 | self.bn50 = nn.BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 20 | self.relu2 
= nn.ReLU() 21 | self.pool2 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=(0, 0), dilation=1, ceil_mode=True) 22 | self.conv3 = nn.Conv2d(256, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 23 | self.bn51 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 24 | self.relu3 = nn.ReLU() 25 | self.conv4 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 26 | self.bn52 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 27 | self.relu4 = nn.ReLU() 28 | self.conv5 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 29 | self.bn53 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 30 | self.relu5 = nn.ReLU() 31 | self.pool5 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 32 | self.fc6 = nn.Conv2d(512, 4096, kernel_size=[6, 6], stride=(1, 1)) 33 | self.bn54 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | self.relu6 = nn.ReLU() 35 | self.fc7 = nn.Conv2d(4096, 4096, kernel_size=[1, 1], stride=(1, 1)) 36 | self.bn55 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | self.relu7 = nn.ReLU() 38 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 39 | 40 | def forward(self, data): 41 | x1 = self.conv1(data) 42 | x2 = self.bn49(x1) 43 | x3 = self.relu1(x2) 44 | x4 = self.pool1(x3) 45 | x5 = self.conv2(x4) 46 | x6 = self.bn50(x5) 47 | x7 = self.relu2(x6) 48 | x8 = self.pool2(x7) 49 | x9 = self.conv3(x8) 50 | x10 = self.bn51(x9) 51 | x11 = self.relu3(x10) 52 | x12 = self.conv4(x11) 53 | x13 = self.bn52(x12) 54 | x14 = self.relu4(x13) 55 | x15 = self.conv5(x14) 56 | x16 = self.bn53(x15) 57 | x17 = self.relu5(x16) 58 | x18 = self.pool5(x17) 59 | x19 = self.fc6(x18) 60 | x20 = self.bn54(x19) 61 | x21 = self.relu6(x20) 62 | x22 = self.fc7(x21) 63 | x23 = self.bn55(x22) 64 | x24_preflatten = self.relu7(x23) 65 | x24 = x24_preflatten.view(x24_preflatten.size(0), -1) 66 | prediction = self.fc8(x24) 67 | return prediction 68 | 69 | def vgg_m_face_bn_fer_dag(weights_path=None, **kwargs): 70 | """ 71 | load imported model instance 72 | 73 | Args: 74 | weights_path (str): If set, loads model weights from the given path 75 | """ 76 | model = Vgg_m_face_bn_fer_dag() 77 | if weights_path: 78 | state_dict = torch.load(weights_path) 79 | model.load_state_dict(state_dict) 80 | return model -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/model/alexnet_face_fer_bn_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Alexnet_face_fer_bn_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Alexnet_face_fer_bn_dag, self).__init__() 11 | self.meta = {'mean': [131.09375, 103.88607788085938, 91.47599792480469], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [227, 227, 3]} 14 | self.conv1 = nn.Conv2d(3, 96, kernel_size=[11, 11], stride=(4, 4)) 15 | self.bn1 = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 16 | self.relu1 = nn.ReLU() 17 | self.pool1 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 18 | self.conv2 = nn.Conv2d(96, 256, kernel_size=[5, 5], stride=(1, 1), padding=(2, 2), groups=2) 19 | self.bn2 = nn.BatchNorm2d(256, eps=1e-05, 
momentum=0.1, affine=True, track_running_stats=True) 20 | self.relu2 = nn.ReLU() 21 | self.pool2 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 22 | self.conv3 = nn.Conv2d(256, 384, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 23 | self.bn3 = nn.BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 24 | self.relu3 = nn.ReLU() 25 | self.conv4 = nn.Conv2d(384, 384, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1), groups=2) 26 | self.bn4 = nn.BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 27 | self.relu4 = nn.ReLU() 28 | self.conv5 = nn.Conv2d(384, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1), groups=2) 29 | self.bn5 = nn.BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 30 | self.relu5 = nn.ReLU() 31 | self.pool5 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 32 | self.fc6 = nn.Conv2d(256, 4096, kernel_size=[6, 6], stride=(1, 1)) 33 | self.bn6 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | self.relu6 = nn.ReLU() 35 | self.fc7 = nn.Conv2d(4096, 4096, kernel_size=[1, 1], stride=(1, 1)) 36 | self.bn7 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | self.relu7 = nn.ReLU() 38 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 39 | 40 | def forward(self, data): 41 | x1 = self.conv1(data) 42 | x2 = self.bn1(x1) 43 | x3 = self.relu1(x2) 44 | x4 = self.pool1(x3) 45 | x5 = self.conv2(x4) 46 | x6 = self.bn2(x5) 47 | x7 = self.relu2(x6) 48 | x8 = self.pool2(x7) 49 | x9 = self.conv3(x8) 50 | x10 = self.bn3(x9) 51 | x11 = self.relu3(x10) 52 | x12 = self.conv4(x11) 53 | x13 = self.bn4(x12) 54 | x14 = self.relu4(x13) 55 | x15 = self.conv5(x14) 56 | x16 = self.bn5(x15) 57 | x17 = self.relu5(x16) 58 | x18 = self.pool5(x17) 59 | x19 = self.fc6(x18) 60 | x20 = self.bn6(x19) 61 | x21 = self.relu6(x20) 62 | x22 = self.fc7(x21) 63 | x23 = self.bn7(x22) 64 | x24_preflatten = self.relu7(x23) 65 | x24 = x24_preflatten.view(x24_preflatten.size(0), -1) 66 | prediction = self.fc8(x24) 67 | return prediction 68 | 69 | def alexnet_face_fer_bn_dag(weights_path=None, **kwargs): 70 | """ 71 | load imported model instance 72 | 73 | Args: 74 | weights_path (str): If set, loads model weights from the given path 75 | """ 76 | model = Alexnet_face_fer_bn_dag() 77 | if weights_path: 78 | state_dict = torch.load(weights_path) 79 | model.load_state_dict(state_dict) 80 | return model -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/tfn.py: -------------------------------------------------------------------------------- 1 | """ 2 | paper: Tensor Fusion Network for Multimodal Sentiment Analysis 3 | From: https://github.com/A2Zadeh/TensorFusionNetwork 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from .modules.encoder import MLPEncoder, LSTMEncoder 10 | 11 | class TFN(nn.Module): 12 | 13 | def __init__(self, args): 14 | 15 | super(TFN, self).__init__() 16 | 17 | text_dim = args.text_dim 18 | audio_dim = args.audio_dim 19 | video_dim = args.video_dim 20 | output_dim1 = args.output_dim1 21 | output_dim2 = args.output_dim2 22 | dropout= args.dropout 23 | self.hidden_dim = args.hidden_dim 24 | self.grad_clip = args.grad_clip 25 | 26 | # define the pre-fusion subnetworks [感觉输入的audio/video是句子级别,但是 text是词级别信息] 27 | if args.feat_type in 
['utt']: 28 | self.audio_encoder = MLPEncoder(audio_dim, self.hidden_dim, dropout) 29 | self.text_encoder = MLPEncoder(text_dim, self.hidden_dim, dropout) 30 | self.video_encoder = MLPEncoder(video_dim, self.hidden_dim, dropout) 31 | elif args.feat_type in ['frm_align', 'frm_unalign']: 32 | self.audio_encoder = LSTMEncoder(audio_dim, self.hidden_dim, dropout) 33 | self.text_encoder = LSTMEncoder(text_dim, self.hidden_dim, dropout) 34 | self.video_encoder = LSTMEncoder(video_dim, self.hidden_dim, dropout) 35 | 36 | # define the post_fusion layers 37 | self.post_fusion_dropout = nn.Dropout(p=dropout) 38 | self.post_fusion_layer_1 = nn.Linear((self.hidden_dim + 1) * (self.hidden_dim + 1) * (self.hidden_dim + 1), self.hidden_dim) 39 | self.post_fusion_layer_2 = nn.Linear(self.hidden_dim, self.hidden_dim) 40 | 41 | self.fc_out_1 = nn.Linear(self.hidden_dim, output_dim1) 42 | self.fc_out_2 = nn.Linear(self.hidden_dim, output_dim2) 43 | 44 | 45 | # audio/video是句子级别, text的word level 46 | def forward(self, batch): 47 | ''' 48 | Args: 49 | audio_x: tensor of shape (batch_size, audio_dim) 50 | video_x: tensor of shape (batch_size, video_dim) 51 | text_x: tensor of shape (batch_size, text_dim ) 52 | ''' 53 | 54 | audio_h = self.audio_encoder(batch['audios']) 55 | text_h = self.text_encoder(batch['texts']) 56 | video_h = self.video_encoder(batch['videos']) 57 | batch_size = audio_h.data.shape[0] 58 | 59 | # next we perform "tensor fusion", which is essentially appending 1s to the tensors and take Kronecker product 60 | add_one = torch.ones(size=[batch_size, 1], requires_grad=False).type_as(audio_h).to(audio_h.device) 61 | _audio_h = torch.cat((add_one, audio_h), dim=1) 62 | _video_h = torch.cat((add_one, video_h), dim=1) 63 | _text_h = torch.cat((add_one, text_h), dim=1) 64 | 65 | # outer product 66 | fusion_tensor = torch.bmm(_audio_h.unsqueeze(2), _video_h.unsqueeze(1)) 67 | 68 | # next we do kronecker product between fusion_tensor and _text_h. This is even trickier 69 | # we have to reshape the fusion tensor during the computation 70 | # in the end we don't keep the 3-D tensor, instead we flatten it 71 | fusion_tensor = fusion_tensor.view(-1, (self.hidden_dim + 1) * (self.hidden_dim + 1), 1) 72 | fusion_tensor = torch.bmm(fusion_tensor, _text_h.unsqueeze(1)).view(batch_size, -1) 73 | 74 | post_fusion_dropped = self.post_fusion_dropout(fusion_tensor) 75 | post_fusion_y_1 = F.relu(self.post_fusion_layer_1(post_fusion_dropped), inplace=True) 76 | features = F.relu(self.post_fusion_layer_2(post_fusion_y_1), inplace=True) 77 | 78 | emos_out = self.fc_out_1(features) 79 | vals_out = self.fc_out_2(features) 80 | interloss = torch.tensor(0).cuda() 81 | 82 | return features, emos_out, vals_out, interloss 83 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/modules/transformers_encoder/position_embedding.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | # Code adapted from the fairseq repo. 6 | 7 | def make_positions(tensor, padding_idx, left_pad): 8 | """Replace non-padding symbols with their position numbers. 9 | Position numbers begin at padding_idx+1. 10 | Padding symbols are ignored, but it is necessary to specify whether padding 11 | is added on the left side (left_pad=True) or right side (left_pad=False). 
12 | """ 13 | max_pos = padding_idx + 1 + tensor.size(1) 14 | device = tensor.get_device() 15 | buf_name = f'range_buf_{device}' 16 | if not hasattr(make_positions, buf_name): 17 | setattr(make_positions, buf_name, tensor.new()) 18 | setattr(make_positions, buf_name, getattr(make_positions, buf_name).type_as(tensor)) 19 | if getattr(make_positions, buf_name).numel() < max_pos: 20 | torch.arange(padding_idx + 1, max_pos, out=getattr(make_positions, buf_name)) 21 | mask = tensor.ne(padding_idx) 22 | positions = getattr(make_positions, buf_name)[:tensor.size(1)].expand_as(tensor) 23 | if left_pad: 24 | positions = positions - mask.size(1) + mask.long().sum(dim=1).unsqueeze(1) 25 | new_tensor = tensor.clone() 26 | return new_tensor.masked_scatter_(mask, positions[mask]).long() 27 | 28 | 29 | class SinusoidalPositionalEmbedding(nn.Module): 30 | """This module produces sinusoidal positional embeddings of any length. 31 | Padding symbols are ignored, but it is necessary to specify whether padding 32 | is added on the left side (left_pad=True) or right side (left_pad=False). 33 | """ 34 | 35 | def __init__(self, embedding_dim, padding_idx=0, left_pad=0, init_size=128): 36 | super().__init__() 37 | self.embedding_dim = embedding_dim 38 | self.padding_idx = padding_idx 39 | self.left_pad = left_pad 40 | self.weights = dict() # device --> actual weight; due to nn.DataParallel :-( 41 | self.register_buffer('_float_tensor', torch.FloatTensor(1)) 42 | 43 | @staticmethod 44 | def get_embedding(num_embeddings, embedding_dim, padding_idx=None): 45 | """Build sinusoidal embeddings. 46 | This matches the implementation in tensor2tensor, but differs slightly 47 | from the description in Section 3.5 of "Attention Is All You Need". 48 | """ 49 | half_dim = embedding_dim // 2 50 | emb = math.log(10000) / (half_dim - 1) 51 | emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) 52 | emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) 53 | emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) 54 | if embedding_dim % 2 == 1: 55 | # zero pad 56 | emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) 57 | if padding_idx is not None: 58 | emb[padding_idx, :] = 0 59 | return emb 60 | 61 | def forward(self, input): 62 | """Input is expected to be of size [bsz x seqlen].""" 63 | bsz, seq_len = input.size() 64 | max_pos = self.padding_idx + 1 + seq_len 65 | device = input.get_device() 66 | if device not in self.weights or max_pos > self.weights[device].size(0): 67 | # recompute/expand embeddings if needed 68 | self.weights[device] = SinusoidalPositionalEmbedding.get_embedding( 69 | max_pos, 70 | self.embedding_dim, 71 | self.padding_idx, 72 | ) 73 | self.weights[device] = self.weights[device].type_as(self._float_tensor).to(input.device) 74 | positions = make_positions(input, self.padding_idx, self.left_pad) 75 | return self.weights[device].index_select(0, positions.contiguous().view(-1)).view(bsz, seq_len, -1).detach() 76 | 77 | def max_positions(self): 78 | """Maximum number of supported positions.""" 79 | return int(1e5) # an arbitrary large number -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/extract_vggish_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | """ 3 | VGGish: https://arxiv.org/abs/1609.09430 4 | official github repo: 
https://github.com/tensorflow/models/tree/master/research/audioset/vggish 5 | """ 6 | 7 | import os 8 | import glob 9 | import time 10 | import argparse 11 | import numpy as np 12 | 13 | from vggish import vggish_input 14 | from vggish import vggish_params 15 | from vggish import vggish_slim 16 | import tensorflow.compat.v1 as tf # version: 1.15.0 (gpu) 17 | tf.disable_v2_behavior() 18 | 19 | # import config 20 | import sys 21 | sys.path.append('../../') 22 | import config 23 | 24 | def extract(audio_files, save_dir, feature_level, batch_size=2048): 25 | start_time = time.time() 26 | 27 | if feature_level == 'FRAME': label_interval = 50.0 28 | if feature_level == 'UTTERANCE': label_interval = 500.0 29 | 30 | with tf.Graph().as_default(), tf.Session() as sess: 31 | vggish_slim.define_vggish_slim(training=False) 32 | model_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, f'vggish/vggish_model.ckpt') 33 | vggish_slim.load_vggish_slim_checkpoint(sess, model_file) 34 | features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME) # get one layer 35 | embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME) # get one layer 36 | 37 | for i, audio_file in enumerate(audio_files, 1): 38 | print(f'Processing "{os.path.basename(audio_file)}" ({i}/{len(audio_files)})...') 39 | vid = os.path.basename(audio_file)[:-4] 40 | samples = vggish_input.wavfile_to_examples(audio_file, label_interval / 1000.0) # (segment_num, height(96), width(64)) 41 | sample_size = samples.shape[0] 42 | 43 | # model inference (max sample size: 6653, will cause OOM. Need to chunk samples.) 44 | embeddings = [] 45 | num_batches = int(np.ceil(sample_size / batch_size)) 46 | for i in range(num_batches): 47 | examples_batch = samples[i*batch_size:min((i+1)*batch_size, sample_size)] 48 | [embedding_batch] = sess.run([embedding_tensor], 49 | feed_dict={features_tensor: examples_batch}) 50 | embeddings.append(embedding_batch) 51 | embeddings = np.row_stack(embeddings) # (segment_num, featdim=128) 52 | 53 | # save feature 54 | csv_file = os.path.join(save_dir, f'{vid}.npy') 55 | if feature_level == 'UTTERANCE': 56 | embeddings = np.array(embeddings).squeeze() 57 | if len(embeddings.shape) != 1: 58 | embeddings = np.mean(embeddings, axis=0) # (featdim=128) 59 | np.save(csv_file, embeddings) 60 | else: 61 | np.save(csv_file, embeddings) 62 | 63 | end_time = time.time() 64 | print(f'Total time used: {end_time - start_time:.1f}s.') 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = argparse.ArgumentParser(description='Run.') 69 | parser.add_argument('--gpu', type=int, default=0, help='index of gpu') 70 | parser.add_argument('--feature_level', type=str, default='FRAME', help='feature_level: FRAME or UTTERANCE') 71 | parser.add_argument('--dataset', type=str, default='MER2023', help='input dataset') 72 | args = parser.parse_args() 73 | os.environ["CUDA_VISIBLE_DEVICES"] = f'{args.gpu}' 74 | 75 | audio_dir = config.PATH_TO_RAW_AUDIO[args.dataset] 76 | save_dir = config.PATH_TO_FEATURES[args.dataset] 77 | 78 | # in: get audios 79 | audio_files = glob.glob(os.path.join(audio_dir, '*.wav')) 80 | print(f'Find total "{len(audio_files)}" audio files.') 81 | 82 | # out: check dir 83 | dir_name = f'vggish_{args.feature_level[:3]}' 84 | save_dir = os.path.join(save_dir, dir_name) 85 | if not os.path.exists(save_dir): os.makedirs(save_dir) 86 | 87 | # extract features 88 | extract(audio_files, save_dir, args.feature_level) 89 | 
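The script above writes one `.npy` file per audio clip into `vggish_FRA/` or `vggish_UTT/` under the dataset's feature directory: frame-level features have shape `(segment_num, 128)`, while utterance-level features are mean-pooled to a single 128-dimensional vector. As a quick sanity check of the dumped features, a minimal sketch along the following lines can be used (the `feat_dir` path below is a placeholder, not part of the repository):

```python
import os
import glob
import numpy as np

# Placeholder path: point this at the output of extract_vggish_embedding.py,
# e.g. <PATH_TO_FEATURES[dataset]>/vggish_UTT or .../vggish_FRA.
feat_dir = './features/vggish_UTT'

for npy_path in sorted(glob.glob(os.path.join(feat_dir, '*.npy'))):
    feat = np.load(npy_path)
    if feat.ndim == 1:
        # UTTERANCE level: a single 128-d VGGish embedding per clip.
        assert feat.shape == (128,), f'unexpected shape {feat.shape}'
    else:
        # FRAME level: one 128-d embedding per 0.96 s mel-spectrogram example.
        assert feat.ndim == 2 and feat.shape[1] == 128, f'unexpected shape {feat.shape}'
    print(os.path.basename(npy_path), feat.shape)
```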
-------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/extract_imagenet_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torchvision 9 | from torchvision import transforms 10 | 11 | # import config 12 | import sys 13 | sys.path.append('../../') 14 | import config 15 | from dataset import FaceDataset 16 | 17 | 18 | def extract(data_loader, model): 19 | model.eval() 20 | with torch.no_grad(): 21 | features, timestamps = [], [] 22 | for images, names in data_loader: 23 | images = images.cuda() 24 | embedding = model(images) 25 | embedding = embedding.squeeze() # [32, 512, 1, 1] => [32, 512] 26 | features.append(embedding.cpu().detach().numpy()) 27 | timestamps.extend(names) 28 | features, timestamps = np.row_stack(features), np.array(timestamps) 29 | return features, timestamps 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description='Run.') 34 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset') 35 | parser.add_argument('--feature_level', type=str, default='UTTERANCE', help='feature level [FRAME or UTTERANCE]') 36 | parser.add_argument('--gpu', type=str, default='1', help='gpu id') 37 | params = parser.parse_args() 38 | os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu 39 | 40 | print('==> Extracting imagenet embedding...') 41 | face_dir = config.PATH_TO_RAW_FACE[params.dataset] 42 | save_dir = os.path.join(config.PATH_TO_FEATURES[params.dataset], f'imagenet_{params.feature_level[:3]}') 43 | if not os.path.exists(save_dir): os.makedirs(save_dir) 44 | 45 | # load model 46 | model = torchvision.models.resnet18(True) 47 | model = model.cuda() 48 | model = nn.Sequential(*list(model.children())[:-1]) 49 | 50 | # transform 51 | transform = transforms.Compose([transforms.Resize((224, 224)), 52 | transforms.ToTensor(), 53 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 54 | 55 | # extract embedding video by video 56 | vids = os.listdir(face_dir) 57 | EMBEDDING_DIM = -1 58 | print(f'Find total "{len(vids)}" videos.') 59 | for i, vid in enumerate(vids, 1): 60 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 61 | 62 | # forward 63 | dataset = FaceDataset(vid, face_dir, transform=transform) 64 | if len(dataset) == 0: 65 | print("Warning: number of frames of video {} should not be zero.".format(vid)) 66 | embeddings, framenames = [], [] 67 | else: 68 | data_loader = torch.utils.data.DataLoader(dataset, 69 | batch_size=32, 70 | num_workers=4, 71 | pin_memory=True) 72 | embeddings, framenames = extract(data_loader, model) 73 | 74 | # save results 75 | indexes = np.argsort(framenames) 76 | embeddings = embeddings[indexes] 77 | framenames = framenames[indexes] 78 | EMBEDDING_DIM = max(EMBEDDING_DIM, np.shape(embeddings)[-1]) 79 | 80 | csv_file = os.path.join(save_dir, f'{vid}.npy') 81 | if params.feature_level == 'FRAME': 82 | embeddings = np.array(embeddings).squeeze() 83 | if len(embeddings) == 0: 84 | embeddings = np.zeros((1, EMBEDDING_DIM)) 85 | elif len(embeddings.shape) == 1: 86 | embeddings = embeddings[np.newaxis, :] 87 | np.save(csv_file, embeddings) 88 | else: 89 | embeddings = np.array(embeddings).squeeze() 90 | if len(embeddings) == 0: 91 | embeddings = np.zeros((EMBEDDING_DIM, )) 92 | elif len(embeddings.shape) == 2: 93 | embeddings = 
np.mean(embeddings, axis=0) 94 | np.save(csv_file, embeddings) 95 | 96 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/extract_wav2vec_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | """ 3 | wav2vec: https://arxiv.org/abs/1904.05862 4 | official github repo: https://github.com/pytorch/fairseq/tree/master/examples/wav2vec 5 | """ 6 | import os 7 | import time 8 | import glob 9 | import torch 10 | import argparse 11 | import numpy as np 12 | import soundfile as sf 13 | from fairseq.models.wav2vec import Wav2VecModel # Note: use fairseq version of 0.10.1, error occurred when using the newest officical script and version of 0.10.2 (pip install fairseq==0.10.1) 14 | 15 | # import config 16 | import sys 17 | sys.path.append('../../') 18 | import config 19 | 20 | def write_feature_to_npy(feature, feature_level, save_path): 21 | if feature_level == 'UTTERANCE': 22 | feature = np.array(feature).squeeze() # [C,] 23 | if len(feature.shape) != 1: # change [T, C] => [C,] 24 | feature = np.mean(feature, axis=0) 25 | np.save(save_path, feature) 26 | else: 27 | np.save(save_path, feature) 28 | 29 | def extract(audio_files, feature_level, model, save_dir, gpu=None): 30 | start_time = time.time() 31 | device = torch.device(f'cuda:{gpu}') 32 | 33 | # create folders [save two features in 'wav2vec-large'] 34 | dir_name = 'wav2vec-large' 35 | out_dir_z = os.path.join(save_dir, f'{dir_name}-z-{feature_level[:3]}') # features output by feature encoder 36 | out_dir_c = os.path.join(save_dir, f'{dir_name}-c-{feature_level[:3]}') # features output by context network 37 | if not os.path.exists(out_dir_z): os.makedirs(out_dir_z) 38 | if not os.path.exists(out_dir_c): os.makedirs(out_dir_c) 39 | 40 | # iterate audios 41 | for idx, wav_file in enumerate(audio_files, 1): 42 | file_name = os.path.basename(wav_file) 43 | vid = file_name[:-4] 44 | print(f'Processing "{file_name}" ({idx}/{len(audio_files)})...') 45 | # load audio 46 | audio, sampling_rate = sf.read(wav_file) 47 | audio = audio.astype('float32')[np.newaxis, :] 48 | audio = torch.from_numpy(audio) 49 | audio = audio.to(device) 50 | assert sampling_rate == 16000, f'Error: sampling rate ({sampling_rate}) != 16k!' 
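        # NOTE (assumption): the wav2vec_large checkpoint expects 16 kHz mono input, hence the
        # assert above; audios recorded at other sampling rates would need to be resampled
        # beforehand (e.g. with ffmpeg or torchaudio), since this script only rejects them.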
51 | with torch.no_grad(): 52 | z = model.feature_extractor(audio) # (1, C, T), stride: 10ms (100Hz), receptive field: 30ms 53 | c = model.feature_aggregator(z) # (1, C, T), stride: 10ms (100Hz), receptive field: 801ms (for large version) 54 | 55 | # save 56 | z_feature = z.detach().squeeze().t().cpu().numpy() 57 | c_feature = c.detach().squeeze().t().cpu().numpy() 58 | z_npy_file = os.path.join(out_dir_z, f'{vid}.npy') 59 | c_npy_file = os.path.join(out_dir_c, f'{vid}.npy') 60 | write_feature_to_npy(z_feature, feature_level, z_npy_file) 61 | write_feature_to_npy(c_feature, feature_level, c_npy_file) 62 | 63 | end_time = time.time() 64 | print(f'Total time used: {end_time - start_time:.1f}s.') 65 | 66 | if __name__ == '__main__': 67 | parser = argparse.ArgumentParser(description='Run.') 68 | parser.add_argument('--gpu', type=int, default=0, help='index of gpu') 69 | parser.add_argument('--feature_level', type=str, default='FRAME', help='name of feature level, FRAME or UTTERANCE') 70 | parser.add_argument('--dataset', type=str, default='MER2023', help='dataset') 71 | args = parser.parse_args() 72 | 73 | # gain paths 74 | audio_dir = config.PATH_TO_RAW_AUDIO[args.dataset] 75 | save_dir = config.PATH_TO_FEATURES[args.dataset] 76 | audio_files = glob.glob(os.path.join(audio_dir, '*.wav')) 77 | print(f'Find total "{len(audio_files)}" audio files.') 78 | 79 | # load model 80 | device = torch.device(f'cuda:{args.gpu}') 81 | model_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, f'wav2vec/wav2vec_large.pt') 82 | cp = torch.load(model_file) 83 | model = Wav2VecModel.build_model(cp['args'], task=None) 84 | model.load_state_dict(cp['model']) 85 | model.to(device) 86 | model.eval() 87 | 88 | # extract features 89 | extract(audio_files, args.feature_level, model, save_dir, args.gpu) 90 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/extract_emonet_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn.parallel 8 | import torch.optim 9 | import torch.utils.data 10 | import torch.utils.data.distributed 11 | import torchvision.transforms as transforms 12 | 13 | from emonet.models.emonet import EmoNet 14 | from dataset import FaceDatasetForEmoNet 15 | from emonet.data_augmentation import DataAugmentor 16 | 17 | # import config 18 | import sys 19 | sys.path.append('../../') 20 | import config 21 | 22 | def extract(data_loader, model): 23 | model.eval() 24 | with torch.no_grad(): 25 | features, timestamps = [], [] 26 | for images, names in data_loader: 27 | images = images.cuda() 28 | embedding = model(images, return_embedding=True) 29 | features.append(embedding.cpu().detach().numpy()) 30 | timestamps.extend(names) 31 | features, timestamps = np.row_stack(features), np.array(timestamps) 32 | return features, timestamps 33 | 34 | if __name__ == '__main__': 35 | parser = argparse.ArgumentParser(description='Run.') 36 | parser.add_argument('--dataset', type=str, default='MER2023', help='input dataset') 37 | parser.add_argument('--feature_level', type=str, default='UTTERANCE', help='feature level [FRAME or UTTERANCE]') 38 | parser.add_argument('--gpu', type=str, default='0', help='gpu id') 39 | params = parser.parse_args() 40 | os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu 41 | 42 | print(f'==> Extracting emonet embedding...') 43 | face_dir = 
config.PATH_TO_RAW_FACE[params.dataset] 44 | save_dir = os.path.join(config.PATH_TO_FEATURES[params.dataset], f'emonet_{params.feature_level[:3]}') 45 | if not os.path.exists(save_dir): os.makedirs(save_dir) 46 | 47 | # load model 48 | model = EmoNet().cuda() 49 | checkpoint_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, 'emonet/emonet_8.pth') 50 | checkpoint = torch.load(checkpoint_file) 51 | pre_trained_dict = {k.replace('module.', ''): v for k,v in checkpoint.items()} 52 | model.load_state_dict(pre_trained_dict) 53 | 54 | # transform 55 | augmentor = DataAugmentor(256, 256) 56 | transform = transforms.Compose([transforms.ToTensor()]) 57 | 58 | # extract embedding video by video 59 | vids = os.listdir(face_dir) 60 | EMBEDDING_DIM = -1 61 | print(f'Find total "{len(vids)}" videos.') 62 | for i, vid in enumerate(vids, 1): 63 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 64 | # csv_file = os.path.join(save_dir, f'{vid}.npy') 65 | # if os.path.exists(csv_file): continue 66 | 67 | # forward 68 | dataset = FaceDatasetForEmoNet(vid, face_dir, transform=transform, augmentor=augmentor) 69 | if len(dataset) == 0: 70 | print("Warning: number of frames of video {} should not be zero.".format(vid)) 71 | embeddings, framenames = [], [] 72 | else: 73 | data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True) 74 | embeddings, framenames = extract(data_loader, model) 75 | 76 | # save results 77 | indexes = np.argsort(framenames) 78 | embeddings = embeddings[indexes] 79 | framenames = framenames[indexes] 80 | EMBEDDING_DIM = max(EMBEDDING_DIM, np.shape(embeddings)[-1]) 81 | 82 | csv_file = os.path.join(save_dir, f'{vid}.npy') 83 | if params.feature_level == 'FRAME': 84 | embeddings = np.array(embeddings).squeeze() 85 | if len(embeddings) == 0: 86 | embeddings = np.zeros((1, EMBEDDING_DIM)) 87 | elif len(embeddings.shape) == 1: 88 | embeddings = embeddings[np.newaxis, :] 89 | np.save(csv_file, embeddings) 90 | else: 91 | embeddings = np.array(embeddings).squeeze() 92 | if len(embeddings) == 0: 93 | embeddings = np.zeros((EMBEDDING_DIM, )) 94 | elif len(embeddings.shape) == 2: 95 | embeddings = np.mean(embeddings, axis=0) 96 | np.save(csv_file, embeddings) -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_postprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Post-process embeddings from VGGish.""" 17 | 18 | import numpy as np 19 | 20 | import vggish_params 21 | 22 | 23 | class Postprocessor(object): 24 | """Post-processes VGGish embeddings. 25 | 26 | The initial release of AudioSet included 128-D VGGish embeddings for each 27 | segment of AudioSet. 
These released embeddings were produced by applying 28 | a PCA transformation (technically, a whitening transform is included as well) 29 | and 8-bit quantization to the raw embedding output from VGGish, in order to 30 | stay compatible with the YouTube-8M project which provides visual embeddings 31 | in the same format for a large set of YouTube videos. This class implements 32 | the same PCA (with whitening) and quantization transformations. 33 | """ 34 | 35 | def __init__(self, pca_params_npz_path): 36 | """Constructs a postprocessor. 37 | 38 | Args: 39 | pca_params_npz_path: Path to a NumPy-format .npz file that 40 | contains the PCA parameters used in postprocessing. 41 | """ 42 | params = np.load(pca_params_npz_path) 43 | self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME] 44 | # Load means into a column vector for easier broadcasting later. 45 | self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1) 46 | assert self._pca_matrix.shape == ( 47 | vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), ( 48 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) 49 | assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), ( 50 | 'Bad PCA means shape: %r' % (self._pca_means.shape,)) 51 | 52 | def postprocess(self, embeddings_batch): 53 | """Applies postprocessing to a batch of embeddings. 54 | 55 | Args: 56 | embeddings_batch: An nparray of shape [batch_size, embedding_size] 57 | containing output from the embedding layer of VGGish. 58 | 59 | Returns: 60 | An nparray of the same shape as the input but of type uint8, 61 | containing the PCA-transformed and quantized version of the input. 62 | """ 63 | assert len(embeddings_batch.shape) == 2, ( 64 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) 65 | assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, ( 66 | 'Bad batch shape: %r' % (embeddings_batch.shape,)) 67 | 68 | # Apply PCA. 69 | # - Embeddings come in as [batch_size, embedding_size]. 70 | # - Transpose to [embedding_size, batch_size]. 71 | # - Subtract pca_means column vector from each column. 72 | # - Premultiply by PCA matrix of shape [output_dims, input_dims] 73 | # where both are are equal to embedding_size in our case. 74 | # - Transpose result back to [batch_size, embedding_size]. 75 | pca_applied = np.dot(self._pca_matrix, 76 | (embeddings_batch.T - self._pca_means)).T 77 | 78 | # Quantize by: 79 | # - clipping to [min, max] range 80 | clipped_embeddings = np.clip( 81 | pca_applied, vggish_params.QUANTIZE_MIN_VAL, 82 | vggish_params.QUANTIZE_MAX_VAL) 83 | # - convert to 8-bit in range [0.0, 255.0] 84 | quantized_embeddings = ( 85 | (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * 86 | (255.0 / 87 | (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) 88 | # - cast 8-bit float to uint8 89 | quantized_embeddings = quantized_embeddings.astype(np.uint8) 90 | 91 | return quantized_embeddings 92 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_smoke_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """A smoke test for VGGish. 17 | 18 | This is a simple smoke test of a local install of VGGish and its associated 19 | downloaded files. We create a synthetic sound, extract log mel spectrogram 20 | features, run them through VGGish, post-process the embedding ouputs, and 21 | check some simple statistics of the results, allowing for variations that 22 | might occur due to platform/version differences in the libraries we use. 23 | 24 | Usage: 25 | - Download the VGGish checkpoint and PCA parameters into the same directory as 26 | the VGGish source code. If you keep them elsewhere, update the checkpoint_path 27 | and pca_params_path variables below. 28 | - Run: 29 | $ python vggish_smoke_test.py 30 | """ 31 | 32 | from __future__ import print_function 33 | 34 | import numpy as np 35 | import tensorflow.compat.v1 as tf 36 | import os 37 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 38 | tf.disable_v2_behavior() 39 | 40 | import vggish_input 41 | import vggish_params 42 | import vggish_postprocess 43 | import vggish_slim 44 | 45 | print('\nTesting your install of VGGish\n') 46 | 47 | # Paths to downloaded VGGish files. 48 | checkpoint_path = 'vggish_model.ckpt' 49 | pca_params_path = 'vggish_pca_params.npz' 50 | 51 | # Relative tolerance of errors in mean and standard deviation of embeddings. 52 | rel_error = 0.1 # Up to 10% 53 | 54 | # Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate 55 | # to test resampling to 16 kHz during feature extraction). 56 | num_secs = 3 57 | freq = 1000 58 | sr = 44100 59 | t = np.linspace(0, num_secs, int(num_secs * sr)) 60 | x = np.sin(2 * np.pi * freq * t) 61 | 62 | # Produce a batch of log mel spectrogram examples. 63 | input_batch = vggish_input.waveform_to_examples(x, sr) 64 | print('Log Mel Spectrogram example: ', input_batch[0]) 65 | np.testing.assert_equal( 66 | input_batch.shape, 67 | [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS]) 68 | 69 | # Define VGGish, load the checkpoint, and run the batch through the model to 70 | # produce embeddings. 71 | with tf.Graph().as_default(), tf.Session() as sess: 72 | vggish_slim.define_vggish_slim() 73 | vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) 74 | 75 | features_tensor = sess.graph.get_tensor_by_name( 76 | vggish_params.INPUT_TENSOR_NAME) 77 | embedding_tensor = sess.graph.get_tensor_by_name( 78 | vggish_params.OUTPUT_TENSOR_NAME) 79 | [embedding_batch] = sess.run([embedding_tensor], 80 | feed_dict={features_tensor: input_batch}) 81 | print('VGGish embedding: ', embedding_batch[0]) 82 | expected_embedding_mean = 0.131 83 | expected_embedding_std = 0.238 84 | np.testing.assert_allclose( 85 | [np.mean(embedding_batch), np.std(embedding_batch)], 86 | [expected_embedding_mean, expected_embedding_std], 87 | rtol=rel_error) 88 | 89 | # Postprocess the results to produce whitened quantized embeddings. 
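# (The Postprocessor applies the released AudioSet PCA/whitening transform followed by
# 8-bit quantization, so the postprocessed embeddings below are uint8 values in [0, 255]
# rather than raw floats, which is why the expected mean/std are much larger here.)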
90 | pproc = vggish_postprocess.Postprocessor(pca_params_path) 91 | postprocessed_batch = pproc.postprocess(embedding_batch) 92 | print('Postprocessed VGGish embedding: ', postprocessed_batch[0]) 93 | expected_postprocessed_mean = 123.0 94 | expected_postprocessed_std = 75.0 95 | np.testing.assert_allclose( 96 | [np.mean(postprocessed_batch), np.std(postprocessed_batch)], 97 | [expected_postprocessed_mean, expected_postprocessed_std], 98 | rtol=rel_error) 99 | 100 | print('\nLooks Good To Me!\n') 101 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/cmumosi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pickle 4 | from toolkit.utils.chatgpt import * 5 | from toolkit.utils.functions import * 6 | from toolkit.utils.read_files import * 7 | 8 | 9 | def generate_transcription(label_path, save_path): 10 | ## read pkl file 11 | names, eng_sentences = [], [] 12 | videoIDs, _, _, videoSentences, _, _, _ = pickle.load(open(label_path, "rb"), encoding='latin1') 13 | for vid in videoIDs: 14 | names.extend(videoIDs[vid]) 15 | eng_sentences.extend(videoSentences[vid]) 16 | print (f'whole sample number: {len(names)}') 17 | 18 | # translate eng2chi 19 | chi_sentences = [] 20 | for eng in eng_sentences: 21 | # chi = get_translate_eng2chi(eng, model='gpt-3.5-turbo-16k-0613') 22 | chi = get_translate_eng2chi(eng, model='gpt-4-0613') 23 | chi_sentences.append(chi) 24 | 25 | ## write to csv file 26 | name2key = {} 27 | for ii, name in enumerate(names): 28 | name2key[name] = [chi_sentences[ii], eng_sentences[ii]] 29 | func_write_key_to_csv(save_path, names, name2key, ['chinese', 'english']) 30 | 31 | 32 | def read_train_val_test(label_path, data_type): 33 | names, labels = [], [] 34 | assert data_type in ['train', 'val', 'test'] 35 | videoIDs, videoLabels, _, _, trainVids, valVids, testVids = pickle.load(open(label_path, "rb"), encoding='latin1') 36 | if data_type == 'train': vids = trainVids 37 | if data_type == 'val': vids = valVids 38 | if data_type == 'test': vids = testVids 39 | for vid in vids: 40 | names.extend(videoIDs[vid]) 41 | labels.extend(videoLabels[vid]) 42 | return names, labels 43 | 44 | 45 | def normalize_dataset_format(data_root, save_root): 46 | # gain paths 47 | label_path = os.path.join(save_root, 'CMUMOSI_features_raw_2way.pkl') 48 | assert os.path.exists(label_path), f'must has a pre-processed label file' 49 | video_root = os.path.join(data_root, 'Video/Segmented') 50 | 51 | # gain (names, labels) 52 | train_names, train_labels = read_train_val_test(label_path, 'train') 53 | val_names, val_labels = read_train_val_test(label_path, 'val') 54 | test_names, test_labels = read_train_val_test(label_path, 'test') 55 | print (f'train number: {len(train_names)}') 56 | print (f'val number: {len(val_names)}') 57 | print (f'test number: {len(test_names)}') 58 | 59 | ## output path 60 | save_video = os.path.join(save_root, 'subvideo') 61 | save_label = os.path.join(save_root, 'label.npz') 62 | save_trans = os.path.join(save_root, 'transcription.csv') 63 | if not os.path.exists(save_root): os.makedirs(save_root) 64 | if not os.path.exists(save_video): os.makedirs(save_video) 65 | 66 | ## generate new transcripts 67 | generate_transcription(label_path, save_trans) 68 | 69 | ## generate label path 70 | whole_corpus = {} 71 | for name, videonames, labels in [('train', train_names, train_labels), 72 | ('val', val_names, val_labels ), 73 | ('test', test_names, 
test_labels )]: 74 | whole_corpus[name] = {} 75 | for ii, videoname in enumerate(videonames): 76 | whole_corpus[name][videoname] = {'emo': 0, 'val': labels[ii]} 77 | 78 | # move video 79 | video_path = os.path.join(video_root, videoname+'.mp4') 80 | save_path = os.path.join(save_video, videoname+'.mp4') 81 | shutil.copy(video_path, save_path) 82 | 83 | np.savez_compressed(save_label, 84 | train_corpus=whole_corpus['train'], 85 | val_corpus=whole_corpus['val'], 86 | test_corpus=whole_corpus['test']) 87 | 88 | 89 | if __name__ == '__main__': 90 | 91 | data_root = 'G:\\CMU-MOSI\\Raw' 92 | save_root = 'E:\\Dataset\\cmumosi-process' 93 | normalize_dataset_format(data_root, save_root) 94 | 95 | # data_root = 'H:\\desktop\\Multimedia-Transformer\\chinese-mer-2023\\dataset\\cmumosi-process' 96 | # trans_path = os.path.join(data_root, 'transcription.csv') 97 | # polish_path = os.path.join(data_root, 'transcription-engchi-polish.csv') 98 | # func_translate_transcript_polish_merge(trans_path, polish_path) # 再次检测一下遗漏的部分 99 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/meld.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from toolkit.utils.chatgpt import * 4 | from toolkit.utils.functions import * 5 | from toolkit.utils.read_files import * 6 | 7 | emos = ['anger', 'joy', 'sadness', 'neutral', 'disgust', 'fear', 'surprise'] 8 | emo2idx, idx2emo = {}, {} 9 | for ii, emo in enumerate(emos): 10 | emo2idx[emo] = ii 11 | idx2emo[ii] = emo 12 | 13 | 14 | def read_labels(label_path): 15 | 16 | dia_ids = func_read_key_from_csv(label_path, 'Dialogue_ID') 17 | utt_ids = func_read_key_from_csv(label_path, 'Utterance_ID') 18 | labels = func_read_key_from_csv(label_path, 'Emotion') 19 | engs = func_read_key_from_csv(label_path, 'Utterance') 20 | 21 | names = [] 22 | for ii in range(len(dia_ids)): 23 | names.append(f'dia{dia_ids[ii]}_utt{utt_ids[ii]}') 24 | 25 | labels = [emo2idx[label] for label in labels] 26 | 27 | return names, labels, engs 28 | 29 | 30 | def normalize_dataset_format(data_root, save_root): 31 | 32 | # gain paths 33 | train_label_path = os.path.join(data_root, 'train_sent_emo.csv') 34 | train_video_root = os.path.join(data_root, 'train') 35 | val_label_path = os.path.join(data_root, 'dev_sent_emo.csv') 36 | val_video_root = os.path.join(data_root, 'dev') 37 | test_label_path = os.path.join(data_root, 'test_sent_emo.csv') 38 | test_video_root = os.path.join(data_root, 'test') 39 | 40 | # gain (names, labels) 41 | train_names, train_labels, train_engs = read_labels(train_label_path) 42 | val_names, val_labels, val_engs = read_labels(val_label_path) 43 | test_names, test_labels, test_engs = read_labels(test_label_path) 44 | print (f'train number: {len(train_names)}') 45 | print (f'val number: {len(val_names)}') 46 | print (f'test number: {len(test_names)}') 47 | 48 | ## output path 49 | save_video = os.path.join(save_root, 'subvideo') 50 | save_label = os.path.join(save_root, 'label.npz') 51 | save_trans = os.path.join(save_root, 'transcription.csv') 52 | if not os.path.exists(save_root): os.makedirs(save_root) 53 | if not os.path.exists(save_video): os.makedirs(save_video) 54 | 55 | ## generate label path 56 | name2eng = {} 57 | whole_corpus = {} 58 | for datatype, names, labels, engs, video_root in [('train', train_names, train_labels, train_engs, train_video_root), 59 | ('val', val_names, val_labels, val_engs, val_video_root), 60 | ('test', test_names, 
test_labels, test_engs, test_video_root)]: 61 | whole_corpus[datatype] = {} 62 | for ii, name in enumerate(names): 63 | newname = f'{datatype}_{name}' 64 | whole_corpus[datatype][newname] = {'emo': labels[ii], 'val': -10} # save labels 65 | name2eng[newname] = engs[ii] # save trans 66 | 67 | # move video 68 | video_path = os.path.join(video_root, name+'.mp4') 69 | save_path = os.path.join(save_video, newname+'.mp4') 70 | if os.path.exists(save_path): continue 71 | try: 72 | shutil.copy(video_path, save_path) 73 | except: 74 | print (f'ERROR: {video_path} does not exist!') 75 | 76 | # save labels 77 | np.savez_compressed(save_label, 78 | train_corpus=whole_corpus['train'], 79 | val_corpus=whole_corpus['val'], 80 | test_corpus=whole_corpus['test']) 81 | 82 | # save trans 83 | names = [name for name in name2eng] 84 | name2key = {} 85 | for ii, name in enumerate(names): 86 | name2key[name] = [name2eng[name]] 87 | func_write_key_to_csv(save_trans, names, name2key, ['english']) 88 | 89 | 90 | if __name__ == '__main__': 91 | 92 | data_root = 'E:\\Dataset\\MELD' 93 | save_root = 'E:\\Dataset\\meld-process' 94 | normalize_dataset_format(data_root, save_root) 95 | 96 | # data_root = 'E:\\Dataset\\meld-process' 97 | # trans_path = os.path.join(data_root, 'transcription.csv') 98 | # polish_path = os.path.join(data_root, 'transcription-engchi-polish.csv') 99 | # func_translate_transcript_polish_merge(trans_path, polish_path) # 再次检测一下遗漏的部分 100 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_input.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Compute input examples for VGGish from audio waveform.""" 17 | 18 | import numpy as np 19 | import resampy # verison: 0.2.2, pip install resampy 20 | import math 21 | from vggish import mel_features 22 | from vggish import vggish_params 23 | 24 | try: 25 | import soundfile as sf 26 | 27 | def wav_read(wav_file): 28 | wav_data, sr = sf.read(wav_file, dtype='int16') 29 | return wav_data, sr 30 | 31 | except ImportError: 32 | 33 | def wav_read(wav_file): 34 | raise NotImplementedError('WAV file reading requires soundfile package.') 35 | 36 | 37 | def waveform_to_examples(data, sample_rate, hop_sec): 38 | """Converts audio waveform into an array of examples for VGGish. 39 | 40 | Args: 41 | data: np.array of either one dimension (mono) or two dimensions 42 | (multi-channel, with the outer dimension representing channels). 43 | Each sample is generally expected to lie in the range [-1.0, +1.0], 44 | although this is not required. 45 | sample_rate: Sample rate of data. 
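    hop_sec: Hop between successive examples, in seconds; this argument replaces the
      fixed vggish_params.EXAMPLE_HOP_SECONDS used in the upstream implementation.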
46 | 47 | Returns: 48 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents 49 | a sequence of examples, each of which contains a patch of log mel 50 | spectrogram, covering num_frames frames of audio and num_bands mel frequency 51 | bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. 52 | """ 53 | # Convert to mono. 54 | if len(data.shape) > 1: 55 | data = np.mean(data, axis=1) 56 | # Resample to the rate assumed by VGGish. 57 | if sample_rate != vggish_params.SAMPLE_RATE: 58 | data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) 59 | 60 | # Compute log mel spectrogram features. 61 | log_mel = mel_features.log_mel_spectrogram( 62 | data, 63 | audio_sample_rate=vggish_params.SAMPLE_RATE, 64 | log_offset=vggish_params.LOG_OFFSET, 65 | window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, 66 | hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, 67 | num_mel_bins=vggish_params.NUM_MEL_BINS, 68 | lower_edge_hertz=vggish_params.MEL_MIN_HZ, 69 | upper_edge_hertz=vggish_params.MEL_MAX_HZ) 70 | 71 | # Frame features into examples. 72 | features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS 73 | example_window_length = int(round( 74 | vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) 75 | example_hop_length = int(round( 76 | hop_sec * features_sample_rate)) 77 | # vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) # orginal 78 | log_mel_examples = mel_features.frame( 79 | log_mel, 80 | window_length=example_window_length, 81 | hop_length=example_hop_length) 82 | return log_mel_examples 83 | 84 | 85 | def wavfile_to_examples(wav_file, hop_sec): 86 | """Convenience wrapper around waveform_to_examples() for a common WAV format. 87 | 88 | Args: 89 | wav_file: String path to a file, or a file-like object. The file 90 | is assumed to contain WAV audio data with signed 16-bit PCM samples. 91 | 92 | Returns: 93 | See waveform_to_examples. 
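    Note: clips shorter than one second are tiled up to at least 1 s before feature
    extraction, and hop_sec is forwarded unchanged to waveform_to_examples().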
94 | """ 95 | wav_data, sr = wav_read(wav_file) 96 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 97 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] 98 | 99 | ### process for samples < 1000ms, pad to longer than 1000ms 100 | if len(samples) < sr: 101 | samples = samples.tolist() 102 | samples = samples * math.ceil(sr/len(samples)) 103 | samples = np.array(samples) 104 | 105 | return waveform_to_examples(samples, sr, hop_sec) 106 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/chatgpt.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import cv2 4 | import glob 5 | import base64 6 | import numpy as np 7 | 8 | import openai 9 | 10 | # avoid RPD errors 11 | global_index = 1 12 | candidate_keys = ["sk-xxxx", "sk-xxxx", "sk-xxxx"] # Please use your own APIs, we support multiple APIs 13 | openai.api_key = candidate_keys[global_index] 14 | 15 | # 单次调用 16 | def func_get_completion(prompt, model="gpt-3.5-turbo-16k-0613"): 17 | try: 18 | messages = [{"role": "user", "content": prompt}] 19 | response = openai.ChatCompletion.create( 20 | model=model, 21 | messages=messages, 22 | temperature=0, # this is the degree of randomness 23 | max_tokens=1000, 24 | ) 25 | return response['choices'][0]['message']['content'] 26 | except Exception as e: 27 | print ('发生错误:', e) # change key to avoid RPD 28 | global global_index # 修改全局变量 29 | global_index = (global_index + 1) % 3 30 | print (f'========== key index: {global_index} ==========') 31 | openai.api_key = candidate_keys[global_index] 32 | return '' 33 | 34 | # 多次调用,避免网络异常 35 | def get_completion(prompt, model, maxtry=5): 36 | response = '' 37 | try_number = 0 38 | while len(response) == 0: 39 | try_number += 1 40 | if try_number == maxtry: 41 | print (f'fail for {maxtry} times') 42 | break 43 | response = func_get_completion(prompt, model) 44 | return response 45 | 46 | # chatgpt输出结果后处理 47 | def func_postprocess_chatgpt(response): 48 | response = response.strip() 49 | if response.startswith("输入"): response = response[len("输入"):] 50 | if response.startswith("输出"): response = response[len("输出"):] 51 | if response.startswith("翻译"): response = response[len("翻译"):] 52 | if response.startswith("让我们来翻译一下:"): response = response[len("让我们来翻译一下:"):] 53 | if response.startswith("output"): response = response[len("output"):] 54 | if response.startswith("Output"): response = response[len("Output"):] 55 | response = response.strip() 56 | if response.startswith(":"): response = response[len(":"):] 57 | if response.startswith(":"): response = response[len(":"):] 58 | response = response.strip() 59 | response = response.replace('\n', '') # remove \n 60 | response = response.strip() 61 | return response 62 | 63 | 64 | # --------------------------------------------------------------------- 65 | ## convert image/video into GPT4 support version 66 | def func_image_to_base64(image_path, grey_flag=False): # support more types 67 | image = cv2.imread(image_path) 68 | if grey_flag: 69 | image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 70 | return func_opencv_to_base64(image) 71 | 72 | def func_opencv_to_base64(image): 73 | _, buffer = cv2.imencode('.jpg', image) 74 | base64_image = base64.b64encode(buffer).decode('utf-8') 75 | return base64_image 76 | 77 | # deal with text 78 | def func_nyp_to_text(npy_path): 79 | text = np.load(npy_path).tolist() 80 | text = text.strip() 81 | text = text.replace('\n', '') # remove \n 82 | text = 
text.replace('\t', '') # remove \t 83 | text = text.strip() 84 | return text 85 | 86 | # --------------------------------------------------------------------- 87 | ## Translation 88 | # --------------------------------------------------------------------- 89 | def get_translate_eng2chi(text, model='gpt-3.5-turbo-16k-0613'): 90 | if len(text) == 0: 91 | return "" 92 | 93 | prompt = f""" 94 | 请将以下输入翻译为中文: 95 | 96 | 输入:{text} 97 | 98 | 输出: 99 | """ 100 | response = get_completion(prompt, model) 101 | response = func_postprocess_chatgpt(response) 102 | print (text) 103 | print (response) 104 | return response 105 | 106 | 107 | def get_translate_chi2eng(text, model='gpt-3.5-turbo-16k-0613'): 108 | if len(text)==0: 109 | return "" 110 | 111 | prompt = f""" 112 | 请将以下输入翻译为英文: 113 | 114 | 输入:{text} 115 | 116 | 输出: 117 | """ 118 | response = get_completion(prompt, model) 119 | response = func_postprocess_chatgpt(response) 120 | print (text) 121 | print (response) 122 | return response 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | ## text input [test ok] 128 | text = 'The whether is sooooo good!!' 129 | get_translate_eng2chi(text, model='gpt-3.5-turbo-16k-0613') 130 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/chatgpt.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import cv2 4 | import glob 5 | import base64 6 | import numpy as np 7 | 8 | import openai 9 | 10 | # avoid RPD errors 11 | global_index = 1 12 | candidate_keys = ["sk-xxxx", "sk-xxxx", "sk-xxxx"] # Please use your own APIs, we support multiple APIs 13 | openai.api_key = candidate_keys[global_index] 14 | 15 | # 单次调用 16 | def func_get_completion(prompt, model="gpt-3.5-turbo-16k-0613"): 17 | try: 18 | messages = [{"role": "user", "content": prompt}] 19 | response = openai.ChatCompletion.create( 20 | model=model, 21 | messages=messages, 22 | temperature=0, # this is the degree of randomness 23 | max_tokens=1000, 24 | ) 25 | return response['choices'][0]['message']['content'] 26 | except Exception as e: 27 | print ('发生错误:', e) # change key to avoid RPD 28 | global global_index # 修改全局变量 29 | global_index = (global_index + 1) % 3 30 | print (f'========== key index: {global_index} ==========') 31 | openai.api_key = candidate_keys[global_index] 32 | return '' 33 | 34 | # 多次调用,避免网络异常 35 | def get_completion(prompt, model, maxtry=5): 36 | response = '' 37 | try_number = 0 38 | while len(response) == 0: 39 | try_number += 1 40 | if try_number == maxtry: 41 | print (f'fail for {maxtry} times') 42 | break 43 | response = func_get_completion(prompt, model) 44 | return response 45 | 46 | # chatgpt输出结果后处理 47 | def func_postprocess_chatgpt(response): 48 | response = response.strip() 49 | if response.startswith("输入"): response = response[len("输入"):] 50 | if response.startswith("输出"): response = response[len("输出"):] 51 | if response.startswith("翻译"): response = response[len("翻译"):] 52 | if response.startswith("让我们来翻译一下:"): response = response[len("让我们来翻译一下:"):] 53 | if response.startswith("output"): response = response[len("output"):] 54 | if response.startswith("Output"): response = response[len("Output"):] 55 | response = response.strip() 56 | if response.startswith(":"): response = response[len(":"):] 57 | if response.startswith(":"): response = response[len(":"):] 58 | response = response.strip() 59 | response = response.replace('\n', '') # remove \n 60 | response = response.strip() 61 | return response 62 | 63 
| 64 | # --------------------------------------------------------------------- 65 | ## convert image/video into GPT4 support version 66 | def func_image_to_base64(image_path, grey_flag=False): # support more types 67 | image = cv2.imread(image_path) 68 | if grey_flag: 69 | image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 70 | return func_opencv_to_base64(image) 71 | 72 | def func_opencv_to_base64(image): 73 | _, buffer = cv2.imencode('.jpg', image) 74 | base64_image = base64.b64encode(buffer).decode('utf-8') 75 | return base64_image 76 | 77 | # deal with text 78 | def func_nyp_to_text(npy_path): 79 | text = np.load(npy_path).tolist() 80 | text = text.strip() 81 | text = text.replace('\n', '') # remove \n 82 | text = text.replace('\t', '') # remove \t 83 | text = text.strip() 84 | return text 85 | 86 | # --------------------------------------------------------------------- 87 | ## Translation 88 | # --------------------------------------------------------------------- 89 | def get_translate_eng2chi(text, model='gpt-3.5-turbo-16k-0613'): 90 | if len(text) == 0: 91 | return "" 92 | 93 | prompt = f""" 94 | 请将以下输入翻译为中文: 95 | 96 | 输入:{text} 97 | 98 | 输出: 99 | """ 100 | response = get_completion(prompt, model) 101 | response = func_postprocess_chatgpt(response) 102 | print (text) 103 | print (response) 104 | return response 105 | 106 | 107 | def get_translate_chi2eng(text, model='gpt-3.5-turbo-16k-0613'): 108 | if len(text)==0: 109 | return "" 110 | 111 | prompt = f""" 112 | 请将以下输入翻译为英文: 113 | 114 | 输入:{text} 115 | 116 | 输出: 117 | """ 118 | response = get_completion(prompt, model) 119 | response = func_postprocess_chatgpt(response) 120 | print (text) 121 | print (response) 122 | return response 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | ## text input [test ok] 128 | text = 'The whether is sooooo good!!' 
129 | get_translate_eng2chi(text, model='gpt-3.5-turbo-16k-0613') 130 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/lmf.py: -------------------------------------------------------------------------------- 1 | """ 2 | paper: Efficient Low-rank Multimodal Fusion with Modality-Specific Factors 3 | From: https://github.com/Justin1904/Low-rank-Multimodal-Fusion 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn.init import xavier_normal_ 8 | from torch.nn.parameter import Parameter 9 | from .modules.encoder import MLPEncoder, LSTMEncoder 10 | 11 | class LMF(nn.Module): 12 | 13 | def __init__(self, args): 14 | super(LMF, self).__init__() 15 | 16 | # load input and output dim 17 | text_dim = args.text_dim 18 | audio_dim = args.audio_dim 19 | video_dim = args.video_dim 20 | output_dim1 = args.output_dim1 21 | output_dim2 = args.output_dim2 22 | rank = args.rank 23 | dropout = args.dropout 24 | hidden_dim = args.hidden_dim 25 | self.grad_clip = args.grad_clip 26 | 27 | # define the pre-fusion subnetworks 28 | if args.feat_type in ['utt']: 29 | self.audio_encoder = MLPEncoder(audio_dim, hidden_dim, dropout) 30 | self.text_encoder = MLPEncoder(text_dim, hidden_dim, dropout) 31 | self.video_encoder = MLPEncoder(video_dim, hidden_dim, dropout) 32 | elif args.feat_type in ['frm_align', 'frm_unalign']: 33 | self.audio_encoder = LSTMEncoder(audio_dim, hidden_dim, dropout) 34 | self.text_encoder = LSTMEncoder(text_dim, hidden_dim, dropout) 35 | self.video_encoder = LSTMEncoder(video_dim, hidden_dim, dropout) 36 | 37 | # define the post_fusion layers 38 | self.output_dim = hidden_dim // 2 39 | self.post_fusion_dropout = nn.Dropout(p=dropout) 40 | self.audio_factor = Parameter(torch.Tensor(rank, hidden_dim + 1, self.output_dim)) 41 | self.video_factor = Parameter(torch.Tensor(rank, hidden_dim + 1, self.output_dim)) 42 | self.text_factor = Parameter(torch.Tensor(rank, hidden_dim + 1, self.output_dim)) 43 | self.fusion_weights = Parameter(torch.Tensor(1, rank)) 44 | self.fusion_bias = Parameter(torch.Tensor(1, self.output_dim)) 45 | 46 | # init teh factors 47 | xavier_normal_(self.audio_factor) 48 | xavier_normal_(self.video_factor) 49 | xavier_normal_(self.text_factor) 50 | xavier_normal_(self.fusion_weights) 51 | self.fusion_bias.data.fill_(0) 52 | 53 | self.fc_out_1 = nn.Linear(self.output_dim, output_dim1) 54 | self.fc_out_2 = nn.Linear(self.output_dim, output_dim2) 55 | 56 | 57 | def forward(self, batch): 58 | ''' 59 | Args: 60 | audio_x: tensor of shape (batch_size, audio_in) 61 | video_x: tensor of shape (batch_size, video_in) 62 | text_x: tensor of shape (batch_size, text_in) 63 | ''' 64 | audio_h = self.audio_encoder(batch['audios']) 65 | video_h = self.video_encoder(batch['videos']) 66 | text_h = self.text_encoder(batch['texts']) 67 | batch_size = audio_h.data.shape[0] 68 | 69 | # next we perform low-rank multimodal fusion 70 | # here is a more efficient implementation than the one the paper describes 71 | # basically swapping the order of summation and elementwise product 72 | # next we perform "tensor fusion", which is essentially appending 1s to the tensors and take Kronecker product 73 | add_one = torch.ones(size=[batch_size, 1], requires_grad=False).type_as(audio_h).to(audio_h.device) 74 | _audio_h = torch.cat((add_one, audio_h), dim=1) 75 | _video_h = torch.cat((add_one, video_h), dim=1) 76 | _text_h = torch.cat((add_one, text_h), dim=1) 77 | 78 | # torch.matmul() 处理时会将 [batch, feat+1] -> [rank, batch, feat+1], 
看结果就好像把 [feat+1] 分解为 rank * [hidden] 79 | fusion_audio = torch.matmul(_audio_h, self.audio_factor) # [batch, feat+1] * [rank, feat+1, hidden] = [rank, batch, hidden] 80 | fusion_video = torch.matmul(_video_h, self.video_factor) 81 | fusion_text = torch.matmul(_text_h, self.text_factor ) 82 | fusion_zy = fusion_audio * fusion_video * fusion_text # [rank, batch, hidden] 83 | 84 | # use linear transformation instead of simple summation, more flexibility 85 | output = torch.matmul(self.fusion_weights, fusion_zy.permute(1, 0, 2)).squeeze() + self.fusion_bias # [1, rank] * [batch, rank, hidden] -> [batch, hidden] 86 | features = output.view(-1, self.output_dim) 87 | 88 | emos_out = self.fc_out_1(features) 89 | vals_out = self.fc_out_2(features) 90 | interloss = torch.tensor(0).cuda() 91 | 92 | return features, emos_out, vals_out, interloss 93 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/extract_manet_embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn.parallel 7 | import torch.optim 8 | import torch.utils.data 9 | import torchvision.transforms as transforms 10 | 11 | # import config 12 | import sys 13 | sys.path.append('../../') 14 | import config 15 | from dataset import FaceDataset 16 | from manet.model.manet import manet 17 | 18 | class RecorderMeter(object): 19 | """Computes and stores the minimum loss value and its epoch index""" 20 | 21 | def __init__(self, total_epoch): 22 | self.reset(total_epoch) 23 | 24 | def reset(self, total_epoch): 25 | self.total_epoch = total_epoch 26 | self.current_epoch = 0 27 | self.epoch_losses = np.zeros((self.total_epoch, 2), dtype=np.float32) # [epoch, train/val] 28 | self.epoch_accuracy = np.zeros((self.total_epoch, 2), dtype=np.float32) # [epoch, train/val] 29 | 30 | def extract(data_loader, model): 31 | model.eval() 32 | with torch.no_grad(): 33 | features, timestamps = [], [] 34 | for images, names in data_loader: 35 | images = images.cuda() 36 | embedding = model(images, return_embedding=True) 37 | features.append(embedding.cpu().detach().numpy()) 38 | timestamps.extend(names) 39 | features, timestamps = np.row_stack(features), np.array(timestamps) 40 | return features, timestamps 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser(description='Run.') 44 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset') 45 | parser.add_argument('--feature_level', type=str, default='UTTERANCE', help='feature level [FRAME or UTTERANCE]') 46 | parser.add_argument('--gpu', type=str, default='1', help='gpu id') 47 | params = parser.parse_args() 48 | os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu 49 | 50 | print(f'==> Extracting manet embedding...') 51 | face_dir = config.PATH_TO_RAW_FACE[params.dataset] 52 | save_dir = os.path.join(config.PATH_TO_FEATURES[params.dataset], f'manet_{params.feature_level[:3]}') 53 | if not os.path.exists(save_dir): os.makedirs(save_dir) 54 | 55 | # load model 56 | model = manet(num_classes=7).cuda() 57 | checkpoint_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, 'manet/[02-08]-[21-19]-model_best-acc88.33.pth') 58 | checkpoint = torch.load(checkpoint_file) 59 | pre_trained_dict = {k.replace('module.', ''): v for k,v in checkpoint['state_dict'].items()} 60 | model.load_state_dict(pre_trained_dict) 61 | 62 | # transform 63 | transform = 
transforms.Compose([transforms.Resize((224, 224)), 64 | transforms.ToTensor()]) 65 | 66 | # extract embedding video by video 67 | vids = os.listdir(face_dir) 68 | EMBEDDING_DIM = -1 69 | print(f'Find total "{len(vids)}" videos.') 70 | for i, vid in enumerate(vids, 1): 71 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 72 | 73 | # forward 74 | dataset = FaceDataset(vid, face_dir, transform=transform) 75 | if len(dataset) == 0: 76 | print("Warning: number of frames of video {} should not be zero.".format(vid)) 77 | embeddings, framenames = [], [] 78 | else: 79 | data_loader = torch.utils.data.DataLoader(dataset, 80 | batch_size=32, 81 | num_workers=4, 82 | pin_memory=True) 83 | embeddings, framenames = extract(data_loader, model) 84 | 85 | # save results 86 | indexes = np.argsort(framenames) 87 | embeddings = embeddings[indexes] 88 | framenames = framenames[indexes] 89 | EMBEDDING_DIM = max(EMBEDDING_DIM, np.shape(embeddings)[-1]) 90 | 91 | save_file = os.path.join(save_dir, f'{vid}.npy') 92 | if params.feature_level == 'FRAME': 93 | embeddings = np.array(embeddings).squeeze() 94 | if len(embeddings) == 0: 95 | embeddings = np.zeros((1, EMBEDDING_DIM)) 96 | elif len(embeddings.shape) == 1: 97 | embeddings = embeddings[np.newaxis, :] 98 | np.save(save_file, embeddings) 99 | else: 100 | embeddings = np.array(embeddings).squeeze() 101 | if len(embeddings) == 0: 102 | embeddings = np.zeros((EMBEDDING_DIM, )) 103 | elif len(embeddings.shape) == 2: 104 | embeddings = np.mean(embeddings, axis=0) 105 | np.save(save_file, embeddings) 106 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/model/vgg_vd_face_fer_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Vgg_vd_face_fer_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Vgg_vd_face_fer_dag, self).__init__() 11 | self.meta = {'mean': [129.186279296875, 104.76238250732422, 93.59396362304688], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [224, 224, 3]} 14 | self.conv1_1 = nn.Conv2d(3, 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 15 | self.relu1_1 = nn.ReLU() 16 | self.conv1_2 = nn.Conv2d(64, 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 17 | self.relu1_2 = nn.ReLU() 18 | self.pool1 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 19 | self.conv2_1 = nn.Conv2d(64, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 20 | self.relu2_1 = nn.ReLU() 21 | self.conv2_2 = nn.Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 22 | self.relu2_2 = nn.ReLU() 23 | self.pool2 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 24 | self.conv3_1 = nn.Conv2d(128, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 25 | self.relu3_1 = nn.ReLU() 26 | self.conv3_2 = nn.Conv2d(256, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 27 | self.relu3_2 = nn.ReLU() 28 | self.conv3_3 = nn.Conv2d(256, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 29 | self.relu3_3 = nn.ReLU() 30 | self.pool3 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 31 | self.conv4_1 = nn.Conv2d(256, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 32 | self.relu4_1 = nn.ReLU() 33 | self.conv4_2 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 34 | self.relu4_2 = 
nn.ReLU() 35 | self.conv4_3 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 36 | self.relu4_3 = nn.ReLU() 37 | self.pool4 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 38 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 39 | self.relu5_1 = nn.ReLU() 40 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 41 | self.relu5_2 = nn.ReLU() 42 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 43 | self.relu5_3 = nn.ReLU() 44 | self.pool5 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 45 | self.fc6 = nn.Conv2d(512, 4096, kernel_size=[7, 7], stride=(1, 1)) 46 | self.relu6 = nn.ReLU() 47 | self.fc7 = nn.Linear(in_features=4096, out_features=4096, bias=True) 48 | self.relu7 = nn.ReLU() 49 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 50 | 51 | def forward(self, data): 52 | x1 = self.conv1_1(data) 53 | x2 = self.relu1_1(x1) 54 | x3 = self.conv1_2(x2) 55 | x4 = self.relu1_2(x3) 56 | x5 = self.pool1(x4) 57 | x6 = self.conv2_1(x5) 58 | x7 = self.relu2_1(x6) 59 | x8 = self.conv2_2(x7) 60 | x9 = self.relu2_2(x8) 61 | x10 = self.pool2(x9) 62 | x11 = self.conv3_1(x10) 63 | x12 = self.relu3_1(x11) 64 | x13 = self.conv3_2(x12) 65 | x14 = self.relu3_2(x13) 66 | x15 = self.conv3_3(x14) 67 | x16 = self.relu3_3(x15) 68 | x17 = self.pool3(x16) 69 | x18 = self.conv4_1(x17) 70 | x19 = self.relu4_1(x18) 71 | x20 = self.conv4_2(x19) 72 | x21 = self.relu4_2(x20) 73 | x22 = self.conv4_3(x21) 74 | x23 = self.relu4_3(x22) 75 | x24 = self.pool4(x23) 76 | x25 = self.conv5_1(x24) 77 | x26 = self.relu5_1(x25) 78 | x27 = self.conv5_2(x26) 79 | x28 = self.relu5_2(x27) 80 | x29 = self.conv5_3(x28) 81 | x30 = self.relu5_3(x29) 82 | x31 = self.pool5(x30) 83 | x32 = self.fc6(x31) 84 | x33_preflatten = self.relu6(x32) 85 | x33 = x33_preflatten.view(x33_preflatten.size(0), -1) 86 | x34 = self.fc7(x33) 87 | x35 = self.relu7(x34) 88 | prediction = self.fc8(x35) 89 | return prediction 90 | 91 | def vgg_vd_face_fer_dag(weights_path=None, **kwargs): 92 | """ 93 | load imported model instance 94 | 95 | Args: 96 | weights_path (str): If set, loads model weights from the given path 97 | """ 98 | model = Vgg_vd_face_fer_dag() 99 | if weights_path: 100 | state_dict = torch.load(weights_path) 101 | model.load_state_dict(state_dict) 102 | return model -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/fer2013/fer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Fer2013 benchmark 3 | 4 | The module evaluates the performance of a pytorch model on the FER2013 5 | benchmark. 
6 | """ 7 | 8 | from __future__ import division 9 | 10 | import os 11 | import time 12 | 13 | import torch 14 | import numpy as np 15 | import torch.utils.data 16 | import torch.backends.cudnn as cudnn 17 | from fer2013.fer_loader import Fer2013Dataset, Fer2013PlusDataset 18 | from utils.benchmark_helpers import compose_transforms 19 | 20 | def fer2013_benchmark(model, data_dir, res_cache, refresh_cache, 21 | batch_size=256, num_workers=2, fer_plus=False): 22 | if not refresh_cache: # load result from cache, if available 23 | if os.path.isfile(res_cache): 24 | res = torch.load(res_cache) 25 | prec1_val, prec1_test = res['prec1_val'], res['prec1_test'] 26 | print("=> loaded results from '{}'".format(res_cache)) 27 | info = (prec1_val, prec1_test, res['speed']) 28 | msg = 'val acc: {:.2f}, test acc: {:.2f}, Speed: {:.1f}Hz' 29 | print(msg.format(*info)) 30 | return 31 | 32 | meta = model.meta 33 | cudnn.benchmark = True 34 | model = torch.nn.DataParallel(model).cuda() 35 | preproc_transforms = compose_transforms(meta, center_crop=False) 36 | if fer_plus: 37 | dataset = Fer2013PlusDataset 38 | else: 39 | dataset = Fer2013Dataset 40 | speeds = [] 41 | res = {} 42 | for mode in 'val', 'test': 43 | loader = torch.utils.data.DataLoader( 44 | dataset(data_dir, mode=mode, transform=preproc_transforms), 45 | batch_size=batch_size, shuffle=False, 46 | num_workers=num_workers, pin_memory=True) 47 | prec1, speed = validate(loader, model, mode) 48 | res['prec1_{}'.format(mode)] = prec1 49 | speeds.append(speed) 50 | res['speed'] = np.mean(speed) 51 | torch.save(res, res_cache) 52 | 53 | def validate(val_loader, model, mode): 54 | model.eval() 55 | top1 = AverageMeter() 56 | speed = WarmupAverageMeter() 57 | end = time.time() 58 | with torch.no_grad(): 59 | for ii, (ims, target) in enumerate(val_loader): 60 | # target = target.cuda(async=True) 61 | target = target.cuda() 62 | output = model(ims) # compute output 63 | prec1, = accuracy(output.data, target, topk=(1,)) 64 | top1.update(prec1[0], ims.size(0)) 65 | speed.update(time.time() - end, ims.size(0)) 66 | end = time.time() 67 | if ii % 10 == 0: 68 | msg = ('{0}: [{1}/{2}]\tSpeed {speed.current:.1f}Hz\t' 69 | '({speed.avg:.1f})Hz\tPrec@1 {top1.avg:.3f}') 70 | print(msg.format(mode, ii, len(val_loader), 71 | speed=speed, top1=top1)) 72 | print(' * Accuracy {0:.3f}'.format(top1.avg)) 73 | return top1.avg, speed.avg 74 | 75 | class WarmupAverageMeter(object): 76 | """Computes and stores the average and current value, after a fixed 77 | warmup period (useful for approximate benchmarking) 78 | 79 | Args: 80 | warmup (int) [3]: The number of updates to be ignored before the 81 | average starts to be computed. 
82 | """ 83 | def __init__(self, warmup=3): 84 | self.reset() 85 | self.warmup = warmup 86 | 87 | def reset(self): 88 | self.avg = 0 89 | self.current = 0 90 | self.delta_sum = 0 91 | self.count = 0 92 | self.warmup_count = 0 93 | 94 | def update(self, delta, n): 95 | self.warmup_count = self.warmup_count + 1 96 | if self.warmup_count >= self.warmup: 97 | self.current = n / delta 98 | self.delta_sum += delta 99 | self.count += n 100 | self.avg = self.count / self.delta_sum 101 | 102 | class AverageMeter(object): 103 | """Computes and stores the average and current value""" 104 | def __init__(self): 105 | self.reset() 106 | 107 | def reset(self): 108 | self.val = 0 109 | self.avg = 0 110 | self.sum = 0 111 | self.count = 0 112 | 113 | def update(self, val, n=1): 114 | self.val = val 115 | self.sum += val * n 116 | self.count += n 117 | self.avg = self.sum / self.count 118 | 119 | def accuracy(output, target, topk=(1,)): 120 | """Computes the precision@k for the specified values of k""" 121 | maxk = max(topk) 122 | batch_size = target.size(0) 123 | output = output.squeeze(-1).squeeze(-1) 124 | _, pred = output.topk(maxk, 1, True, True) 125 | pred = pred.t() 126 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 127 | 128 | res = [] 129 | for k in topk: 130 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 131 | res.append(correct_k.mul_(100.0 / batch_size)) 132 | return res 133 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/sims.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from toolkit.utils.functions import * 4 | from toolkit.utils.read_files import * 5 | 6 | def func_convert_name_to_newname(video_id, clip_id): 7 | newname = video_id + '_%04d' %(clip_id) 8 | return newname 9 | 10 | def func_merge_id_to_path(video_id, clip_id, video_root): 11 | video_path = os.path.join(video_root, video_id, '%04d.mp4' %(clip_id)) 12 | return video_path 13 | 14 | # label_path -> (video_paths, labels) 15 | def read_labels(label_path, video_root): 16 | video_ids = func_read_key_from_csv(label_path, 'video_id') 17 | clip_ids = func_read_key_from_csv(label_path, 'clip_id') 18 | labels = func_read_key_from_csv(label_path, 'label') 19 | print (f'label range -> min:{min(labels)} max:{max(labels)}') 20 | print (f'whole sample number: {len(labels)}') 21 | 22 | video_paths = [] 23 | for ii in range(len(video_ids)): 24 | video_path = func_merge_id_to_path(video_ids[ii], clip_ids[ii], video_root) 25 | video_paths.append(video_path) 26 | 27 | return video_paths, labels 28 | 29 | # 只读取 idx_path 对应的 items并返回 30 | def gain_sub_items(video_paths, labels, idx_path): 31 | indexes = func_read_key_from_csv(idx_path, 'index') 32 | video_paths = np.array(video_paths)[indexes] 33 | labels = np.array(labels)[indexes] 34 | print (f'subset sample number: {len(labels)}') 35 | return video_paths, labels 36 | 37 | # 转化为 newname 对应的 trans 38 | def update_transcription(trans_path, save_path): 39 | video_ids = func_read_key_from_csv(trans_path, 'video_id') 40 | clip_ids = func_read_key_from_csv(trans_path, 'clip_id') 41 | chi_subtitles = func_read_key_from_csv(trans_path, 'Chinese') 42 | eng_subtitles = func_read_key_from_csv(trans_path, 'English') 43 | print (f'whole sample number: {len(video_ids)}') 44 | 45 | newnames = [] 46 | for ii in range(len(video_ids)): 47 | newname = func_convert_name_to_newname(video_ids[ii], clip_ids[ii]) 48 | newnames.append(newname) 49 | 50 | 
name2key = {} 51 | for ii, name in enumerate(newnames): 52 | name2key[name] = [chi_subtitles[ii], eng_subtitles[ii]] 53 | func_write_key_to_csv(save_path, newnames, name2key, ['chinese', 'english']) 54 | 55 | 56 | # ------------------- main process ------------------- 57 | def normalize_dataset_format(data_root, save_root): 58 | # gain paths 59 | video_root = os.path.join(data_root, 'Raw') 60 | label_path = os.path.join(data_root, 'metadata/sentiment/label_M.csv') 61 | train_idx_path = os.path.join(data_root, 'metadata/train_index.csv') 62 | val_idx_path = os.path.join(data_root, 'metadata/val_index.csv') 63 | test_idx_path = os.path.join(data_root, 'metadata/test_index.csv') 64 | trans_path = os.path.join(data_root, 'metadata/Translation.csv') 65 | 66 | # read all items 67 | video_paths, labels = read_labels(label_path, video_root) 68 | train_video, train_label = gain_sub_items(video_paths, labels, train_idx_path) 69 | val_video, val_label = gain_sub_items(video_paths, labels, val_idx_path) 70 | test_video, test_label = gain_sub_items(video_paths, labels, test_idx_path) 71 | 72 | ## output path 73 | save_video = os.path.join(save_root, 'video') 74 | save_label = os.path.join(save_root, 'label.npz') 75 | save_trans = os.path.join(save_root, 'transcription.csv') 76 | if not os.path.exists(save_root): os.makedirs(save_root) 77 | if not os.path.exists(save_video): os.makedirs(save_video) 78 | 79 | ## generate new transcripts 80 | update_transcription(trans_path, save_trans) 81 | 82 | ## generate label path 83 | whole_corpus = {} 84 | for name, video_paths, labels in [('train', train_video, train_label), 85 | ('val', val_video, val_label ), 86 | ('test', test_video, test_label )]: 87 | whole_corpus[name] = {} 88 | print (f'{name}: sample number: {len(video_paths)}') 89 | for ii, video_path in enumerate(video_paths): 90 | video_name = video_path.split('/')[-2] 91 | clip_name = video_path.split('/')[-1] 92 | save_path = os.path.join(save_video, f'{video_name}_{clip_name}') 93 | shutil.copy(video_path, save_path) 94 | 95 | save_name = os.path.basename(save_path)[:-4] 96 | whole_corpus[name][save_name] = {'emo': 0, 'val': labels[ii]} 97 | 98 | np.savez_compressed(save_label, 99 | train_corpus=whole_corpus['train'], 100 | val_corpus=whole_corpus['val'], 101 | test_corpus=whole_corpus['test']) 102 | 103 | if __name__ == '__main__': 104 | data_root = '/data/lianzheng/chinese-mer-2023/CH-SIMS' 105 | save_root = '/data/lianzheng/chinese-mer-2023/CH-SIMS-process' 106 | normalize_dataset_format(data_root, save_root) 107 | 108 | # data_root = 'H:\\desktop\\Multimedia-Transformer\\chinese-mer-2023\\dataset\\sims-process' 109 | # trans_path = os.path.join(data_root, 'transcription.csv') 110 | # polish_path = os.path.join(data_root, 'transcription-engchi-polish.csv') 111 | # func_translate_transcript_polish_merge(trans_path, polish_path) 112 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/imagenet/imagenet.py.bak: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Imagenet validation set benchmark 3 | 4 | The module evaluates the performance of a pytorch model on the ILSVRC 2012 5 | validation set. 
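A minimal usage sketch (paths are placeholders; `model` is assumed to expose the
`meta` attribute that compose_transforms consumes):

    imagenet_benchmark(model, data_dir='data/ILSVRC2012',
                       res_cache='res_cache/imagenet.pth', refresh_cache=True)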
6 | 7 | Based on PyTorch imagenet example: 8 | https://github.com/pytorch/examples/tree/master/imagenet 9 | """ 10 | 11 | from __future__ import division 12 | 13 | import os 14 | import time 15 | 16 | from PIL import ImageFile 17 | import torch 18 | import torch.nn.parallel 19 | import torch.utils.data 20 | import torch.backends.cudnn as cudnn 21 | import torchvision.datasets as datasets 22 | from utils.benchmark_helpers import compose_transforms 23 | 24 | ImageFile.LOAD_TRUNCATED_IMAGES = True 25 | 26 | def imagenet_benchmark(model, data_dir, res_cache, refresh_cache, 27 | batch_size=256, num_workers=20, 28 | remove_blacklist=False, center_crop=True): 29 | if not refresh_cache: # load result from cache, if available 30 | if os.path.isfile(res_cache): 31 | res = torch.load(res_cache) 32 | prec1, prec5, speed = res['prec1'], res['prec5'], res['speed'] 33 | print("=> loaded results from '{}'".format(res_cache)) 34 | info = (100 - prec1, 100 - prec5, speed) 35 | msg = 'Top 1 err: {:.2f}, Top 5 err: {:.2f}, Speed: {:.1f}Hz' 36 | print(msg.format(*info)) 37 | return 38 | 39 | meta = model.meta 40 | cudnn.benchmark = True 41 | model = torch.nn.DataParallel(model).cuda() 42 | if remove_blacklist: 43 | subset = 'val_blacklisted' 44 | else: 45 | subset = 'val' 46 | valdir = os.path.join(data_dir, subset) 47 | preproc_transforms = compose_transforms(meta, center_crop=center_crop) 48 | val_loader = torch.utils.data.DataLoader( 49 | datasets.ImageFolder(valdir, preproc_transforms), 50 | batch_size=batch_size, shuffle=False, 51 | num_workers=num_workers, pin_memory=True) 52 | prec1, prec5, speed = validate(val_loader, model) 53 | torch.save({'prec1': prec1, 'prec5': prec5, 'speed': speed}, res_cache) 54 | 55 | def validate(val_loader, model): 56 | model.eval() 57 | top1 = AverageMeter() 58 | top5 = AverageMeter() 59 | speed = WarmupAverageMeter() 60 | end = time.time() 61 | with torch.no_grad(): 62 | for ii, (ims, target) in enumerate(val_loader): 63 | target = target.cuda(async=True) 64 | # ims_var = torch.autograd.Variable(ims, volatile=True) 65 | output = model(ims) # compute output 66 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 67 | top1.update(prec1[0], ims.size(0)) 68 | top5.update(prec5[0], ims.size(0)) 69 | speed.update(time.time() - end, ims.size(0)) 70 | end = time.time() 71 | if ii % 10 == 0: 72 | msg = ('Test: [{0}/{1}]\tSpeed {speed.current:.1f}Hz\t' 73 | '({speed.avg:.1f})Hz\tPrec@1 {top1.avg:.3f} ' 74 | '{top5.avg:.3f}') 75 | print(msg.format(ii, len(val_loader), speed=speed, 76 | top1=top1, top5=top5)) 77 | top1_err, top5_err = 100 - top1.avg, 100 - top5.avg 78 | print(' * Err@1 {0:.3f} Err@5 {1:.3f}'.format(top1_err, top5_err)) 79 | 80 | return top1.avg, top5.avg, speed.avg 81 | 82 | class WarmupAverageMeter(object): 83 | """Computes and stores the average and current value, after a fixed 84 | warmup period (useful for approximate benchmarking) 85 | 86 | Args: 87 | warmup (int) [3]: The number of updates to be ignored before the 88 | average starts to be computed. 
89 | """ 90 | def __init__(self, warmup=3): 91 | self.reset() 92 | self.warmup = warmup 93 | 94 | def reset(self): 95 | self.avg = 0 96 | self.current = 0 97 | self.delta_sum = 0 98 | self.count = 0 99 | self.warmup_count = 0 100 | 101 | def update(self, delta, n): 102 | self.warmup_count = self.warmup_count + 1 103 | if self.warmup_count >= self.warmup: 104 | self.current = n / delta 105 | self.delta_sum += delta 106 | self.count += n 107 | self.avg = self.count / self.delta_sum 108 | 109 | class AverageMeter(object): 110 | """Computes and stores the average and current value""" 111 | def __init__(self): 112 | self.reset() 113 | 114 | def reset(self): 115 | self.val = 0 116 | self.avg = 0 117 | self.sum = 0 118 | self.count = 0 119 | 120 | def update(self, val, n=1): 121 | self.val = val 122 | self.sum += val * n 123 | self.count += n 124 | self.avg = self.sum / self.count 125 | 126 | def accuracy(output, target, topk=(1,)): 127 | """Computes the precision@k for the specified values of k""" 128 | maxk = max(topk) 129 | batch_size = target.size(0) 130 | 131 | _, pred = output.topk(maxk, 1, True, True) 132 | pred = pred.t() 133 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 134 | 135 | res = [] 136 | for k in topk: 137 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 138 | res.append(correct_k.mul_(100.0 / batch_size)) 139 | return res 140 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/utils/read_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import tqdm 4 | import math 5 | import pickle 6 | import numpy as np 7 | import multiprocessing 8 | 9 | from ..globals import * 10 | from .functions import * 11 | from .read_files import * 12 | 13 | ############################################################ 14 | # ------ for feat: feature_root+name -> (seqlen, featdim) ------ 15 | def func_read_one_feat(argv=None, feature_root=None, name=None, processor=None, model_name=None): 16 | feature_root, name, processor, model_name = argv 17 | 18 | # 路径可能的两个选项 19 | feature_path = os.path.join(feature_root, name+'.npy') 20 | feature_dir = os.path.join(feature_root, name) 21 | 22 | feature = [] 23 | if os.path.exists(feature_path): # audio/text => belong to speaker 24 | single_feature = np.load(feature_path) 25 | single_feature = single_feature.squeeze() # [Dim, ] or [Time, Dim] 26 | feature.append(single_feature) 27 | elif os.path.isdir(feature_dir): 28 | facenames = os.listdir(feature_dir) # 如果是文件夹,则依次读取文件夹内所有信息 29 | for facename in sorted(facenames): 30 | facefeat = np.load(os.path.join(feature_dir, facename)) 31 | feature.append(facefeat) 32 | else: 33 | raise Exception('feature path or dir do not exist!') 34 | 35 | # feature -> (seqlen, featdim) 36 | single_feature = np.array(feature).squeeze() 37 | if len(single_feature) == 0: 38 | print ('feature has errors!!') 39 | elif len(single_feature.shape) == 1: 40 | single_feature = single_feature[np.newaxis, :] 41 | return single_feature 42 | 43 | 44 | # model_name:表示用的哪个预训练模型 45 | # read multiple data [different datasets need different processors] 46 | def func_read_multiprocess(feature_root, names, processor=None, read_type='feat', model_name=None): 47 | ## names => features 48 | params = [] 49 | for name in names: 50 | params.append((feature_root, name, processor, model_name)) 51 | 52 | # ------ debug ------ 53 | # func_read_one_feat(params[0]) 54 | # func_read_one_e2e_video(params[0]) 55 | # 
func_read_one_e2e_audio(params[0]) 56 | 57 | features = [] 58 | with multiprocessing.Pool(processes=8) as pool: 59 | if read_type == 'feat': 60 | features = list(tqdm.tqdm(pool.imap(func_read_one_feat, params), total=len(params))) 61 | 62 | ## save (names, features) 63 | feature_shape = np.array(features[0]).shape 64 | feature_name = os.path.basename(feature_root) 65 | print (f'Input feature {feature_name} ===> dim is {feature_shape}') 66 | assert len(names) == len(features), f'Error: len(names) != len(features)' 67 | return features, feature_shape[-1] 68 | 69 | 70 | ############################################################ 71 | # (seqlen, featdim) -> (dst_len, featdim) 72 | def func_mapping_feature(feature, dst_len): 73 | featlen, featdim = feature.shape 74 | if featlen == dst_len: 75 | return feature 76 | elif featlen < dst_len: 77 | pad_feature = np.zeros((dst_len-featlen, featdim)) 78 | feature = np.concatenate((pad_feature, feature), axis=0) 79 | else: 80 | if featlen // dst_len == featlen / dst_len: 81 | pad_len = 0 82 | pool_size = featlen // dst_len 83 | else: 84 | pad_len = dst_len - featlen % dst_len 85 | pool_size = featlen // dst_len + 1 86 | pad_feature = np.zeros((pad_len, featdim)) 87 | feature = np.concatenate([pad_feature, feature]).reshape(dst_len, pool_size, featdim) # 相邻时刻特征取平均 88 | feature = np.mean(feature, axis=1) 89 | return feature 90 | 91 | # sample-level 92 | def align_to_utt(audios, texts, videos): 93 | for ii in range(len(audios)): 94 | audios[ii] = np.mean(audios[ii], axis=0) 95 | texts[ii] = np.mean(texts[ii], axis=0) 96 | videos[ii] = np.mean(videos[ii], axis=0) 97 | return audios, texts, videos 98 | 99 | # sample-level: 每个模态的特征长度压缩到原来的scale倍 100 | def feature_scale_compress(audios, texts, videos, scale_factor=1): 101 | for ii in range(len(audios)): 102 | audios[ii] = func_mapping_feature(audios[ii], math.ceil(len(audios[ii]) / scale_factor)) 103 | texts[ii] = func_mapping_feature(texts[ii], math.ceil(len(texts[ii]) / scale_factor)) 104 | videos[ii] = func_mapping_feature(videos[ii], math.ceil(len(videos[ii]) / scale_factor)) 105 | return audios, texts, videos 106 | 107 | # sample-level: 三种模态压缩到文本长度 108 | def align_to_text(audios, texts, videos): 109 | for ii in range(len(audios)): 110 | dst_len = len(texts[ii]) 111 | audios[ii] = func_mapping_feature(audios[ii], dst_len) 112 | texts[ii] = func_mapping_feature(texts[ii], dst_len) 113 | videos[ii] = func_mapping_feature(videos[ii], dst_len) 114 | return audios, texts, videos 115 | 116 | # batch-level: generate batch 117 | def pad_to_maxlen_pre_modality(audios, texts, videos): 118 | audio_maxlen = max([len(feature) for feature in audios]) 119 | text_maxlen = max([len(feature) for feature in texts ]) 120 | video_maxlen = max([len(feature) for feature in videos]) 121 | for ii in range(len(audios)): 122 | audios[ii] = func_mapping_feature(audios[ii], audio_maxlen) 123 | texts[ii] = func_mapping_feature(texts[ii], text_maxlen) 124 | videos[ii] = func_mapping_feature(videos[ii], video_maxlen) 125 | return audios, texts, videos 126 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/utils/read_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import tqdm 4 | import math 5 | import pickle 6 | import numpy as np 7 | import multiprocessing 8 | 9 | from ..globals import * 10 | from .functions import * 11 | from .read_files import * 12 | 13 | 
############################################################ 14 | # ------ for feat: feature_root+name -> (seqlen, featdim) ------ 15 | def func_read_one_feat(argv=None, feature_root=None, name=None, processor=None, model_name=None): 16 | feature_root, name, processor, model_name = argv 17 | 18 | # 路径可能的两个选项 19 | feature_path = os.path.join(feature_root, name+'.npy') 20 | feature_dir = os.path.join(feature_root, name) 21 | 22 | feature = [] 23 | if os.path.exists(feature_path): # audio/text => belong to speaker 24 | single_feature = np.load(feature_path) 25 | single_feature = single_feature.squeeze() # [Dim, ] or [Time, Dim] 26 | feature.append(single_feature) 27 | elif os.path.isdir(feature_dir): 28 | facenames = os.listdir(feature_dir) # 如果是文件夹,则依次读取文件夹内所有信息 29 | for facename in sorted(facenames): 30 | facefeat = np.load(os.path.join(feature_dir, facename)) 31 | feature.append(facefeat) 32 | else: 33 | raise Exception('feature path or dir do not exist!') 34 | 35 | # feature -> (seqlen, featdim) 36 | single_feature = np.array(feature).squeeze() 37 | if len(single_feature) == 0: 38 | print ('feature has errors!!') 39 | elif len(single_feature.shape) == 1: 40 | single_feature = single_feature[np.newaxis, :] 41 | return single_feature 42 | 43 | 44 | # model_name:表示用的哪个预训练模型 45 | # read multiple data [different datasets need different processors] 46 | def func_read_multiprocess(feature_root, names, processor=None, read_type='feat', model_name=None): 47 | ## names => features 48 | params = [] 49 | for name in names: 50 | params.append((feature_root, name, processor, model_name)) 51 | 52 | # ------ debug ------ 53 | # func_read_one_feat(params[0]) 54 | # func_read_one_e2e_video(params[0]) 55 | # func_read_one_e2e_audio(params[0]) 56 | 57 | features = [] 58 | with multiprocessing.Pool(processes=8) as pool: 59 | if read_type == 'feat': 60 | features = list(tqdm.tqdm(pool.imap(func_read_one_feat, params), total=len(params))) 61 | 62 | ## save (names, features) 63 | feature_shape = np.array(features[0]).shape 64 | feature_name = os.path.basename(feature_root) 65 | print (f'Input feature {feature_name} ===> dim is {feature_shape}') 66 | assert len(names) == len(features), f'Error: len(names) != len(features)' 67 | return features, feature_shape[-1] 68 | 69 | 70 | ############################################################ 71 | # (seqlen, featdim) -> (dst_len, featdim) 72 | def func_mapping_feature(feature, dst_len): 73 | featlen, featdim = feature.shape 74 | if featlen == dst_len: 75 | return feature 76 | elif featlen < dst_len: 77 | pad_feature = np.zeros((dst_len-featlen, featdim)) 78 | feature = np.concatenate((pad_feature, feature), axis=0) 79 | else: 80 | if featlen // dst_len == featlen / dst_len: 81 | pad_len = 0 82 | pool_size = featlen // dst_len 83 | else: 84 | pad_len = dst_len - featlen % dst_len 85 | pool_size = featlen // dst_len + 1 86 | pad_feature = np.zeros((pad_len, featdim)) 87 | feature = np.concatenate([pad_feature, feature]).reshape(dst_len, pool_size, featdim) # 相邻时刻特征取平均 88 | feature = np.mean(feature, axis=1) 89 | return feature 90 | 91 | # sample-level 92 | def align_to_utt(audios, texts, videos): 93 | for ii in range(len(audios)): 94 | audios[ii] = np.mean(audios[ii], axis=0) 95 | texts[ii] = np.mean(texts[ii], axis=0) 96 | videos[ii] = np.mean(videos[ii], axis=0) 97 | return audios, texts, videos 98 | 99 | # sample-level: 每个模态的特征长度压缩到原来的scale倍 100 | def feature_scale_compress(audios, texts, videos, scale_factor=1): 101 | for ii in range(len(audios)): 102 | audios[ii] = 
func_mapping_feature(audios[ii], math.ceil(len(audios[ii]) / scale_factor)) 103 | texts[ii] = func_mapping_feature(texts[ii], math.ceil(len(texts[ii]) / scale_factor)) 104 | videos[ii] = func_mapping_feature(videos[ii], math.ceil(len(videos[ii]) / scale_factor)) 105 | return audios, texts, videos 106 | 107 | # sample-level: 三种模态压缩到文本长度 108 | def align_to_text(audios, texts, videos): 109 | for ii in range(len(audios)): 110 | dst_len = len(texts[ii]) 111 | audios[ii] = func_mapping_feature(audios[ii], dst_len) 112 | texts[ii] = func_mapping_feature(texts[ii], dst_len) 113 | videos[ii] = func_mapping_feature(videos[ii], dst_len) 114 | return audios, texts, videos 115 | 116 | # batch-level: generate batch 117 | def pad_to_maxlen_pre_modality(audios, texts, videos): 118 | audio_maxlen = max([len(feature) for feature in audios]) 119 | text_maxlen = max([len(feature) for feature in texts ]) 120 | video_maxlen = max([len(feature) for feature in videos]) 121 | for ii in range(len(audios)): 122 | audios[ii] = func_mapping_feature(audios[ii], audio_maxlen) 123 | texts[ii] = func_mapping_feature(texts[ii], text_maxlen) 124 | videos[ii] = func_mapping_feature(videos[ii], video_maxlen) 125 | return audios, texts, videos 126 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/pytorch-benchmarks/imagenet/evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Imagenet validation set benchmark 3 | 4 | The module evaluates the performance of a pytorch model on the ILSVRC 2012 5 | validation set. 6 | 7 | Based on PyTorch imagenet example: 8 | https://github.com/pytorch/examples/tree/master/imagenet 9 | """ 10 | 11 | from __future__ import division 12 | 13 | import os 14 | import time 15 | 16 | from PIL import ImageFile 17 | import torch 18 | import torch.nn.parallel 19 | import torch.utils.data 20 | import torch.backends.cudnn as cudnn 21 | import torchvision.datasets as datasets 22 | from utils.benchmark_helpers import compose_transforms 23 | 24 | ImageFile.LOAD_TRUNCATED_IMAGES = True 25 | 26 | 27 | def imagenet_benchmark(model, data_dir, res_cache, refresh_cache, batch_size=256, 28 | num_workers=20, remove_blacklist=False, center_crop=True, 29 | override_meta_imsize=False): 30 | if not refresh_cache: # load result from cache, if available 31 | if os.path.isfile(res_cache): 32 | res = torch.load(res_cache) 33 | prec1, prec5, speed = res['prec1'], res['prec5'], res['speed'] 34 | print("=> loaded results from '{}'".format(res_cache)) 35 | info = (100 - prec1, 100 - prec5, speed) 36 | msg = 'Top 1 err: {:.2f}, Top 5 err: {:.2f}, Speed: {:.1f}Hz' 37 | print(msg.format(*info)) 38 | return 39 | 40 | meta = model.meta 41 | cudnn.benchmark = True 42 | 43 | if override_meta_imsize: # NOTE REMOVE THIS LATER! 
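# When the meta-specified image size is overridden, replace the model's
# `features_8` module with adaptive average pooling so its output collapses
# to a 1x1 feature map regardless of the input resolution.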
44 | import torch.nn as nn 45 | model.features_8 = nn.AdaptiveAvgPool2d(1) 46 | 47 | model = torch.nn.DataParallel(model).cuda() 48 | if remove_blacklist: 49 | subset = 'val_blacklisted' 50 | else: 51 | subset = 'val' 52 | valdir = os.path.join(data_dir, subset) 53 | preproc_transforms = compose_transforms(meta, resize=256, center_crop=center_crop, 54 | override_meta_imsize=override_meta_imsize) 55 | val_loader = torch.utils.data.DataLoader( 56 | datasets.ImageFolder(valdir, preproc_transforms), batch_size=batch_size, 57 | shuffle=False, num_workers=num_workers, pin_memory=True) 58 | prec1, prec5, speed = validate(val_loader, model) 59 | torch.save({'prec1': prec1, 'prec5': prec5, 'speed': speed}, res_cache) 60 | 61 | 62 | def validate(val_loader, model): 63 | model.eval() 64 | top1 = AverageMeter() 65 | top5 = AverageMeter() 66 | speed = WarmupAverageMeter() 67 | end = time.time() 68 | with torch.no_grad(): 69 | for ii, (ims, target) in enumerate(val_loader): 70 | target = target.cuda() 71 | # ims_var = torch.autograd.Variable(ims, volatile=True) 72 | output = model(ims) # compute output 73 | prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) 74 | top1.update(prec1[0], ims.size(0)) 75 | top5.update(prec5[0], ims.size(0)) 76 | speed.update(time.time() - end, ims.size(0)) 77 | end = time.time() 78 | if ii % 10 == 0: 79 | msg = ('Test: [{0}/{1}]\tSpeed {speed.current:.1f}Hz\t' 80 | '({speed.avg:.1f})Hz\tPrec@1 {top1.avg:.3f} ' 81 | '{top5.avg:.3f}') 82 | print(msg.format(ii, len(val_loader), speed=speed, top1=top1, 83 | top5=top5)) 84 | top1_err, top5_err = 100 - top1.avg, 100 - top5.avg 85 | print(' * Err@1 {0:.3f} Err@5 {1:.3f}'.format(top1_err, top5_err)) 86 | 87 | return top1.avg, top5.avg, speed.avg 88 | 89 | 90 | class WarmupAverageMeter(object): 91 | """Computes and stores the average and current value, after a fixed 92 | warmup period (useful for approximate benchmarking) 93 | 94 | Args: 95 | warmup (int) [3]: The number of updates to be ignored before the 96 | average starts to be computed. 
97 | """ 98 | 99 | def __init__(self, warmup=3): 100 | self.reset() 101 | self.warmup = warmup 102 | 103 | def reset(self): 104 | self.avg = 0 105 | self.current = 0 106 | self.delta_sum = 0 107 | self.count = 0 108 | self.warmup_count = 0 109 | 110 | def update(self, delta, n): 111 | self.warmup_count = self.warmup_count + 1 112 | if self.warmup_count >= self.warmup: 113 | self.current = n / delta 114 | self.delta_sum += delta 115 | self.count += n 116 | self.avg = self.count / self.delta_sum 117 | 118 | 119 | class AverageMeter(object): 120 | """Computes and stores the average and current value""" 121 | 122 | def __init__(self): 123 | self.reset() 124 | 125 | def reset(self): 126 | self.val = 0 127 | self.avg = 0 128 | self.sum = 0 129 | self.count = 0 130 | 131 | def update(self, val, n=1): 132 | self.val = val 133 | self.sum += val * n 134 | self.count += n 135 | self.avg = self.sum / self.count 136 | 137 | 138 | def accuracy(output, target, topk=(1, )): 139 | """Computes the precision@k for the specified values of k""" 140 | maxk = max(topk) 141 | batch_size = target.size(0) 142 | 143 | _, pred = output.topk(maxk, 1, True, True) 144 | pred = pred.t() 145 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 146 | 147 | res = [] 148 | for k in topk: 149 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 150 | res.append(correct_k.mul_(100.0 / batch_size)) 151 | return res 152 | -------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/audio/vggish/vggish_slim.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Defines the 'VGGish' model used to generate AudioSet embedding features. 17 | 18 | The public AudioSet release (https://research.google.com/audioset/download.html) 19 | includes 128-D features extracted from the embedding layer of a VGG-like model 20 | that was trained on a large Google-internal YouTube dataset. Here we provide 21 | a TF-Slim definition of the same model, without any dependences on libraries 22 | internal to Google. We call it 'VGGish'. 23 | 24 | Note that we only define the model up to the embedding layer, which is the 25 | penultimate layer before the final classifier layer. We also provide various 26 | hyperparameter values (in vggish_params.py) that were used to train this model 27 | internally. 
28 | 29 | For comparison, here is TF-Slim's VGG definition: 30 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py 31 | """ 32 | 33 | import tensorflow.compat.v1 as tf 34 | tf.disable_v2_behavior() 35 | import tf_slim as slim # version: 1.1.0, pip install tf-slim 36 | 37 | from vggish import vggish_params as params 38 | 39 | 40 | def define_vggish_slim(training=False): 41 | """Defines the VGGish TensorFlow model. 42 | 43 | All ops are created in the current default graph, under the scope 'vggish/'. 44 | 45 | The input is a placeholder named 'vggish/input_features' of type float32 and 46 | shape [batch_size, num_frames, num_bands] where batch_size is variable and 47 | num_frames and num_bands are constants, and [num_frames, num_bands] represents 48 | a log-mel-scale spectrogram patch covering num_bands frequency bands and 49 | num_frames time frames (where each frame step is usually 10ms). This is 50 | produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET). 51 | The output is an op named 'vggish/embedding' which produces the activations of 52 | a 128-D embedding layer, which is usually the penultimate layer when used as 53 | part of a full model with a final classifier layer. 54 | 55 | Args: 56 | training: If true, all parameters are marked trainable. 57 | 58 | Returns: 59 | The op 'vggish/embeddings'. 60 | """ 61 | # Defaults: 62 | # - All weights are initialized to N(0, INIT_STDDEV). 63 | # - All biases are initialized to 0. 64 | # - All activations are ReLU. 65 | # - All convolutions are 3x3 with stride 1 and SAME padding. 66 | # - All max-pools are 2x2 with stride 2 and SAME padding. 67 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 68 | weights_initializer=tf.truncated_normal_initializer( 69 | stddev=params.INIT_STDDEV), 70 | biases_initializer=tf.zeros_initializer(), 71 | activation_fn=tf.nn.relu, 72 | trainable=training), \ 73 | slim.arg_scope([slim.conv2d], 74 | kernel_size=[3, 3], stride=1, padding='SAME'), \ 75 | slim.arg_scope([slim.max_pool2d], 76 | kernel_size=[2, 2], stride=2, padding='SAME'), \ 77 | tf.variable_scope('vggish'): 78 | # Input: a batch of 2-D log-mel-spectrogram patches. 79 | features = tf.placeholder( 80 | tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS), 81 | name='input_features') 82 | # Reshape to 4-D so that we can convolve a batch with conv2d(). 83 | net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1]) 84 | 85 | # The VGG stack of alternating convolutions and max-pools. 86 | net = slim.conv2d(net, 64, scope='conv1') 87 | net = slim.max_pool2d(net, scope='pool1') 88 | net = slim.conv2d(net, 128, scope='conv2') 89 | net = slim.max_pool2d(net, scope='pool2') 90 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3') 91 | net = slim.max_pool2d(net, scope='pool3') 92 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4') 93 | net = slim.max_pool2d(net, scope='pool4') 94 | 95 | # Flatten before entering fully-connected layers 96 | net = slim.flatten(net) 97 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1') 98 | # The embedding layer. 99 | net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2') 100 | return tf.identity(net, name='embedding') 101 | 102 | 103 | def load_vggish_slim_checkpoint(session, checkpoint_path): 104 | """Loads a pre-trained VGGish-compatible checkpoint. 
105 | 106 | This function can be used as an initialization function (referred to as 107 | init_fn in TensorFlow documentation) which is called in a Session after 108 | initializating all variables. When used as an init_fn, this will load 109 | a pre-trained checkpoint that is compatible with the VGGish model 110 | definition. Only variables defined by VGGish will be loaded. 111 | 112 | Args: 113 | session: an active TensorFlow session. 114 | checkpoint_path: path to a file containing a checkpoint that is 115 | compatible with the VGGish model definition. 116 | """ 117 | # Get the list of names of all VGGish variables that exist in 118 | # the checkpoint (i.e., all inference-mode VGGish variables). 119 | with tf.Graph().as_default(): 120 | define_vggish_slim(training=False) 121 | vggish_var_names = [v.name for v in tf.global_variables()] 122 | 123 | # Get the list of all currently existing variables that match 124 | # the list of variable names we just computed. 125 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names] 126 | 127 | # Use a Saver to restore just the variables selected above. 128 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained', 129 | write_version=1) 130 | saver.restore(session, checkpoint_path) 131 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/models/mfn.py: -------------------------------------------------------------------------------- 1 | """ 2 | paper: Memory Fusion Network for Multi-View Sequential Learning 3 | From: https://github.com/pliang279/MFN 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | class MFN(nn.Module): 10 | def __init__(self, args): 11 | super(MFN, self).__init__() 12 | 13 | # params: analyze args 14 | audio_dim = args.audio_dim 15 | text_dim = args.text_dim 16 | video_dim = args.video_dim 17 | output_dim1 = args.output_dim1 18 | output_dim2 = args.output_dim2 19 | dropout = args.dropout 20 | self.mem_dim = args.mem_dim 21 | self.hidden_dim = args.hidden_dim 22 | self.grad_clip = args.grad_clip 23 | 24 | # params: intermedia 25 | total_h_dim = self.hidden_dim * 3 26 | attInShape = total_h_dim * args.window_dim 27 | gammaInShape = attInShape + self.mem_dim 28 | final_out = total_h_dim + self.mem_dim 29 | output_dim = self.hidden_dim // 2 30 | 31 | # each modality has one lstm cell 32 | self.lstm_l = nn.LSTMCell(text_dim, self.hidden_dim) 33 | self.lstm_a = nn.LSTMCell(audio_dim, self.hidden_dim) 34 | self.lstm_v = nn.LSTMCell(video_dim, self.hidden_dim) 35 | 36 | self.att1_fc1 = nn.Linear(attInShape, self.hidden_dim) 37 | self.att1_fc2 = nn.Linear(self.hidden_dim, attInShape) 38 | self.att1_dropout = nn.Dropout(dropout) 39 | 40 | self.att2_fc1 = nn.Linear(attInShape, self.hidden_dim) 41 | self.att2_fc2 = nn.Linear(self.hidden_dim, self.mem_dim) 42 | self.att2_dropout = nn.Dropout(dropout) 43 | 44 | self.gamma1_fc1 = nn.Linear(gammaInShape, self.hidden_dim) 45 | self.gamma1_fc2 = nn.Linear(self.hidden_dim, self.mem_dim) 46 | self.gamma1_dropout = nn.Dropout(dropout) 47 | 48 | self.gamma2_fc1 = nn.Linear(gammaInShape, self.hidden_dim) 49 | self.gamma2_fc2 = nn.Linear(self.hidden_dim, self.mem_dim) 50 | self.gamma2_dropout = nn.Dropout(dropout) 51 | 52 | self.out_fc1 = nn.Linear(final_out, self.hidden_dim) 53 | self.out_fc2 = nn.Linear(self.hidden_dim, output_dim) 54 | self.out_dropout = nn.Dropout(dropout) 55 | 56 | # output results 57 | self.fc_out_1 = nn.Linear(output_dim, output_dim1) 58 | self.fc_out_2 = 
nn.Linear(output_dim, output_dim2)
59 | 
60 | 
61 | # MFN needs aligned multimodal features
62 | def forward(self, batch):
63 | 
64 | '''
65 | simulating word-align network (for seq_len_T == seq_len_A == seq_len_V)
66 | audio_x: tensor of shape (batch, seqlen, audio_in)
67 | video_x: tensor of shape (batch, seqlen, video_in)
68 | text_x: tensor of shape (batch, seqlen, text_in)
69 | '''
70 | assert batch['audios'].size()[1] == batch['videos'].size()[1]
71 | assert batch['audios'].size()[1] == batch['texts'].size()[1]
72 | 
73 | text_x = batch['texts'].permute(1,0,2) # [seqlen, batch, dim]
74 | audio_x = batch['audios'].permute(1,0,2) # [seqlen, batch, dim]
75 | video_x = batch['videos'].permute(1,0,2) # [seqlen, batch, dim]
76 | 
77 | # x is t x n x d
78 | n = text_x.size()[1] # n = batch
79 | t = text_x.size()[0] # t = seqlen
80 | self.h_l = torch.zeros(n, self.hidden_dim).cuda()
81 | self.h_a = torch.zeros(n, self.hidden_dim).cuda()
82 | self.h_v = torch.zeros(n, self.hidden_dim).cuda()
83 | self.c_l = torch.zeros(n, self.hidden_dim).cuda()
84 | self.c_a = torch.zeros(n, self.hidden_dim).cuda()
85 | self.c_v = torch.zeros(n, self.hidden_dim).cuda()
86 | self.mem = torch.zeros(n, self.mem_dim).cuda()
87 | all_h_ls = []
88 | all_h_as = []
89 | all_h_vs = []
90 | all_c_ls = []
91 | all_c_as = []
92 | all_c_vs = []
93 | all_mems = []
94 | for i in range(t): # process each LSTM time step separately
95 | 
96 | # prev time step [here c refers to the LSTM cell state]
97 | prev_c_l = self.c_l
98 | prev_c_a = self.c_a
99 | prev_c_v = self.c_v
100 | 
101 | # curr time step
102 | new_h_l, new_c_l = self.lstm_l(text_x[i], (self.h_l, self.c_l))
103 | new_h_a, new_c_a = self.lstm_a(audio_x[i], (self.h_a, self.c_a))
104 | new_h_v, new_c_v = self.lstm_v(video_x[i], (self.h_v, self.c_v))
105 | 
106 | # concatenate and attention
107 | prev_cs = torch.cat([prev_c_l,prev_c_a,prev_c_v], dim=1)
108 | new_cs = torch.cat([new_c_l, new_c_a, new_c_v], dim=1)
109 | cStar = torch.cat([prev_cs, new_cs], dim=1)
110 | attention = F.softmax(self.att1_fc2(self.att1_dropout(F.relu(self.att1_fc1(cStar)))),dim=1)
111 | attended = attention * cStar
112 | cHat = torch.tanh(self.att2_fc2(self.att2_dropout(F.relu(self.att2_fc1(attended)))))
113 | both = torch.cat([attended, self.mem], dim=1)
114 | gamma1 = torch.sigmoid(self.gamma1_fc2(self.gamma1_dropout(F.relu(self.gamma1_fc1(both)))))
115 | gamma2 = torch.sigmoid(self.gamma2_fc2(self.gamma2_dropout(F.relu(self.gamma2_fc1(both)))))
116 | self.mem = gamma1*self.mem + gamma2*cHat
117 | all_mems.append(self.mem)
118 | 
119 | # update (hidden, cell) in lstm
120 | self.h_l, self.c_l = new_h_l, new_c_l
121 | self.h_a, self.c_a = new_h_a, new_c_a
122 | self.h_v, self.c_v = new_h_v, new_c_v
123 | 
124 | all_h_ls.append(self.h_l)
125 | all_h_as.append(self.h_a)
126 | all_h_vs.append(self.h_v)
127 | all_c_ls.append(self.c_l)
128 | all_c_as.append(self.c_a)
129 | all_c_vs.append(self.c_v)
130 | 
131 | # last hidden layer last_hs is n x h [the result of the step-by-step interaction above]
132 | last_h_l = all_h_ls[-1]
133 | last_h_a = all_h_as[-1]
134 | last_h_v = all_h_vs[-1]
135 | last_mem = all_mems[-1]
136 | last_hs = torch.cat([last_h_l, last_h_a, last_h_v, last_mem], dim=1)
137 | features = self.out_fc2(self.out_dropout(F.relu(self.out_fc1(last_hs))))
138 | self.last_hs = last_hs # for outside loading
139 | 
140 | emos_out = self.fc_out_1(features)
141 | vals_out = self.fc_out_2(features)
142 | interloss = torch.tensor(0).cuda()
143 | 
144 | return features, emos_out, vals_out, interloss
145 | 
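# Usage sketch (illustrative; the `args` namespace is hypothetical and must provide the
# dimensions/hyper-parameters read in __init__, and the three modalities must already be
# aligned to a common sequence length, e.g. via align_to_text() in toolkit/utils/read_data.py):
#
#   model = MFN(args).cuda()
#   batch = {'audios': torch.randn(8, 20, args.audio_dim).cuda(),
#            'texts':  torch.randn(8, 20, args.text_dim).cuda(),
#            'videos': torch.randn(8, 20, args.video_dim).cuda()}
#   features, emos_out, vals_out, interloss = model(batch)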
-------------------------------------------------------------------------------- /EmotionTalk/feature_extraction/visual/emonet/data/affecnet.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pickle 3 | import numpy as np 4 | import torch 5 | import math 6 | from torch.utils.data import Dataset 7 | from skimage import io 8 | 9 | class AffectNet(Dataset): 10 | _expressions = {0: 'neutral', 1:'happy', 2:'sad', 3:'surprise', 4:'fear', 5:'disgust', 6:'anger', 7:'contempt', 8:'none'} 11 | _expressions_indices = {8: [0, 1, 2, 3, 4, 5, 6, 7], 12 | 5: [0, 1, 2, 3, 6]} 13 | 14 | def __init__(self, root_path, subset='test', 15 | transform_image_shape=None, transform_image=None, 16 | n_expression=5, verbose=1, cleaned_set=True): 17 | self.root_path = Path(root_path).expanduser() 18 | self.subset = subset 19 | self.image_path = self.root_path.joinpath(subset) 20 | self.transform_image_shape = transform_image_shape 21 | self.transform_image = transform_image 22 | self.verbose = verbose 23 | 24 | #if cleaned_set and (subset not in ['test', 'val']): 25 | # raise ValueError('cleaned_set can only be set to True for the val or test set, train has not been cleaned') 26 | self.cleaned_set = cleaned_set 27 | 28 | if n_expression not in [5, 8]: 29 | raise ValueError(f'n_expression should be either 5 or 8, but got n_expression={n_expression}') 30 | self.n_expression = n_expression 31 | 32 | self.pickle_path = self.root_path.joinpath(f'{subset}_fullpath.pkl') 33 | with open(self.pickle_path, 'br') as f: 34 | data = pickle.load(f) 35 | self.data = data 36 | 37 | # the keys are the image names (name.ext) 38 | self.keys = [] 39 | self.skipped = {'other':[], 'pt_pt_error':[], 'expression':[], 'cleaned':[]} 40 | # List of each expression to generate weights 41 | expressions = [] 42 | for key, value in data.items(): 43 | if key == 'folder': 44 | continue 45 | if (int(value['expression']) not in self._expressions_indices[self.n_expression]): 46 | self.skipped['expression'].append(key) 47 | continue 48 | if self.cleaned_set and (not value['expression_correct']): 49 | self.skipped['cleaned'].append(key) 50 | continue 51 | 52 | expression = int(value['expression']) 53 | if self.cleaned_set: 54 | #Automatic cleaning : expression has to match the valence and arousal values 55 | valence = float(value['valence']) 56 | arousal = float(value['arousal']) 57 | intensity = math.sqrt(valence**2+arousal**2) 58 | 59 | if expression == 0 and intensity>=0.2: 60 | self.skipped['other'].append(key) 61 | continue 62 | elif expression == 1 and (valence<=0 or intensity<=0.2): 63 | self.skipped['other'].append(key) 64 | continue 65 | elif expression == 2 and (valence>=0 or intensity<=0.2): 66 | self.skipped['other'].append(key) 67 | continue 68 | elif expression == 3 and (arousal<=0 or intensity<=0.2): 69 | self.skipped['other'].append(key) 70 | continue 71 | elif expression == 4 and (not(arousal>=0 and valence<=0) or intensity<=0.2): 72 | self.skipped['other'].append(key) 73 | continue 74 | elif expression == 5 and (valence>=0 or intensity<=0.3): 75 | self.skipped['other'].append(key) 76 | continue 77 | elif expression == 6 and (arousal<=0 or intensity<=0.2): 78 | self.skipped['other'].append(key) 79 | continue 80 | elif expression == 7 and (valence>=0 or intensity<=0.2): 81 | self.skipped['other'].append(key) 82 | continue 83 | 84 | if self.n_expression == 5 and expression == 6: 85 | expression = 4 86 | expressions.append(expression) 87 | self.keys.append(key) 88 | 89 
| expressions = np.array(expressions) 90 | self.sample_per_class = {label:np.sum(expressions == label) for label in np.unique(expressions)} 91 | self.expression_weights = np.array([1./self.sample_per_class[e] for e in expressions]) 92 | self.average_per_class = int(np.mean(list(self.sample_per_class.values()))) 93 | 94 | if self.verbose: 95 | skipped = sum([len(self.skipped[key]) for key in self.skipped]) 96 | msg = f' -- {len(self.keys)} images, skipped {len(self.skipped)} images ({len(self.skipped["pt_pt_error"])} with large errors).' 97 | print(msg) 98 | print(f'Samples per class : {self.sample_per_class}') 99 | 100 | def __len__(self): 101 | return len(self.keys) 102 | 103 | def __getitem__(self, index): 104 | key = self.keys[index] 105 | sample_data = self.data[key] 106 | 107 | image_file = self.image_path.joinpath(key).as_posix() 108 | 109 | valence = torch.tensor([float(sample_data['valence'])], dtype=torch.float32) 110 | arousal = torch.tensor([float(sample_data['arousal'])], dtype=torch.float32) 111 | expression = int(sample_data['expression']) 112 | 113 | if self.n_expression == 5 and expression == 6: 114 | expression = 4 115 | 116 | landmarks = sample_data['landmarks_fan'] 117 | 118 | if isinstance(landmarks, list): 119 | landmarks = np.array(landmarks) 120 | image = io.imread(image_file) 121 | 122 | if self.transform_image_shape is not None: 123 | bounding_box = [landmarks.min(axis=0)[0], landmarks.min(axis=0)[1], 124 | landmarks.max(axis=0)[0], landmarks.max(axis=0)[1]] 125 | #image, landmarks = self.transform_image_shape(image, shape=landmarks) 126 | image, landmarks = self.transform_image_shape(image, bb=bounding_box) 127 | # Fix for PyTorch currently not supporting negative stric 128 | image = np.ascontiguousarray(image) 129 | 130 | if self.transform_image is not None: 131 | image = self.transform_image(image) 132 | 133 | return dict(valence=valence, arousal=arousal, expression=expression, image=image, au=[]) 134 | 135 | -------------------------------------------------------------------------------- /EmotionTalk/toolkit/preprocess/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ############ For LINUX ############## 4 | DATA_DIR = { 5 | 'MER2023': '/share/home/lianzheng/chinese-mer-2023/dataset/mer2023-dataset-process', 6 | 'IEMOCAPFour': '/mnt/real_sda/sunhaoqin_space/code/MERTools-master/MERBench/dataset/iemocap-process', 7 | 'IEMOCAPSix': '/mnt/real_sda/sunhaoqin_space/code/MERTools-master/MERBench/dataset/iemocap-process', 8 | 'CMUMOSI': '/share/home/lianzheng/chinese-mer-2023/dataset/cmumosi-process', 9 | 'CMUMOSEI': '/share/home/lianzheng/chinese-mer-2023/dataset/cmumosei-process', 10 | 'SIMS': '/share/home/lianzheng/chinese-mer-2023/dataset/sims-process', 11 | 'MELD': '/share/home/lianzheng/chinese-mer-2023/dataset/meld-process', 12 | 'SIMSv2': '/share/home/lianzheng/chinese-mer-2023/dataset/simsv2-process', 13 | } 14 | PATH_TO_RAW_AUDIO = { 15 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'audio'), 16 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'subaudio'), 17 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'subaudio'), 18 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'subaudio'), 19 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'subaudio'), 20 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'audio'), 21 | 'MELD': os.path.join(DATA_DIR['MELD'], 'subaudio'), 22 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'audio'), 23 | } 24 | PATH_TO_RAW_VIDEO = { 25 | 'MER2023': 
os.path.join(DATA_DIR['MER2023'], 'video'), 26 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'subvideo-tgt'), 27 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'subvideo-tgt'), 28 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'subvideo'), 29 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'subvideo'), 30 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'video'), 31 | 'MELD': os.path.join(DATA_DIR['MELD'], 'subvideo'), 32 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'video'), 33 | } 34 | PATH_TO_RAW_FACE = { 35 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'openface_face'), 36 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'openface_face'), 37 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'openface_face'), 38 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'openface_face'), 39 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'openface_face'), 40 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'openface_face'), 41 | 'MELD': os.path.join(DATA_DIR['MELD'], 'openface_face'), 42 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'openface_face'), 43 | } 44 | PATH_TO_TRANSCRIPTIONS = { 45 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'transcription-engchi-polish.csv'), 46 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'transcription-engchi-polish.csv'), 47 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'transcription-engchi-polish.csv'), 48 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'transcription-engchi-polish.csv'), 49 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'transcription-engchi-polish.csv'), 50 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'transcription-engchi-polish.csv'), 51 | 'MELD': os.path.join(DATA_DIR['MELD'], 'transcription-engchi-polish.csv'), 52 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'transcription-engchi-polish.csv'), 53 | } 54 | PATH_TO_FEATURES = { 55 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'features'), 56 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'features'), 57 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'features'), 58 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'features'), 59 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'features'), 60 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'features'), 61 | 'MELD': os.path.join(DATA_DIR['MELD'], 'features'), 62 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'features'), 63 | } 64 | PATH_TO_LABEL = { 65 | 'MER2023': os.path.join(DATA_DIR['MER2023'], 'label-6way.npz'), 66 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'label_4way.npz'), 67 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'label_6way.npz'), 68 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'label.npz'), 69 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'label.npz'), 70 | 'SIMS': os.path.join(DATA_DIR['SIMS'], 'label.npz'), 71 | 'MELD': os.path.join(DATA_DIR['MELD'], 'label.npz'), 72 | 'SIMSv2': os.path.join(DATA_DIR['SIMSv2'], 'label.npz'), 73 | } 74 | 75 | # pre-trained models, including supervised and unsupervised 76 | PATH_TO_PRETRAINED_MODELS = './tools' 77 | PATH_TO_OPENSMILE = './tools/opensmile-2.3.0/' 78 | PATH_TO_FFMPEG = '/mnt/real_sda/sunhaoqin_space/code/ffmpeg-4.4.1-i686-static/ffmpeg' 79 | 80 | # dir 81 | SAVED_ROOT = os.path.join('./saved') 82 | MODEL_DIR = os.path.join(SAVED_ROOT, 'model') 83 | LOG_DIR = os.path.join(SAVED_ROOT, 'log') 84 | PREDICTION_DIR = os.path.join(SAVED_ROOT, 'prediction') 85 | FUSION_DIR = os.path.join(SAVED_ROOT, 'fusion') 86 | SUBMISSION_DIR = os.path.join(SAVED_ROOT, 'submission') 87 | 88 | 89 | ############ For Windows [OpenFace to extract 
face] ##############
90 | DATA_DIR_Win = {
91 | 'CMUMOSI': 'E:\\Dataset\\CMU-MOSI\\Raw',
92 | 'CMUMOSEI': 'E:\\Dataset\\CMU-MOSEI',
93 | 'MER2023': 'H:\\desktop\\Multimedia-Transformer\\chinese-mer-2023\\mer2023-dataset-process',
94 | 'IEMOCAP': 'E:\\Dataset\\iemocap-process',
95 | 'MELD': 'E:\\Dataset\\meld-process',
96 | 'SIMS': 'F:\\CH-SIMS-process',
97 | 'SIMSv2': 'E:\\Dataset\\simsv2-process',
98 | }
99 | 
100 | PATH_TO_RAW_FACE_Win = {
101 | 'CMUMOSI': os.path.join(DATA_DIR_Win['CMUMOSI'], 'Video\\Segmented'),
102 | 'CMUMOSEI': os.path.join(DATA_DIR_Win['CMUMOSEI'], 'video'),
103 | 'MER2023': os.path.join(DATA_DIR_Win['MER2023'], 'video'),
104 | 'SIMS': os.path.join(DATA_DIR_Win['SIMS'], 'video'),
105 | 'IEMOCAP': os.path.join(DATA_DIR_Win['IEMOCAP'], 'subvideo-tgt'),
106 | 'MELD': os.path.join(DATA_DIR_Win['MELD'], 'subvideo'),
107 | 'SIMSv2': os.path.join(DATA_DIR_Win['SIMSv2'], 'video'),
108 | }
109 | 
110 | PATH_TO_FEATURES_Win = {
111 | 'CMUMOSI': os.path.join(DATA_DIR_Win['CMUMOSI'], 'features'),
112 | 'CMUMOSEI': os.path.join(DATA_DIR_Win['CMUMOSEI'], 'features'),
113 | 'MER2023': os.path.join(DATA_DIR_Win['MER2023'], 'features'),
114 | 'SIMS': os.path.join(DATA_DIR_Win['SIMS'], 'features'),
115 | 'IEMOCAP': os.path.join(DATA_DIR_Win['IEMOCAP'], 'features'),
116 | 'MELD': os.path.join(DATA_DIR_Win['MELD'], 'features'),
117 | 'SIMSv2': os.path.join(DATA_DIR_Win['SIMSv2'], 'features'),
118 | }
119 | 
120 | PATH_TO_OPENFACE_Win = "H:\\desktop\\Multimedia-Transformer\\MERBench-master\\tools\\openface_win_x64"
121 | --------------------------------------------------------------------------------
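config.py keeps parallel path tables for Linux and for the Windows OpenFace pipeline, all keyed by dataset name, so downstream code can resolve every resource for a dataset from a single key. A minimal sketch (assuming config.py is importable from the working directory; the key must be one of the datasets defined above):

    import config
    dataset = 'MER2023'
    label_path   = config.PATH_TO_LABEL[dataset]
    feature_root = config.PATH_TO_FEATURES[dataset]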