├── LICENSE
├── MultiMAE-DER_Fine-Tuning Code
│   └── MultiMAE_DER_FSLF.ipynb
├── MultiMAE-DER_Preprocessing Code
│   ├── MFCC.jpg
│   ├── Preprocessing_Audio.py
│   ├── Preprocessing_CFAS.py
│   ├── Preprocessing_FFLS.py
│   ├── Preprocessing_FSLF.py
│   ├── Preprocessing_OFOS.py
│   ├── Preprocessing_RFAS.py
│   ├── Preprocessing_SFAS.py
│   ├── Tool.py
│   ├── audio_img.jpg
│   ├── img.jpg
│   └── video_img.jpg
├── README.md
└── images
    ├── MultiMAE-DER.png
    ├── MultiMAE-DER_Program_Flowchart.png
    ├── Multimodal_Sequence_Fusion_Strategy.png
    ├── Result_on_CREMA-D.png
    ├── Result_on_IEMOCAP.png
    └── Result_on_RAVDESS.png
/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/MFCC.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/MultiMAE-DER_Preprocessing Code/MFCC.jpg -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/Preprocessing_Audio.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import cv2 3 | import shutil 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | import matplotlib.pyplot as plt 8 | from decord import VideoReader 9 | from moviepy.editor import AudioFileClip 10 | 11 | from scipy.io import wavfile # scipy library to read wav files 12 | import numpy as np 13 | from scipy.fftpack import dct 14 | from matplotlib import pyplot as plt 15 | from PIL import Image 16 | 17 | input_size = 224 18 | mid_point = 17 19 | 20 | def normalize_audio(audio): 21 | audio = audio / np.max(np.abs(audio)) 22 | return audio 23 | 24 | def MFCC(signal,sample_rate): 25 | pre_emphasis = 0.97 26 | emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1]) 27 | 28 | frame_size = 0.025 29 | frame_stride = 0.0001 30 | 31 | frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples 32 | signal_length = len(emphasized_signal) 33 | frame_length = int(round(frame_length)) 34 | frame_step = int(round(frame_step)) 35 | num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame 36 | 37 | pad_signal_length = num_frames * frame_step + frame_length 38 | z = np.zeros((pad_signal_length - signal_length)) 39 | pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal 40 | 41 | indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T 42 | frames = pad_signal[indices.astype(np.int32, copy=False)] 43 | frames *= np.hamming(frame_length) 44 | NFFT = 512 45 | 46 | mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT 47 | pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum 48 | nfilt = 40 49 | 50 | low_freq_mel = 0 51 | high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel 52 | mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale 53 | hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz 54 | bin = np.floor((NFFT + 1) * hz_points / sample_rate) 55 | 56 | fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1)))) 57 | for m in range(1, nfilt + 1): 58 | f_m_minus = int(bin[m - 1]) # left 59 | f_m = int(bin[m]) # center 60 | f_m_plus = int(bin[m + 1]) # right 61 | 62 | for k in range(f_m_minus, f_m): 63 | fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1]) 64 | for k in range(f_m, f_m_plus): 65 | fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m]) 66 | filter_banks = np.dot(pow_frames, fbank.T) 67 | filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability 68 | filter_banks = 20 * np.log10(filter_banks) # dB 69 | num_ceps = 13 70 | mfcc = dct(filter_banks, type = 2, axis=1, 
norm="ortho")[:,1: (num_ceps + 1)] # keep 2-13 71 | cep_lifter = 22 72 | (nframes, ncoeff) = mfcc.shape 73 | n = np.arange(ncoeff) 74 | lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n/ cep_lifter) 75 | mfcc *= lift 76 | return mfcc 77 | 78 | def preprocessing_audio(path, save_path): 79 | n = 1 80 | 81 | for class_name in os.listdir(path): 82 | class_dir = os.path.join(path, class_name) 83 | save_dir = os.path.join(save_path, class_name) 84 | 85 | for video_file in os.listdir(class_dir): 86 | video_path = os.path.join(class_dir, video_file) 87 | 88 | video_name = os.path.basename(video_path).split(".")[0] 89 | mp4_name = str(n) + '.mp4' 90 | path_video_save = os.path.join(save_dir, mp4_name) 91 | 92 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 93 | output_video = cv2.VideoWriter(path_video_save, fourcc, 16.0, (224, 224)) 94 | 95 | audio_clip = AudioFileClip(video_path) 96 | audio_name = os.path.basename(video_path).split(".")[0] 97 | wave_name = str(audio_name) + '.wav' 98 | path_audio_save = os.path.join('Data\\RAVDESS\\RAVDESS_WAVE', wave_name) 99 | 100 | audio_clip.write_audiofile(path_audio_save) 101 | fs, Audiodata = wavfile.read(path_audio_save) 102 | Audiodata = normalize_audio(Audiodata) 103 | step=int((len(Audiodata))/mid_point) - 1 104 | tx=np.arange(0,len(Audiodata),step) 105 | 106 | # Only Save Audio Spectrograms 107 | for i in range(16): 108 | signal=Audiodata[tx[i]:tx[i+2]] 109 | mfcc=MFCC(signal,fs) 110 | 111 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4)) 112 | cax = ax.matshow( 113 | np.transpose(mfcc), 114 | interpolation="nearest", 115 | aspect="auto", 116 | # cmap=plt.cm.afmhot_r, 117 | origin="lower", 118 | ) 119 | 120 | plt.axis('off') 121 | fig.savefig("MFCC.jpg") 122 | audio_img = Image.open("MFCC.jpg") 123 | audio_img = audio_img.resize((224, 224)) 124 | audio_img = np.array(audio_img) 125 | 126 | cv2.imwrite('audio_img.jpg',audio_img) 127 | audio_img = Image.open("audio_img.jpg") 128 | audio_img = audio_img.resize((224, 224)) 129 | audio_img = np.array(audio_img) 130 | 131 | plt.close('all') 132 | output_video.write(audio_img) 133 | 134 | output_video.release() 135 | cv2.destroyAllWindows() 136 | n = n + 1 137 | 138 | return n 139 | 140 | if __name__ == '__main__': 141 | 142 | shutil.rmtree("Data\\RAVDESS\\RAVDESS_WAVE") 143 | os.mkdir("Data\\RAVDESS\\RAVDESS_WAVE") 144 | 145 | path_train = 'Data\\RAVDESS\\RAVDESS_RAW\\train' 146 | save_train_path = 'Data\\RAVDESS\\RAVDESS_Audio\\train' 147 | 148 | # path_test = 'Data\\RAVDESS\\RAVDESS_RAW\\test' 149 | # save_test_path = 'Data\\RAVDESS\\RAVDESS_Audio\\test' 150 | 151 | # path_val = 'Data\\RAVDESS\\RAVDESS_RAW\\val' 152 | # save_val_path = 'Data\\RAVDESS\\RAVDESS_Audio\\val' 153 | 154 | n_train = preprocessing_audio(path_train, save_train_path) 155 | # n_test = preprocessing_audio(path_test, save_test_path) 156 | # n_val = preprocessing_audio(path_val, save_val_path) 157 | 158 | print(n_train) 159 | # print(n_test) 160 | # print(n_val) 161 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/Preprocessing_CFAS.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import cv2 3 | import shutil 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | import matplotlib.pyplot as plt 8 | from decord import VideoReader 9 | from moviepy.editor import AudioFileClip 10 | 11 | from scipy.io import wavfile # scipy library to read wav files 12 | import numpy as np 13 | from 
scipy.fftpack import dct 14 | from matplotlib import pyplot as plt 15 | from PIL import Image 16 | 17 | input_size = 224 18 | num_frame = 16 19 | sampling_rate = 3 20 | 21 | def normalize_audio(audio): 22 | audio = audio / np.max(np.abs(audio)) 23 | return audio 24 | 25 | def MFCC(signal,sample_rate): 26 | pre_emphasis = 0.97 27 | emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1]) 28 | 29 | frame_size = 0.025 30 | frame_stride = 0.0001 31 | 32 | frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples 33 | signal_length = len(emphasized_signal) 34 | frame_length = int(round(frame_length)) 35 | frame_step = int(round(frame_step)) 36 | num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame 37 | 38 | pad_signal_length = num_frames * frame_step + frame_length 39 | z = np.zeros((pad_signal_length - signal_length)) 40 | pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal 41 | 42 | indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T 43 | frames = pad_signal[indices.astype(np.int32, copy=False)] 44 | frames *= np.hamming(frame_length) 45 | NFFT = 512 46 | 47 | mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT 48 | pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum 49 | nfilt = 40 50 | 51 | low_freq_mel = 0 52 | high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel 53 | mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale 54 | hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz 55 | bin = np.floor((NFFT + 1) * hz_points / sample_rate) 56 | 57 | fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1)))) 58 | for m in range(1, nfilt + 1): 59 | f_m_minus = int(bin[m - 1]) # left 60 | f_m = int(bin[m]) # center 61 | f_m_plus = int(bin[m + 1]) # right 62 | 63 | for k in range(f_m_minus, f_m): 64 | fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1]) 65 | for k in range(f_m, f_m_plus): 66 | fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m]) 67 | filter_banks = np.dot(pow_frames, fbank.T) 68 | filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability 69 | filter_banks = 20 * np.log10(filter_banks) # dB 70 | num_ceps = 13 71 | mfcc = dct(filter_banks, type = 2, axis=1, norm="ortho")[:,1: (num_ceps + 1)] # keep 2-13 72 | cep_lifter = 22 73 | (nframes, ncoeff) = mfcc.shape 74 | n = np.arange(ncoeff) 75 | lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n/ cep_lifter) 76 | mfcc *= lift 77 | return mfcc 78 | 79 | def read_video(file_path): 80 | vr = VideoReader(file_path) 81 | frames = vr.get_batch(range(len(vr))).asnumpy() 82 | return format_frames( 83 | frames, 84 | output_size=(input_size, input_size) 85 | ) 86 | 87 | def format_frames(frame, output_size): 88 | frame = tf.image.convert_image_dtype(frame, tf.uint8) 89 | frame = tf.image.resize(frame, size=list(output_size)) 90 | return frame 91 | 92 | def uniform_temporal_subsample( 93 | x, num_samples, clip_idx, total_clips, frame_rate=1, temporal_dim=-4 94 | ): 95 | t = tf.shape(x)[temporal_dim] 96 | max_offset = t - num_samples * frame_rate 97 | step = max_offset // total_clips 98 | offset = clip_idx * step 99 | 
indices = tf.linspace( 100 | tf.cast(offset, tf.float32), 101 | tf.cast(offset + (num_samples-1) * frame_rate, tf.float32), 102 | num_samples 103 | ) 104 | indices = tf.clip_by_value(indices, 0, tf.cast(t - 1, tf.float32)) 105 | indices = tf.cast(tf.round(indices), tf.int32) 106 | return tf.gather(x, indices, axis=temporal_dim) 107 | 108 | 109 | def clip_generator( 110 | image, num_frames=32, frame_rate=1, num_clips=1, crop_size=224 111 | ): 112 | clips_list = [] 113 | for i in range(num_clips): 114 | frame = uniform_temporal_subsample( 115 | image, num_frames, i, num_clips, frame_rate=frame_rate, temporal_dim=0 116 | ) 117 | clips_list.append(frame) 118 | 119 | video = tf.stack(clips_list) 120 | video = tf.reshape( 121 | video, [num_clips*num_frames, crop_size, crop_size, 3] 122 | ) 123 | return video 124 | 125 | def video_audio(path, save_path): 126 | n = 1 127 | 128 | for class_name in os.listdir(path): 129 | class_dir = os.path.join(path, class_name) 130 | save_dir = os.path.join(save_path, class_name) 131 | 132 | for video_file in os.listdir(class_dir): 133 | video_path = os.path.join(class_dir, video_file) 134 | 135 | video_name = os.path.basename(video_path).split(".")[0] 136 | mp4_name = str(video_name) + '.mp4' 137 | path_video_save = os.path.join(save_dir, mp4_name) 138 | 139 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 140 | output_video = cv2.VideoWriter(path_video_save, fourcc, 16.0, (224, 224)) 141 | 142 | video_ds = read_video(video_path) 143 | video_ds = clip_generator(video_ds, num_frame, sampling_rate, num_clips=1) 144 | 145 | audio_clip = AudioFileClip(video_path) 146 | audio_name = os.path.basename(video_path).split(".")[0] 147 | wave_name = str(audio_name) + '.wav' 148 | path_audio_save = os.path.join('Data\\MEAD\\MEAD_WAVE', wave_name) 149 | 150 | audio_clip.write_audiofile(path_audio_save) 151 | fs, Audiodata = wavfile.read(path_audio_save) 152 | Audiodata = normalize_audio(Audiodata) 153 | step=int((len(Audiodata))/17) - 1 154 | tx=np.arange(0,len(Audiodata),step) 155 | 156 | # Combine of Face and Spectrogram 157 | for i in range(16): 158 | video_img = video_ds.numpy()[i] 159 | video_img = video_img.astype('uint8') 160 | 161 | signal=Audiodata[tx[i]:tx[i+2]] 162 | mfcc=MFCC(signal,fs) 163 | 164 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4)) 165 | cax = ax.matshow( 166 | np.transpose(mfcc), 167 | interpolation="nearest", 168 | aspect="auto", 169 | #cmap=plt.cm.afmhot_r, 170 | origin="lower", 171 | ) 172 | 173 | plt.axis('off') 174 | fig.savefig("MFCC.jpg") 175 | audio_img = Image.open("MFCC.jpg") 176 | audio_img = audio_img.resize((224, 224)) 177 | audio_img = np.array(audio_img) 178 | 179 | img = np.concatenate((video_img, audio_img), axis = 0) 180 | cv2.imwrite('img.jpg',img) 181 | img = Image.open("img.jpg") 182 | img = img.resize((224, 224)) 183 | img = np.array(img) 184 | 185 | plt.close('all') 186 | output_video.write(img) 187 | 188 | output_video.release() 189 | cv2.destroyAllWindows() 190 | n = n + 1 191 | 192 | return n 193 | 194 | if __name__ == '__main__': 195 | 196 | shutil.rmtree("Data\\MEAD\\MEAD_WAVE") 197 | os.mkdir("Data\\MEAD\\MEAD_WAVE") 198 | 199 | path_train = 'Data\\MEAD\\MEAD\\train' 200 | save_train_path = 'Data\\MEAD\\MEAD_CFAS\\train' 201 | 202 | path_test = 'Data\\MEAD\\MEAD\\test' 203 | save_test_path = 'Data\\MEAD\\MEAD_CFAS\\test' 204 | 205 | path_val = 'Data\\MEAD\\MEAD\\val' 206 | save_val_path = 'Data\\MEAD\\MEAD_CFAS\\val' 207 | 208 | n_train = video_audio(path_train, save_train_path) 209 | n_test = video_audio(path_test, 
save_test_path) 210 | n_val = video_audio(path_val, save_val_path) 211 | 212 | print(n_train) 213 | print(n_test) 214 | print(n_val) 215 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/Preprocessing_FFLS.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import cv2 3 | import shutil 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | import matplotlib.pyplot as plt 8 | from decord import VideoReader 9 | from moviepy.editor import AudioFileClip 10 | 11 | from scipy.io import wavfile # scipy library to read wav files 12 | import numpy as np 13 | from scipy.fftpack import dct 14 | from matplotlib import pyplot as plt 15 | from PIL import Image 16 | 17 | input_size = 224 18 | num_frame = 8 19 | sampling_rate = 6 20 | 21 | def normalize_audio(audio): 22 | audio = audio / np.max(np.abs(audio)) 23 | return audio 24 | 25 | def MFCC(signal,sample_rate): 26 | pre_emphasis = 0.97 27 | emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1]) 28 | 29 | frame_size = 0.025 30 | frame_stride = 0.0001 31 | 32 | frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples 33 | signal_length = len(emphasized_signal) 34 | frame_length = int(round(frame_length)) 35 | frame_step = int(round(frame_step)) 36 | num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame 37 | 38 | pad_signal_length = num_frames * frame_step + frame_length 39 | z = np.zeros((pad_signal_length - signal_length)) 40 | pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal 41 | 42 | indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T 43 | frames = pad_signal[indices.astype(np.int32, copy=False)] 44 | frames *= np.hamming(frame_length) 45 | NFFT = 512 46 | 47 | mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT 48 | pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum 49 | nfilt = 40 50 | 51 | low_freq_mel = 0 52 | high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel 53 | mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale 54 | hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz 55 | bin = np.floor((NFFT + 1) * hz_points / sample_rate) 56 | 57 | fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1)))) 58 | for m in range(1, nfilt + 1): 59 | f_m_minus = int(bin[m - 1]) # left 60 | f_m = int(bin[m]) # center 61 | f_m_plus = int(bin[m + 1]) # right 62 | 63 | for k in range(f_m_minus, f_m): 64 | fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1]) 65 | for k in range(f_m, f_m_plus): 66 | fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m]) 67 | filter_banks = np.dot(pow_frames, fbank.T) 68 | filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability 69 | filter_banks = 20 * np.log10(filter_banks) # dB 70 | num_ceps = 13 71 | mfcc = dct(filter_banks, type = 2, axis=1, norm="ortho")[:,1: (num_ceps + 1)] # keep 2-13 72 | cep_lifter = 22 73 | (nframes, ncoeff) = mfcc.shape 74 | n = np.arange(ncoeff) 75 | lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n/ cep_lifter) 76 | 
mfcc *= lift 77 | return mfcc 78 | 79 | def read_video(file_path): 80 | vr = VideoReader(file_path) 81 | frames = vr.get_batch(range(len(vr))).asnumpy() 82 | return format_frames( 83 | frames, 84 | output_size=(input_size, input_size) 85 | ) 86 | 87 | def format_frames(frame, output_size): 88 | frame = tf.image.convert_image_dtype(frame, tf.uint8) 89 | frame = tf.image.resize(frame, size=list(output_size)) 90 | return frame 91 | 92 | def uniform_temporal_subsample( 93 | x, num_samples, clip_idx, total_clips, frame_rate=1, temporal_dim=-4 94 | ): 95 | t = tf.shape(x)[temporal_dim] 96 | max_offset = t - num_samples * frame_rate 97 | step = max_offset // total_clips 98 | offset = clip_idx * step 99 | indices = tf.linspace( 100 | tf.cast(offset, tf.float32), 101 | tf.cast(offset + (num_samples-1) * frame_rate, tf.float32), 102 | num_samples 103 | ) 104 | indices = tf.clip_by_value(indices, 0, tf.cast(t - 1, tf.float32)) 105 | indices = tf.cast(tf.round(indices), tf.int32) 106 | return tf.gather(x, indices, axis=temporal_dim) 107 | 108 | 109 | def clip_generator( 110 | image, num_frames=32, frame_rate=1, num_clips=1, crop_size=224 111 | ): 112 | clips_list = [] 113 | for i in range(num_clips): 114 | frame = uniform_temporal_subsample( 115 | image, num_frames, i, num_clips, frame_rate=frame_rate, temporal_dim=0 116 | ) 117 | clips_list.append(frame) 118 | 119 | video = tf.stack(clips_list) 120 | video = tf.reshape( 121 | video, [num_clips*num_frames, crop_size, crop_size, 3] 122 | ) 123 | return video 124 | 125 | def video_audio(path, save_path): 126 | n = 1 127 | 128 | for class_name in os.listdir(path): 129 | class_dir = os.path.join(path, class_name) 130 | save_dir = os.path.join(save_path, class_name) 131 | 132 | for video_file in os.listdir(class_dir): 133 | video_path = os.path.join(class_dir, video_file) 134 | 135 | video_name = os.path.basename(video_path).split(".")[0] 136 | mp4_name = str(video_name) + '.mp4' 137 | path_video_save = os.path.join(save_dir, mp4_name) 138 | 139 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 140 | output_video = cv2.VideoWriter(path_video_save, fourcc, 16.0, (224, 224)) 141 | 142 | video_ds = read_video(video_path) 143 | video_ds = clip_generator(video_ds, num_frame, sampling_rate, num_clips=1) 144 | 145 | audio_clip = AudioFileClip(video_path) 146 | audio_name = os.path.basename(video_path).split(".")[0] 147 | wave_name = str(audio_name) + '.wav' 148 | path_audio_save = os.path.join('Data\\MEAD\\MEAD_WAVE', wave_name) 149 | 150 | audio_clip.write_audiofile(path_audio_save) 151 | fs, Audiodata = wavfile.read(path_audio_save) 152 | Audiodata = normalize_audio(Audiodata) 153 | step=int((len(Audiodata))/9) - 1 154 | tx=np.arange(0,len(Audiodata),step) 155 | 156 | # First Face Late Spectrogram 157 | for i in range(8): 158 | video_img = video_ds.numpy()[i] 159 | video_img = video_img.astype('uint8') 160 | plt.axis('off') 161 | 162 | cv2.imwrite('video_img.jpg',video_img) 163 | video_img = Image.open("video_img.jpg") 164 | video_img = video_img.resize((224, 224)) 165 | video_img = np.array(video_img) 166 | 167 | plt.close('all') 168 | output_video.write(video_img) 169 | 170 | for i in range(8): 171 | signal=Audiodata[tx[i]:tx[i+2]] 172 | mfcc=MFCC(signal,fs) 173 | 174 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4)) 175 | cax = ax.matshow( 176 | np.transpose(mfcc), 177 | interpolation="nearest", 178 | aspect="auto", 179 | #cmap=plt.cm.afmhot_r, 180 | origin="lower", 181 | ) 182 | 183 | plt.axis('off') 184 | fig.savefig("MFCC.jpg") 185 | audio_img = 
Image.open("MFCC.jpg") 186 | audio_img = audio_img.resize((224, 224)) 187 | audio_img = np.array(audio_img) 188 | 189 | cv2.imwrite('audio_img.jpg',audio_img) 190 | audio_img = Image.open("audio_img.jpg") 191 | audio_img = audio_img.resize((224, 224)) 192 | audio_img = np.array(audio_img) 193 | 194 | plt.close('all') 195 | output_video.write(audio_img) 196 | 197 | output_video.release() 198 | cv2.destroyAllWindows() 199 | n = n + 1 200 | 201 | return n 202 | 203 | if __name__ == '__main__': 204 | 205 | shutil.rmtree("Data\\MEAD\\MEAD_WAVE") 206 | os.mkdir("Data\\MEAD\\MEAD_WAVE") 207 | 208 | path_train = 'Data\\MEAD\\MEAD\\train' 209 | save_train_path = 'Data\\MEAD\\MEAD_FFLS\\train' 210 | 211 | path_test = 'Data\\MEAD\\MEAD\\test' 212 | save_test_path = 'Data\\MEAD\\MEAD_FFLS\\test' 213 | 214 | path_val = 'Data\\MEAD\\MEAD\\val' 215 | save_val_path = 'Data\\MEAD\\MEAD_FFLS\\val' 216 | 217 | n_train = video_audio(path_train, save_train_path) 218 | n_test = video_audio(path_test, save_test_path) 219 | n_val = video_audio(path_val, save_val_path) 220 | 221 | print(n_train) 222 | print(n_test) 223 | print(n_val) 224 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/Preprocessing_FSLF.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import cv2 3 | import shutil 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | import matplotlib.pyplot as plt 8 | from decord import VideoReader 9 | from moviepy.editor import AudioFileClip 10 | 11 | from scipy.io import wavfile # scipy library to read wav files 12 | import numpy as np 13 | from scipy.fftpack import dct 14 | from matplotlib import pyplot as plt 15 | from PIL import Image 16 | 17 | input_size = 224 18 | num_frame = 8 19 | sampling_rate = 6 20 | 21 | def normalize_audio(audio): 22 | audio = audio / np.max(np.abs(audio)) 23 | return audio 24 | 25 | def MFCC(signal,sample_rate): 26 | pre_emphasis = 0.97 27 | emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1]) 28 | 29 | frame_size = 0.025 30 | frame_stride = 0.0001 31 | 32 | frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples 33 | signal_length = len(emphasized_signal) 34 | frame_length = int(round(frame_length)) 35 | frame_step = int(round(frame_step)) 36 | num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame 37 | 38 | pad_signal_length = num_frames * frame_step + frame_length 39 | z = np.zeros((pad_signal_length - signal_length)) 40 | pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal 41 | 42 | indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T 43 | frames = pad_signal[indices.astype(np.int32, copy=False)] 44 | frames *= np.hamming(frame_length) 45 | NFFT = 512 46 | 47 | mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT 48 | pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum 49 | nfilt = 40 50 | 51 | low_freq_mel = 0 52 | high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel 53 | mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale 54 | hz_points = (700 * 
(10**(mel_points / 2595) - 1)) # Convert Mel to Hz 55 | bin = np.floor((NFFT + 1) * hz_points / sample_rate) 56 | 57 | fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1)))) 58 | for m in range(1, nfilt + 1): 59 | f_m_minus = int(bin[m - 1]) # left 60 | f_m = int(bin[m]) # center 61 | f_m_plus = int(bin[m + 1]) # right 62 | 63 | for k in range(f_m_minus, f_m): 64 | fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1]) 65 | for k in range(f_m, f_m_plus): 66 | fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m]) 67 | filter_banks = np.dot(pow_frames, fbank.T) 68 | filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability 69 | filter_banks = 20 * np.log10(filter_banks) # dB 70 | num_ceps = 13 71 | mfcc = dct(filter_banks, type = 2, axis=1, norm="ortho")[:,1: (num_ceps + 1)] # keep 2-13 72 | cep_lifter = 22 73 | (nframes, ncoeff) = mfcc.shape 74 | n = np.arange(ncoeff) 75 | lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n/ cep_lifter) 76 | mfcc *= lift 77 | return mfcc 78 | 79 | def read_video(file_path): 80 | vr = VideoReader(file_path) 81 | frames = vr.get_batch(range(len(vr))).asnumpy() 82 | return format_frames( 83 | frames, 84 | output_size=(input_size, input_size) 85 | ) 86 | 87 | def format_frames(frame, output_size): 88 | frame = tf.image.convert_image_dtype(frame, tf.uint8) 89 | frame = tf.image.resize(frame, size=list(output_size)) 90 | return frame 91 | 92 | def uniform_temporal_subsample( 93 | x, num_samples, clip_idx, total_clips, frame_rate=1, temporal_dim=-4 94 | ): 95 | t = tf.shape(x)[temporal_dim] 96 | max_offset = t - num_samples * frame_rate 97 | step = max_offset // total_clips 98 | offset = clip_idx * step 99 | indices = tf.linspace( 100 | tf.cast(offset, tf.float32), 101 | tf.cast(offset + (num_samples-1) * frame_rate, tf.float32), 102 | num_samples 103 | ) 104 | indices = tf.clip_by_value(indices, 0, tf.cast(t - 1, tf.float32)) 105 | indices = tf.cast(tf.round(indices), tf.int32) 106 | return tf.gather(x, indices, axis=temporal_dim) 107 | 108 | 109 | def clip_generator( 110 | image, num_frames=32, frame_rate=1, num_clips=1, crop_size=224 111 | ): 112 | clips_list = [] 113 | for i in range(num_clips): 114 | frame = uniform_temporal_subsample( 115 | image, num_frames, i, num_clips, frame_rate=frame_rate, temporal_dim=0 116 | ) 117 | clips_list.append(frame) 118 | 119 | video = tf.stack(clips_list) 120 | video = tf.reshape( 121 | video, [num_clips*num_frames, crop_size, crop_size, 3] 122 | ) 123 | return video 124 | 125 | def video_audio(path, save_path): 126 | n = 1 127 | 128 | for class_name in os.listdir(path): 129 | class_dir = os.path.join(path, class_name) 130 | save_dir = os.path.join(save_path, class_name) 131 | 132 | for video_file in os.listdir(class_dir): 133 | video_path = os.path.join(class_dir, video_file) 134 | 135 | video_name = os.path.basename(video_path).split(".")[0] 136 | mp4_name = str(video_name) + '.mp4' 137 | path_video_save = os.path.join(save_dir, mp4_name) 138 | 139 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 140 | output_video = cv2.VideoWriter(path_video_save, fourcc, 16.0, (224, 224)) 141 | 142 | video_ds = read_video(video_path) 143 | video_ds = clip_generator(video_ds, num_frame, sampling_rate, num_clips=1) 144 | 145 | audio_clip = AudioFileClip(video_path) 146 | audio_name = os.path.basename(video_path).split(".")[0] 147 | wave_name = str(audio_name) + '.wav' 148 | path_audio_save = os.path.join('Data\\MEAD\\MEAD_WAVE', wave_name) 149 | 150 | audio_clip.write_audiofile(path_audio_save) 151 
| fs, Audiodata = wavfile.read(path_audio_save) 152 | Audiodata = normalize_audio(Audiodata) 153 | step=int((len(Audiodata))/9) - 1 154 | tx=np.arange(0,len(Audiodata),step) 155 | 156 | # First Spectrogram Late Face 157 | for i in range(8): 158 | signal=Audiodata[tx[i]:tx[i+2]] 159 | mfcc=MFCC(signal,fs) 160 | 161 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4)) 162 | cax = ax.matshow( 163 | np.transpose(mfcc), 164 | interpolation="nearest", 165 | aspect="auto", 166 | #cmap=plt.cm.afmhot_r, 167 | origin="lower", 168 | ) 169 | 170 | plt.axis('off') 171 | fig.savefig("MFCC.jpg") 172 | audio_img = Image.open("MFCC.jpg") 173 | audio_img = audio_img.resize((224, 224)) 174 | audio_img = np.array(audio_img) 175 | 176 | cv2.imwrite('audio_img.jpg',audio_img) 177 | audio_img = Image.open("audio_img.jpg") 178 | audio_img = audio_img.resize((224, 224)) 179 | audio_img = np.array(audio_img) 180 | 181 | plt.close('all') 182 | output_video.write(audio_img) 183 | 184 | for i in range(8): 185 | video_img = video_ds.numpy()[i] 186 | video_img = video_img.astype('uint8') 187 | plt.axis('off') 188 | 189 | cv2.imwrite('video_img.jpg',video_img) 190 | video_img = Image.open("video_img.jpg") 191 | video_img = video_img.resize((224, 224)) 192 | video_img = np.array(video_img) 193 | 194 | plt.close('all') 195 | output_video.write(video_img) 196 | 197 | output_video.release() 198 | cv2.destroyAllWindows() 199 | n = n + 1 200 | 201 | return n 202 | 203 | if __name__ == '__main__': 204 | 205 | shutil.rmtree("Data\\MEAD\\MEAD_WAVE") 206 | os.mkdir("Data\\MEAD\\MEAD_WAVE") 207 | 208 | path_train = 'Data\\MEAD\\MEAD\\train' 209 | save_train_path = 'Data\\MEAD\\MEAD_FSLF\\train' 210 | 211 | path_test = 'Data\\MEAD\\MEAD\\test' 212 | save_test_path = 'Data\\MEAD\\MEAD_FSLF\\test' 213 | 214 | path_val = 'Data\\MEAD\\MEAD\\val' 215 | save_val_path = 'Data\\MEAD\\MEAD_FSLF\\val' 216 | 217 | n_train = video_audio(path_train, save_train_path) 218 | n_test = video_audio(path_test, save_test_path) 219 | n_val = video_audio(path_val, save_val_path) 220 | 221 | print(n_train) 222 | print(n_test) 223 | print(n_val) 224 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/Preprocessing_OFOS.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import cv2 3 | import shutil 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | import matplotlib.pyplot as plt 8 | from decord import VideoReader 9 | from moviepy.editor import AudioFileClip 10 | 11 | from scipy.io import wavfile # scipy library to read wav files 12 | import numpy as np 13 | from scipy.fftpack import dct 14 | from matplotlib import pyplot as plt 15 | from PIL import Image 16 | 17 | input_size = 224 18 | num_frame = 8 19 | sampling_rate = 6 20 | 21 | def normalize_audio(audio): 22 | audio = audio / np.max(np.abs(audio)) 23 | return audio 24 | 25 | def MFCC(signal,sample_rate): 26 | pre_emphasis = 0.97 27 | emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1]) 28 | 29 | frame_size = 0.025 30 | frame_stride = 0.0001 31 | 32 | frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples 33 | signal_length = len(emphasized_signal) 34 | frame_length = int(round(frame_length)) 35 | frame_step = int(round(frame_step)) 36 | num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame 
37 | 38 | pad_signal_length = num_frames * frame_step + frame_length 39 | z = np.zeros((pad_signal_length - signal_length)) 40 | pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal 41 | 42 | indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T 43 | frames = pad_signal[indices.astype(np.int32, copy=False)] 44 | frames *= np.hamming(frame_length) 45 | NFFT = 512 46 | 47 | mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT 48 | pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum 49 | nfilt = 40 50 | 51 | low_freq_mel = 0 52 | high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel 53 | mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale 54 | hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz 55 | bin = np.floor((NFFT + 1) * hz_points / sample_rate) 56 | 57 | fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1)))) 58 | for m in range(1, nfilt + 1): 59 | f_m_minus = int(bin[m - 1]) # left 60 | f_m = int(bin[m]) # center 61 | f_m_plus = int(bin[m + 1]) # right 62 | 63 | for k in range(f_m_minus, f_m): 64 | fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1]) 65 | for k in range(f_m, f_m_plus): 66 | fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m]) 67 | filter_banks = np.dot(pow_frames, fbank.T) 68 | filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability 69 | filter_banks = 20 * np.log10(filter_banks) # dB 70 | num_ceps = 13 71 | mfcc = dct(filter_banks, type = 2, axis=1, norm="ortho")[:,1: (num_ceps + 1)] # keep 2-13 72 | cep_lifter = 22 73 | (nframes, ncoeff) = mfcc.shape 74 | n = np.arange(ncoeff) 75 | lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n/ cep_lifter) 76 | mfcc *= lift 77 | return mfcc 78 | 79 | def read_video(file_path): 80 | vr = VideoReader(file_path) 81 | frames = vr.get_batch(range(len(vr))).asnumpy() 82 | return format_frames( 83 | frames, 84 | output_size=(input_size, input_size) 85 | ) 86 | 87 | def format_frames(frame, output_size): 88 | frame = tf.image.convert_image_dtype(frame, tf.uint8) 89 | frame = tf.image.resize(frame, size=list(output_size)) 90 | return frame 91 | 92 | def uniform_temporal_subsample( 93 | x, num_samples, clip_idx, total_clips, frame_rate=1, temporal_dim=-4 94 | ): 95 | t = tf.shape(x)[temporal_dim] 96 | max_offset = t - num_samples * frame_rate 97 | step = max_offset // total_clips 98 | offset = clip_idx * step 99 | indices = tf.linspace( 100 | tf.cast(offset, tf.float32), 101 | tf.cast(offset + (num_samples-1) * frame_rate, tf.float32), 102 | num_samples 103 | ) 104 | indices = tf.clip_by_value(indices, 0, tf.cast(t - 1, tf.float32)) 105 | indices = tf.cast(tf.round(indices), tf.int32) 106 | return tf.gather(x, indices, axis=temporal_dim) 107 | 108 | 109 | def clip_generator( 110 | image, num_frames=32, frame_rate=1, num_clips=1, crop_size=224 111 | ): 112 | clips_list = [] 113 | for i in range(num_clips): 114 | frame = uniform_temporal_subsample( 115 | image, num_frames, i, num_clips, frame_rate=frame_rate, temporal_dim=0 116 | ) 117 | clips_list.append(frame) 118 | 119 | video = tf.stack(clips_list) 120 | video = tf.reshape( 121 | video, [num_clips*num_frames, crop_size, crop_size, 3] 122 | ) 123 | return video 124 | 125 | def video_audio(path, save_path): 
126 | n = 1 127 | 128 | for class_name in os.listdir(path): 129 | class_dir = os.path.join(path, class_name) 130 | save_dir = os.path.join(save_path, class_name) 131 | 132 | for video_file in os.listdir(class_dir): 133 | video_path = os.path.join(class_dir, video_file) 134 | 135 | video_name = os.path.basename(video_path).split(".")[0] 136 | mp4_name = str(video_name) + '.mp4' 137 | path_video_save = os.path.join(save_dir, mp4_name) 138 | 139 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 140 | output_video = cv2.VideoWriter(path_video_save, fourcc, 16.0, (224, 224)) 141 | 142 | video_ds = read_video(video_path) 143 | video_ds = clip_generator(video_ds, num_frame, sampling_rate, num_clips=1) 144 | 145 | audio_clip = AudioFileClip(video_path) 146 | audio_name = os.path.basename(video_path).split(".")[0] 147 | wave_name = str(audio_name) + '.wav' 148 | path_audio_save = os.path.join('Data\\MEAD\\MEAD_WAVE', wave_name) 149 | 150 | audio_clip.write_audiofile(path_audio_save) 151 | fs, Audiodata = wavfile.read(path_audio_save) 152 | Audiodata = normalize_audio(Audiodata) 153 | step=int((len(Audiodata))/9) - 1 154 | tx=np.arange(0,len(Audiodata),step) 155 | 156 | # One Face One Spectrogram 157 | for i in range(8): 158 | video_img = video_ds.numpy()[i] 159 | video_img = video_img.astype('uint8') 160 | plt.axis('off') 161 | 162 | cv2.imwrite('video_img.jpg',video_img) 163 | video_img = Image.open("video_img.jpg") 164 | video_img = video_img.resize((224, 224)) 165 | video_img = np.array(video_img) 166 | 167 | plt.close('all') 168 | output_video.write(video_img) 169 | 170 | signal=Audiodata[tx[i]:tx[i+2]] 171 | mfcc=MFCC(signal,fs) 172 | 173 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4)) 174 | cax = ax.matshow( 175 | np.transpose(mfcc), 176 | interpolation="nearest", 177 | aspect="auto", 178 | #cmap=plt.cm.afmhot_r, 179 | origin="lower", 180 | ) 181 | 182 | plt.axis('off') 183 | fig.savefig("MFCC.jpg") 184 | audio_img = Image.open("MFCC.jpg") 185 | audio_img = audio_img.resize((224, 224)) 186 | audio_img = np.array(audio_img) 187 | 188 | cv2.imwrite('audio_img.jpg',audio_img) 189 | audio_img = Image.open("audio_img.jpg") 190 | audio_img = audio_img.resize((224, 224)) 191 | audio_img = np.array(audio_img) 192 | 193 | plt.close('all') 194 | output_video.write(audio_img) 195 | 196 | output_video.release() 197 | cv2.destroyAllWindows() 198 | n = n + 1 199 | 200 | return n 201 | 202 | if __name__ == '__main__': 203 | 204 | shutil.rmtree("Data\\MEAD\\MEAD_WAVE") 205 | os.mkdir("Data\\MEAD\\MEAD_WAVE") 206 | 207 | path_train = 'Data\\MEAD\\MEAD\\train' 208 | save_train_path = 'Data\\MEAD\\MEAD_OFOS\\train' 209 | 210 | path_test = 'Data\\MEAD\\MEAD\\test' 211 | save_test_path = 'Data\\MEAD\\MEAD_OFOS\\test' 212 | 213 | path_val = 'Data\\MEAD\\MEAD\\val' 214 | save_val_path = 'Data\\MEAD\\MEAD_OFOS\\val' 215 | 216 | n_train = video_audio(path_train, save_train_path) 217 | n_test = video_audio(path_test, save_test_path) 218 | n_val = video_audio(path_val, save_val_path) 219 | 220 | print(n_train) 221 | print(n_test) 222 | print(n_val) 223 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/Preprocessing_RFAS.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import cv2 3 | import random 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | import matplotlib.pyplot as plt 8 | from decord import VideoReader 9 | from moviepy.editor import AudioFileClip 10 
| 11 | from scipy.io import wavfile # scipy library to read wav files 12 | import numpy as np 13 | from scipy.fftpack import dct 14 | from matplotlib import pyplot as plt 15 | from PIL import Image 16 | 17 | input_size = 224 18 | num_frame = 16 19 | sampling_rate = 1 20 | 21 | def read_video(file_path): 22 | vr = VideoReader(file_path) 23 | frames = vr.get_batch(range(len(vr))).asnumpy() 24 | return format_frames( 25 | frames, 26 | output_size=(input_size, input_size) 27 | ) 28 | 29 | def format_frames(frame, output_size): 30 | frame = tf.image.convert_image_dtype(frame, tf.uint8) 31 | frame = tf.image.resize(frame, size=list(output_size)) 32 | return frame 33 | 34 | def uniform_temporal_subsample( 35 | x, num_samples, clip_idx, total_clips, frame_rate=1, temporal_dim=-4 36 | ): 37 | t = tf.shape(x)[temporal_dim] 38 | max_offset = t - num_samples * frame_rate 39 | step = max_offset // total_clips 40 | offset = clip_idx * step 41 | indices = tf.linspace( 42 | tf.cast(offset, tf.float32), 43 | tf.cast(offset + (num_samples-1) * frame_rate, tf.float32), 44 | num_samples 45 | ) 46 | indices = tf.clip_by_value(indices, 0, tf.cast(t - 1, tf.float32)) 47 | indices = tf.cast(tf.round(indices), tf.int32) 48 | return tf.gather(x, indices, axis=temporal_dim) 49 | 50 | 51 | def clip_generator( 52 | image, num_frames=32, frame_rate=1, num_clips=1, crop_size=224 53 | ): 54 | clips_list = [] 55 | for i in range(num_clips): 56 | frame = uniform_temporal_subsample( 57 | image, num_frames, i, num_clips, frame_rate=frame_rate, temporal_dim=0 58 | ) 59 | clips_list.append(frame) 60 | 61 | video = tf.stack(clips_list) 62 | video = tf.reshape( 63 | video, [num_clips*num_frames, crop_size, crop_size, 3] 64 | ) 65 | return video 66 | 67 | def video_audio(path, save_path): 68 | n = 1 69 | 70 | for class_name in os.listdir(path): 71 | class_dir = os.path.join(path, class_name) 72 | save_dir = os.path.join(save_path, class_name) 73 | 74 | for video_file in os.listdir(class_dir): 75 | video_path = os.path.join(class_dir, video_file) 76 | 77 | video_name = os.path.basename(video_path).split(".")[0] 78 | mp4_name = str(video_name) + '.mp4' 79 | path_video_save = os.path.join(save_dir, mp4_name) 80 | 81 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 82 | output_video = cv2.VideoWriter(path_video_save, fourcc, 16.0, (224, 224)) 83 | 84 | video_ds = read_video(video_path) 85 | video_ds = clip_generator(video_ds, num_frame, sampling_rate, num_clips=1) 86 | 87 | L1 = random.sample(range(0, 16), 16) 88 | # Random Face and Spectrogram 89 | for i in L1: 90 | video_img = video_ds.numpy()[i] 91 | video_img = video_img.astype('uint8') 92 | plt.axis('off') 93 | 94 | cv2.imwrite('video_img.jpg',video_img) 95 | video_img = Image.open("video_img.jpg") 96 | video_img = video_img.resize((224, 224)) 97 | video_img = np.array(video_img) 98 | 99 | plt.close('all') 100 | output_video.write(video_img) 101 | 102 | output_video.release() 103 | cv2.destroyAllWindows() 104 | n = n + 1 105 | 106 | return n 107 | 108 | if __name__ == '__main__': 109 | 110 | path_train = 'Data\\MEAD\\MEAD_OFOS\\train' 111 | save_train_path = 'Data\\MEAD\\MEAD_RFAS\\train' 112 | 113 | path_test = 'Data\\MEAD\\MEAD_OFOS\\test' 114 | save_test_path = 'Data\\MEAD\\MEAD_RFAS\\test' 115 | 116 | path_val = 'Data\\MEAD\\MEAD_OFOS\\val' 117 | save_val_path = 'Data\\MEAD\\MEAD_RFAS\\val' 118 | 119 | n_train = video_audio(path_train, save_train_path) 120 | n_test = video_audio(path_test, save_test_path) 121 | n_val = video_audio(path_val, save_val_path) 122 | 123 | 
print(n_train) 124 | print(n_test) 125 | print(n_val) 126 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/Preprocessing_SFAS.py: -------------------------------------------------------------------------------- 1 | import os, warnings 2 | import cv2 3 | import shutil 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | import matplotlib.pyplot as plt 8 | from decord import VideoReader 9 | from moviepy.editor import AudioFileClip 10 | 11 | from scipy.io import wavfile # scipy library to read wav files 12 | import numpy as np 13 | from scipy.fftpack import dct 14 | from matplotlib import pyplot as plt 15 | from PIL import Image 16 | 17 | input_size = 224 18 | num_frame = 16 19 | sampling_rate = 3 20 | 21 | def normalize_audio(audio): 22 | audio = audio / np.max(np.abs(audio)) 23 | return audio 24 | 25 | def MFCC(signal,sample_rate): 26 | pre_emphasis = 0.97 27 | emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1]) 28 | 29 | frame_size = 0.025 30 | frame_stride = 0.0001 31 | 32 | frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples 33 | signal_length = len(emphasized_signal) 34 | frame_length = int(round(frame_length)) 35 | frame_step = int(round(frame_step)) 36 | num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame 37 | 38 | pad_signal_length = num_frames * frame_step + frame_length 39 | z = np.zeros((pad_signal_length - signal_length)) 40 | pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal 41 | 42 | indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T 43 | frames = pad_signal[indices.astype(np.int32, copy=False)] 44 | frames *= np.hamming(frame_length) 45 | NFFT = 512 46 | 47 | mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT 48 | pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum 49 | nfilt = 40 50 | 51 | low_freq_mel = 0 52 | high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel 53 | mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale 54 | hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz 55 | bin = np.floor((NFFT + 1) * hz_points / sample_rate) 56 | 57 | fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1)))) 58 | for m in range(1, nfilt + 1): 59 | f_m_minus = int(bin[m - 1]) # left 60 | f_m = int(bin[m]) # center 61 | f_m_plus = int(bin[m + 1]) # right 62 | 63 | for k in range(f_m_minus, f_m): 64 | fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1]) 65 | for k in range(f_m, f_m_plus): 66 | fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m]) 67 | filter_banks = np.dot(pow_frames, fbank.T) 68 | filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability 69 | filter_banks = 20 * np.log10(filter_banks) # dB 70 | num_ceps = 13 71 | mfcc = dct(filter_banks, type = 2, axis=1, norm="ortho")[:,1: (num_ceps + 1)] # keep 2-13 72 | cep_lifter = 22 73 | (nframes, ncoeff) = mfcc.shape 74 | n = np.arange(ncoeff) 75 | lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n/ cep_lifter) 76 | mfcc *= lift 77 | return mfcc 78 | 79 | def read_video(file_path): 80 | vr = 
VideoReader(file_path) 81 | frames = vr.get_batch(range(len(vr))).asnumpy() 82 | return format_frames( 83 | frames, 84 | output_size=(input_size, input_size) 85 | ) 86 | 87 | def format_frames(frame, output_size): 88 | frame = tf.image.convert_image_dtype(frame, tf.uint8) 89 | frame = tf.image.resize(frame, size=list(output_size)) 90 | return frame 91 | 92 | def uniform_temporal_subsample( 93 | x, num_samples, clip_idx, total_clips, frame_rate=1, temporal_dim=-4 94 | ): 95 | t = tf.shape(x)[temporal_dim] 96 | max_offset = t - num_samples * frame_rate 97 | step = max_offset // total_clips 98 | offset = clip_idx * step 99 | indices = tf.linspace( 100 | tf.cast(offset, tf.float32), 101 | tf.cast(offset + (num_samples-1) * frame_rate, tf.float32), 102 | num_samples 103 | ) 104 | indices = tf.clip_by_value(indices, 0, tf.cast(t - 1, tf.float32)) 105 | indices = tf.cast(tf.round(indices), tf.int32) 106 | return tf.gather(x, indices, axis=temporal_dim) 107 | 108 | 109 | def clip_generator( 110 | image, num_frames=32, frame_rate=1, num_clips=1, crop_size=224 111 | ): 112 | clips_list = [] 113 | for i in range(num_clips): 114 | frame = uniform_temporal_subsample( 115 | image, num_frames, i, num_clips, frame_rate=frame_rate, temporal_dim=0 116 | ) 117 | clips_list.append(frame) 118 | 119 | video = tf.stack(clips_list) 120 | video = tf.reshape( 121 | video, [num_clips*num_frames, crop_size, crop_size, 3] 122 | ) 123 | return video 124 | 125 | def video_audio(path, save_path): 126 | n = 1 127 | 128 | for class_name in os.listdir(path): 129 | class_dir = os.path.join(path, class_name) 130 | save_dir = os.path.join(save_path, class_name) 131 | 132 | for video_file in os.listdir(class_dir): 133 | video_path = os.path.join(class_dir, video_file) 134 | 135 | video_name = os.path.basename(video_path).split(".")[0] 136 | mp4_name = str(video_name) + '.mp4' 137 | path_video_save = os.path.join(save_dir, mp4_name) 138 | 139 | fourcc = cv2.VideoWriter_fourcc(*'mp4v') 140 | output_video = cv2.VideoWriter(path_video_save, fourcc, 16.0, (224, 224)) 141 | 142 | video_ds = read_video(video_path) 143 | video_ds = clip_generator(video_ds, num_frame, sampling_rate, num_clips=1) 144 | 145 | audio_clip = AudioFileClip(video_path) 146 | audio_name = os.path.basename(video_path).split(".")[0] 147 | wave_name = str(audio_name) + '.wav' 148 | path_audio_save = os.path.join('Data\\MEAD\\MEAD_WAVE', wave_name) 149 | 150 | audio_clip.write_audiofile(path_audio_save) 151 | fs, Audiodata = wavfile.read(path_audio_save) 152 | Audiodata = normalize_audio(Audiodata) 153 | step=int((len(Audiodata))/17) - 1 154 | tx=np.arange(0,len(Audiodata),step) 155 | 156 | # Sum of Face and Spectrogram 157 | for i in range(16): 158 | video_img = video_ds.numpy()[i] 159 | video_img = video_img.astype('uint8') 160 | plt.axis('off') 161 | 162 | cv2.imwrite('video_img.jpg',video_img) 163 | video_img = Image.open("video_img.jpg") 164 | video_img = video_img.resize((224, 224)) 165 | video_img = np.array(video_img) 166 | 167 | signal=Audiodata[tx[i]:tx[i+2]] 168 | mfcc=MFCC(signal,fs) 169 | 170 | fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4)) 171 | cax = ax.matshow( 172 | np.transpose(mfcc), 173 | interpolation="nearest", 174 | aspect="auto", 175 | # cmap=plt.cm.afmhot_r, 176 | origin="lower", 177 | ) 178 | 179 | plt.axis('off') 180 | fig.savefig("MFCC.jpg") 181 | audio_img = Image.open("MFCC.jpg") 182 | audio_img = audio_img.resize((224, 224)) 183 | audio_img = np.array(audio_img) 184 | 185 | cv2.imwrite('audio_img.jpg',audio_img) 186 | 
audio_img = Image.open("audio_img.jpg") 187 | audio_img = audio_img.resize((224, 224)) 188 | audio_img = np.array(audio_img) 189 | 190 | img = video_img + audio_img 191 | 192 | plt.close('all') 193 | output_video.write(img) 194 | 195 | output_video.release() 196 | cv2.destroyAllWindows() 197 | n = n + 1 198 | 199 | return n 200 | 201 | if __name__ == '__main__': 202 | 203 | shutil.rmtree("Data\\MEAD\\MEAD_WAVE") 204 | os.mkdir("Data\\MEAD\\MEAD_WAVE") 205 | 206 | path_train = 'Data\\MEAD\\MEAD\\train' 207 | save_train_path = 'Data\\MEAD\\MEAD_SFAS\\train' 208 | 209 | path_test = 'Data\\MEAD\\MEAD\\test' 210 | save_test_path = 'Data\\MEAD\\MEAD_SFAS\\test' 211 | 212 | path_val = 'Data\\MEAD\\MEAD\\val' 213 | save_val_path = 'Data\\MEAD\\MEAD_SFAS\\val' 214 | 215 | n_train = video_audio(path_train, save_train_path) 216 | n_test = video_audio(path_test, save_test_path) 217 | n_val = video_audio(path_val, save_val_path) 218 | 219 | print(n_train) 220 | print(n_test) 221 | print(n_val) 222 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/Tool.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def rename(read_path, save_path): 4 | n = 0 5 | 6 | for people_name in os.listdir(read_path): 7 | people_dir = os.path.join(read_path, people_name) 8 | 9 | for class_name in os.listdir(people_dir): 10 | class_dir = os.path.join(people_dir, class_name) 11 | save_dir = os.path.join(save_path, class_name) 12 | 13 | for level_name in os.listdir(class_dir): 14 | level_dir = os.path.join(class_dir, level_name) 15 | 16 | for video_file in os.listdir(level_dir): 17 | video_path = os.path.join(level_dir, video_file) 18 | video_name = os.path.basename(video_path).split(".")[0] 19 | 20 | rename = str(people_name) + '_' + str(class_name) + '_' + str(level_name) + '_' + str(video_name) + '.mp4' 21 | rename_path = os.path.join(save_dir, rename) 22 | 23 | if os.path.exists(video_path): 24 | os.rename(video_path, rename_path) 25 | n = n + 1 26 | 27 | return n 28 | 29 | if __name__ == '__main__': 30 | 31 | read_path = 'Data\\MEAD_RAW' 32 | save_path = 'Data\\MEAD' 33 | 34 | n = rename(read_path, save_path) 35 | print(n) 36 | -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/audio_img.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/MultiMAE-DER_Preprocessing Code/audio_img.jpg -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/img.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/MultiMAE-DER_Preprocessing Code/img.jpg -------------------------------------------------------------------------------- /MultiMAE-DER_Preprocessing Code/video_img.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/MultiMAE-DER_Preprocessing Code/video_img.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MultiMAE-DER: Multimodal Masked Autoencoder for Dynamic Emotion 
Recognition (IEEE ICPRS 2024) 2 | 3 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multimae-der-multimodal-masked-autoencoder/emotion-recognition-on-ravdess)](https://paperswithcode.com/sota/emotion-recognition-on-ravdess?p=multimae-der-multimodal-masked-autoencoder)
4 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multimae-der-multimodal-masked-autoencoder/video-emotion-recognition-on-crema-d)](https://paperswithcode.com/sota/video-emotion-recognition-on-crema-d?p=multimae-der-multimodal-masked-autoencoder)
5 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multimae-der-multimodal-masked-autoencoder/multimodal-emotion-recognition-on-iemocap-4)](https://paperswithcode.com/sota/multimodal-emotion-recognition-on-iemocap-4?p=multimae-der-multimodal-masked-autoencoder)
6 | 7 | > [![HCPS link](https://img.shields.io/badge/FIU-HCPS-red)](https://hcps.fiu.edu/) [![Arxiv link](https://img.shields.io/static/v1?label=arXiv&message=MultiMAE-DER&color=red&logo=arxiv)](https://arxiv.org/abs/2404.18327) [![Citation link](https://img.shields.io/badge/Citation-BibTeX-red)](#citation) [![ICPRS link](https://img.shields.io/badge/IEEE_ICPRS--24-MultiMAE--DER-red)](https://ieeexplore.ieee.org/document/10677820)
8 | > [Peihao Xiang](https://scholar.google.com/citations?user=k--3fM4AAAAJ&hl=zh-CN&oi=ao), [Chaohao Lin](https://scholar.google.com/citations?hl=zh-CN&user=V3l7dAEAAAAJ), [Kaida Wu](https://ieeexplore.ieee.org/author/167739911238744), and [Ou Bai](https://scholar.google.com/citations?hl=zh-CN&user=S0j4DOoAAAAJ)
9 | > HCPS Laboratory, Department of Electrical and Computer Engineering, Florida International University
10 | 11 | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Peihao-Xiang/MultiMAE-DER/blob/main/MultiMAE-DER_Fine-Tuning%20Code/MultiMAE_DER_FSLF.ipynb) 12 | [![Hugging Face Datasets](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Datasets-blue)](https://huggingface.co/datasets/NoahMartinezXiang/RAVDESS) 13 | 14 | Official TensorFlow implementation and pre-trained VideoMAE models for MultiMAE-DER: Multimodal Masked Autoencoder for Dynamic Emotion Recognition. 15 | 16 | Note: The .ipynb is only a simple example. In addition, the VideoMAE encoder should be pre-trained with the MAE-DFER method; this repository does not provide the pre-trained encoder. 17 | 18 | ## Overview 19 | 20 | This paper presents a novel approach to processing multimodal data for dynamic emotion recognition, named the Multimodal Masked Autoencoder for Dynamic Emotion Recognition (MultiMAE-DER). MultiMAE-DER leverages the closely correlated representation information within spatiotemporal sequences across the visual and audio modalities. By utilizing a pre-trained masked autoencoder model, MultiMAE-DER is obtained through simple, straightforward fine-tuning. Its performance is enhanced by optimizing six fusion strategies for the multimodal input sequences. These strategies address dynamic feature correlations within cross-domain data across spatial, temporal, and spatiotemporal sequences. Compared with state-of-the-art multimodal supervised learning models for dynamic emotion recognition, MultiMAE-DER improves the weighted average recall (WAR) by 4.41% on the RAVDESS dataset and by 2.06% on the CREMA-D dataset. Furthermore, compared with the state-of-the-art multimodal self-supervised learning model, MultiMAE-DER achieves a 1.86% higher WAR on the IEMOCAP dataset. 21 | 22 |
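As a concrete illustration of one of these strategies, the snippet below sketches the sum-of-face-and-spectrogram (SFAS) fusion implemented in `Preprocessing_SFAS.py` in this repository: a face frame and its MFCC spectrogram image are resized to the 224x224 input resolution and summed pixel-wise into a single fused frame. The file names and the standalone helper are illustrative placeholders; see the preprocessing scripts for the full pipeline.

```python
import numpy as np
from PIL import Image

def fuse_face_and_spectrogram(face_path, spectrogram_path, size=(224, 224)):
    """SFAS-style fusion: pixel-wise sum of a face frame and its MFCC spectrogram image."""
    face = np.array(Image.open(face_path).convert("RGB").resize(size), dtype=np.uint8)
    spec = np.array(Image.open(spectrogram_path).convert("RGB").resize(size), dtype=np.uint8)
    # uint8 addition wraps modulo 256, matching `img = video_img + audio_img`
    # in Preprocessing_SFAS.py.
    return face + spec

# 'video_img.jpg' and 'MFCC.jpg' are the per-frame intermediates written by the
# preprocessing script; any face/spectrogram image pair works here.
fused_frame = fuse_face_and_spectrogram("video_img.jpg", "MFCC.jpg")
print(fused_frame.shape)  # (224, 224, 3)
```

Sixteen fused frames of this kind are stacked into one clip, which forms the multimodal input sequence for the encoder.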

23 |
![MultiMAE-DER](images/MultiMAE-DER.png) 24 | 25 | Illustration of our MultiMAE-DER. 26 |

27 | 28 | General Multimodal Model vs. MultiMAE-DER. The uniqueness of our approach lies in the capability to extract features from cross-domain data using only a single encoder, eliminating the need for targeted feature extraction for different modalities. 29 | 30 |
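To make this contrast concrete, the sketch below compares a conventional two-encoder pipeline with the single-encoder path used here. The callables `visual_encoder`, `audio_encoder`, `shared_encoder`, and `head` are placeholders for illustration, not models shipped with this repository.

```python
import tensorflow as tf

# Conventional multimodal pipeline: one backbone per modality, features fused late.
def two_encoder_pipeline(frames, spectrograms, visual_encoder, audio_encoder, head):
    visual_feats = visual_encoder(frames)        # modality-specific visual features
    audio_feats = audio_encoder(spectrograms)    # modality-specific audio features
    return head(tf.concat([visual_feats, audio_feats], axis=-1))

# MultiMAE-DER-style pipeline: the fused face-plus-spectrogram clip passes through
# a single shared encoder, so no modality-specific feature extractor is needed.
def single_encoder_pipeline(fused_clip, shared_encoder, head):
    # fused_clip: (batch, num_frames, 224, 224, 3)
    return head(shared_encoder(fused_clip))
```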

31 |
![Multimodal_Sequence_Fusion_Strategy](images/Multimodal_Sequence_Fusion_Strategy.png) 32 | 33 | Multimodal Sequence Fusion Strategies. 34 |

35 | 36 | ## Implementation details 37 | 38 |

39 |
![MultiMAE-DER_Program_Flowchart](images/MultiMAE-DER_Program_Flowchart.png) 40 | The architecture of MultiMAE-DER. 41 |

42 | 43 | ## Main Results 44 | 45 | ### RAVDESS 46 | 47 | ![Result_on_RAVDESS](images/Result_on_RAVDESS.png) 48 | 49 | ### CREMA-D 50 | 51 | ![Result_on_CREMA-D](images/Result_on_CREMA-D.png) 52 | 53 | ### IEMOCAP 54 | 55 | ![Result_on_IEMOCAP](images/Result_on_IEMOCAP.png) 56 | 57 | ## Contact 58 | 59 | If you have any questions, please feel free to reach out to me at pxian001@fiu.edu. 60 | 61 | ## Acknowledgments 62 | This project is built upon [VideoMAE](https://github.com/innat/VideoMAE) and [MAE-DFER](https://github.com/sunlicai/MAE-DFER). Thanks for their great codebases. 63 | 64 | ## License 65 | 66 | This project is under the Apache License 2.0. See [LICENSE](LICENSE) for details. 67 | 68 | ## Citation 69 | 70 | If you find this repository helpful, please consider citing our work: 71 | 72 | ```BibTeX 73 | @misc{xiang2024multimaeder, 74 | title={MultiMAE-DER: Multimodal Masked Autoencoder for Dynamic Emotion Recognition}, 75 | author={Peihao Xiang and Chaohao Lin and Kaida Wu and Ou Bai}, 76 | year={2024}, 77 | eprint={2404.18327}, 78 | archivePrefix={arXiv}, 79 | primaryClass={cs.CV} 80 | } 81 | 82 | @INPROCEEDINGS{10677820, 83 | author={Xiang, Peihao and Lin, Chaohao and Wu, Kaida and Bai, Ou}, 84 | booktitle={2024 14th International Conference on Pattern Recognition Systems (ICPRS)}, 85 | title={MultiMAE-DER: Multimodal Masked Autoencoder for Dynamic Emotion Recognition}, 86 | year={2024}, 87 | volume={}, 88 | number={}, 89 | pages={1-7}, 90 | keywords={Emotion recognition;Visualization;Correlation;Supervised learning;Semantics;Self-supervised learning;Transformers;Dynamic Emotion Recognition;Multimodal Model;Self-Supervised Learning;Video Masked Autoencoder;Vision Transformer}, 91 | doi={10.1109/ICPRS62101.2024.10677820}} 92 | ``` 93 | -------------------------------------------------------------------------------- /images/MultiMAE-DER.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/images/MultiMAE-DER.png -------------------------------------------------------------------------------- /images/MultiMAE-DER_Program_Flowchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/images/MultiMAE-DER_Program_Flowchart.png -------------------------------------------------------------------------------- /images/Multimodal_Sequence_Fusion_Strategy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/images/Multimodal_Sequence_Fusion_Strategy.png -------------------------------------------------------------------------------- /images/Result_on_CREMA-D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/images/Result_on_CREMA-D.png -------------------------------------------------------------------------------- /images/Result_on_IEMOCAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/images/Result_on_IEMOCAP.png -------------------------------------------------------------------------------- /images/Result_on_RAVDESS.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Peihao-Xiang/MultiMAE-DER/88d3f671f4e5d1e26d4bd04848179320ec674ec2/images/Result_on_RAVDESS.png --------------------------------------------------------------------------------