├── .gitignore
├── LICENSE
├── README.md
├── assets
    ├── example_results.csv
    ├── logo-sheet-only.png
    └── mos-bench-table.png
├── egs
    ├── BENCHMARKS
    │   ├── READMD.md
    │   ├── get_all_bencmarks.sh
    │   ├── get_all_bencmarks_installation_free.sh
    │   ├── run_all_bencmarks.sh
    │   ├── run_all_bencmarks_np.sh
    │   ├── run_bc19_test.sh
    │   ├── run_bvcc_test.sh
    │   ├── run_nisqa_test.sh
    │   ├── run_singmos_test.sh
    │   ├── run_somos_test.sh
    │   ├── run_tmhint_qi_test.sh
    │   ├── run_vmc23_test.sh
    │   └── utils
    ├── README.md
    ├── TEMPLATE
    │   ├── cmd.sh
    │   ├── conf
    │   │   ├── ldnet-ml.yaml
    │   │   └── ssl-mos-wav2vec2.yaml
    │   ├── local
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    ├── bc19
    │   └── local
    │   │   ├── data_download.sh
    │   │   └── data_prep.py
    ├── bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi
    │   ├── cmd.sh
    │   ├── conf
    │   │   ├── alignnet-wav2vec2-mdf.yaml
    │   │   ├── alignnet-wav2vec2.yaml
    │   │   ├── ssl-mos-wav2vec2-mdf.yaml
    │   │   └── ssl-mos-wav2vec2.yaml
    │   ├── local
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    ├── bvcc
    │   ├── README.md
    │   ├── cmd.sh
    │   ├── conf
    │   │   ├── ldnet-ml.yaml
    │   │   ├── ssl-mos-wav2vec2-categorical.yaml
    │   │   ├── ssl-mos-wav2vec2.yaml
    │   │   ├── stacking_ridge.yaml
    │   │   └── utmos-strong.yaml
    │   ├── local
    │   │   ├── data_download.sh
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    ├── nisqa
    │   ├── README.md
    │   ├── cmd.sh
    │   ├── conf
    │   │   ├── alignnet-wav2vec2.yaml
    │   │   └── ssl-mos-wav2vec2.yaml
    │   ├── local
    │   │   ├── data_download.sh
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    ├── pstn
    │   ├── README.md
    │   ├── cmd.sh
    │   ├── conf
    │   │   ├── ldnet-ml.yaml
    │   │   └── ssl-mos-wav2vec2.yaml
    │   ├── local
    │   │   ├── data_download.sh
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    ├── singmos
    │   ├── README.md
    │   ├── cmd.sh
    │   ├── conf
    │   │   ├── ldnet-ml.yaml
    │   │   └── ssl-mos-wav2vec2.yaml
    │   ├── local
    │   │   ├── data_download.sh
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    ├── somos
    │   ├── README.md
    │   ├── cmd.sh
    │   ├── conf
    │   │   └── ssl-mos-wav2vec2.yaml
    │   ├── local
    │   │   ├── data_download.sh
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    ├── tencent
    │   ├── README.md
    │   ├── cmd.sh
    │   ├── conf
    │   │   ├── ldnet-ml.yaml
    │   │   └── ssl-mos-wav2vec2.yaml
    │   ├── local
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    ├── tmhint-qi
    │   ├── README.md
    │   ├── cmd.sh
    │   ├── conf
    │   │   ├── ldnet-ml.yaml
    │   │   └── ssl-mos-wav2vec2.yaml
    │   ├── local
    │   │   ├── data_download.sh
    │   │   └── data_prep.py
    │   ├── path.sh
    │   ├── run.sh
    │   └── utils
    └── vmc23
    │   ├── answers
    │       ├── track1_answer.txt
    │       ├── track2_answer.txt
    │       └── track3_answer.txt
    │   └── local
    │       ├── data_download.sh
    │       └── data_prep.py
├── hubconf.py
├── pyproject.toml
├── setup.cfg
├── sheet
    ├── __init__.py
    ├── bin
    │   ├── construct_datastore.py
    │   ├── inference.py
    │   ├── nonparametric_inference.py
    │   ├── train.py
    │   └── train_stack.py
    ├── collaters
    │   ├── __init__.py
    │   └── non_intrusive.py
    ├── datasets
    │   ├── __init__.py
    │   └── non_intrusive.py
    ├── evaluation
    │   ├── metrics.py
    │   └── plot.py
    ├── losses
    │   ├── __init__.py
    │   ├── basic_losses.py
    │   └── contrastive_loss.py
    ├── models
    │   ├── __init__.py
    │   ├── alignnet.py
    │   ├── ldnet.py
    │   ├── sslmos.py
    │   └── utmos.py
    ├── modules
    │   ├── ldnet
    │   │   ├── mobilenetv2.py
    │   │   ├── mobilenetv3.py
    │   │   └── modules.py
    │   └── utils.py
    ├── nonparametric
    │   └── datastore.py
    ├── schedulers
    │   ├── __init__.py
    │   └── schedulers.py
    ├── trainers
    │   ├── __init__.py
    │   ├── base.py
    │   └── non_intrusive.py
    ├── utils
    │   ├── __init__.py
    │   ├── download.py
    │   ├── model_io.py
    │   ├── types.py
    │   └── utils.py
    └── warmup_lr.py
├── tools
    └── Makefile
└── utils
    ├── BENCHMARKS
    ├── calculate_metrics.py
    ├── combine_datasets.py
    ├── hf_download.py
    ├── parse_options.sh
    ├── queue.pl
    ├── run.pl
    └── subsample.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | exp/
132 | downloads/
133 | data/
134 | *.done
135 | *.wav
136 | *.txt
137 | egs/playground/
138 | egs/visualize_for_mos_bench_journal/
139 | egs/bvcc+nisqa/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Wen-Chin Huang (unilight)
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/assets/logo-sheet-only.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unilight/sheet/b0d1465b1b7d3c122f6e63d26c20873cd70f2e42/assets/logo-sheet-only.png


--------------------------------------------------------------------------------
/assets/mos-bench-table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unilight/sheet/b0d1465b1b7d3c122f6e63d26c20873cd70f2e42/assets/mos-bench-table.png


--------------------------------------------------------------------------------
/egs/BENCHMARKS/READMD.md:
--------------------------------------------------------------------------------
 1 | # Zero-shot evaluation on benchmarks
 2 | 
 3 | **NOTE**: Please do NOT run these recipes in this folder.
 4 | 
 5 | ## Usage
 6 | 
 7 | Let's say you want to benchmark on `vmc23` (which stands for the VoiceMOS Challenge 2023).
 8 | 
 9 | 1. You need to have a trained model in another recipe (e.g., `egs/bvcc`).
10 | 
11 | 2. Then, **IN THAT FOLDER**, execute the following:
12 | ```
13 | utils/BENCHMARKS/run_vmc23_test.sh --conf XXX.yaml --checkpoint YYY.ckpt
14 | ```
15 | 
16 | ## Recipe structure
17 | 
18 | All the scripts in this folder share the following stage structure:
19 | 
20 | - Stage -1: Dataset download. Please modify the `db_root` variable in each script to specify where to download the dataset.
21 | - Stage 0: Dataset preparation and csv file generation. They will be stored in `../<benchmark>/data`. (Ex. `../vmc23/data`).
22 | - Stage 1: Inference.


--------------------------------------------------------------------------------
/egs/BENCHMARKS/get_all_bencmarks.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright 2024 Wen-Chin Huang (Nagoya University)
 4 | #  MIT License (https://opensource.org/licenses/MIT)
 5 | 
 6 | stage=-1       # stage to start
 7 | stop_stage=0   # stage to stop
 8 | 
 9 | # shellcheck disable=SC1091
10 | . utils/parse_options.sh || exit 1;
11 | 
12 | set -euo pipefail
13 | 
14 | if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
15 |     echo "stage -1: Download data for all benchmark sets"
16 | 
17 |     _opts="--stage -1 --stop_stage -1 "  # plain assignment: start from a clean option list
18 | 
19 |     utils/BENCHMARKS/run_bvcc_test.sh ${_opts}
20 |     utils/BENCHMARKS/run_bc19_test.sh ${_opts}
21 |     utils/BENCHMARKS/run_somos_test.sh ${_opts}
22 |     utils/BENCHMARKS/run_singmos_test.sh ${_opts}
23 |     utils/BENCHMARKS/run_nisqa_test.sh ${_opts}
24 |     utils/BENCHMARKS/run_tmhint_qi_test.sh ${_opts}
25 |     utils/BENCHMARKS/run_vmc23_test.sh ${_opts}
26 | 
27 |     echo "Please follow instructions in bvcc, bc19 to finish the download process."
28 | fi
29 | 
30 | # _opts is rebuilt per stage so stage 0 never inherits the stage -1 flags; it stays unquoted below to split into words.
31 | if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
32 |     echo "stage 0: Data preparation for all benchmark sets"
33 | 
34 |     _opts="--stage 0 --stop_stage 0 "
35 | 
36 |     utils/BENCHMARKS/run_bvcc_test.sh ${_opts}
37 |     utils/BENCHMARKS/run_bc19_test.sh ${_opts}
38 |     utils/BENCHMARKS/run_somos_test.sh ${_opts}
39 |     utils/BENCHMARKS/run_singmos_test.sh ${_opts}
40 |     utils/BENCHMARKS/run_nisqa_test.sh ${_opts}
41 |     utils/BENCHMARKS/run_tmhint_qi_test.sh ${_opts}
42 |     utils/BENCHMARKS/run_vmc23_test.sh ${_opts}
43 | fi
44 | 


--------------------------------------------------------------------------------
/egs/BENCHMARKS/get_all_bencmarks_installation_free.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright 2024 Wen-Chin Huang (Nagoya University)
 4 | #  MIT License (https://opensource.org/licenses/MIT)
 5 | 
 6 | db_root=
 7 | datadir=data
 8 | 
 9 | stage=-1       # stage to start
10 | stop_stage=0   # stage to stop
11 | 
12 | # shellcheck disable=SC1091
13 | . utils/parse_options.sh || exit 1;
14 | 
15 | set -euo pipefail
16 | 
17 | if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
18 |     echo "stage -1: Download data for all benchmark sets"
19 | 
20 |     ../bvcc/local/data_download.sh ${db_root}/bvcc
21 |     ../bc19/local/data_download.sh ${db_root}/bc19
22 |     ../somos/local/data_download.sh ${db_root}/somos
23 |     ../singmos/local/data_download.sh ${db_root}/singmos
24 |     ../nisqa/local/data_download.sh ${db_root}/nisqa
25 |     ../tmhint-qi/local/data_download.sh ${db_root}/tmhint_qi
26 |     ../vmc23/local/data_download.sh ${db_root}/vmc23
27 | 
28 |     echo "Please follow instructions in bvcc, bc19 to finish the download process."
29 | fi
30 | 
31 | if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
32 |     echo "stage 0: Data preparation for all benchmark sets"
33 |     # Every dataset is read from ${db_root}/<name>, the same target stage -1 downloads into.
34 |     mkdir -p "${datadir}"
35 | 
36 |     # bvcc
37 |     echo "=== Data preparation for BVCC test ==="
38 |     ../bvcc/local/data_prep.py --avg-score-only \
39 |         --original-path "${db_root}/bvcc/main/DATA/sets/TESTSET" --wavdir "${db_root}/bvcc/main/DATA/wav" --out "${datadir}/bvcc_test.csv"
40 |     echo
41 | 
42 |     # bc19
43 |     echo "=== Data preparation for VMC'22 OOD track (bc'19) ==="
44 |     ../bc19/local/data_prep.py --avg-score-only \
45 |         --original-path "${db_root}/bc19/ood/DATA/sets/TESTSET" --wavdir "${db_root}/bc19/ood/DATA/wav" --out "${datadir}/bc19_test.csv"
46 |     echo
47 | 
48 |     # somos
49 |     echo "=== Data preparation for SOMOS test ==="
50 |     ../somos/local/data_prep.py --avg-score-only \
51 |         --original-path "${db_root}/somos/training_files/split1/clean/TESTSET" --wavdir "${db_root}/somos/audios" --out "${datadir}/somos_test.csv"
52 |     echo
53 | 
54 |     # singmos
55 |     echo "=== Data preparation for VMC'24 track 2 (SingMOS test) ==="
56 |     ../singmos/local/data_prep.py --avg-score-only \
57 |         --original-path "${db_root}/singmos/DATA/sets/eval_mos_list.txt" --wavdir "${db_root}/singmos/DATA/wav" --out "${datadir}/singmos_test.csv"
58 |     echo
59 | 
60 |     # nisqa
61 |     for test_set in LIVETALK FOR P501; do
62 |         echo "=== Data preparation for NISQA TEST ${test_set} ==="
63 |         ../nisqa/local/data_prep.py --avg-score-only \
64 |             --original-path "${db_root}/nisqa/NISQA_TEST_${test_set}/NISQA_TEST_${test_set}_file.csv" \
65 |             --wavdir "${db_root}/nisqa/NISQA_TEST_${test_set}/deg" \
66 |             --out "${datadir}/nisqa_${test_set}.csv"
67 |         echo
68 |     done
69 | 
70 |     # tmhint-qi
71 |     echo "=== Data preparation for TMHINT-QI test ==="
72 |     ../tmhint-qi/local/data_prep.py --avg-score-only \
73 |         --original-path "${db_root}/tmhint_qi/raw_data.csv" --wavdir "${db_root}/tmhint_qi/test" --setname "test" --out "${datadir}/tmhintqi_test.csv"
74 |     echo
75 | 
76 |     # vmc23 (track1a and track1b share the track1 data folder and answer file)
77 |     for track in track1a track1b track2 track3; do
78 |         echo "=== Data preparation for VMC'23 ${track} ==="
79 |         if [ "${track}" = "track1a" ] || [ "${track}" = "track1b" ]; then
80 |             _track=track1
81 |         else
82 |             _track="${track}"
83 |         fi
84 |         ../vmc23/local/data_prep.py --avg-score-only \
85 |             --original-path "${db_root}/vmc23/${_track}" \
86 |             --wavdir "${db_root}/vmc23/${_track}" \
87 |             --answer_path "../vmc23/answers/${_track}_answer.txt" \
88 |             --track "${track}" \
89 |             --out "${datadir}/vmc23_${track}_test.csv"
90 |         echo
91 |     done
92 | fi
93 | 


--------------------------------------------------------------------------------
/egs/BENCHMARKS/run_all_bencmarks.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright 2024 Wen-Chin Huang (Nagoya University)
 4 | #  MIT License (https://opensource.org/licenses/MIT)
 5 | 
 6 | conf=
 7 | tag=
 8 | np_inference_mode=
 9 | seed=
10 | checkpoint=
11 | 
12 | # shellcheck disable=SC1091
13 | . utils/parse_options.sh || exit 1;
14 | 
15 | set -euo pipefail
16 | 
17 | _opts=
18 | if [ ! -z ${np_inference_mode} ]; then
19 |     _opts+="--stage 3 --stop_stage 3 --np-inference-mode ${np_inference_mode} "
20 | else
21 |     _opts+="--stage 1 --stop_stage 1 "
22 | fi
23 | if [ ! -z ${tag} ]; then
24 |     _opts+="--tag ${tag} "
25 | fi
26 | 
27 | utils/BENCHMARKS/run_bvcc_test.sh --conf ${conf} --seed ${seed} --checkpoint ${checkpoint} ${_opts}
28 | utils/BENCHMARKS/run_bc19_test.sh --conf ${conf} --seed ${seed} --checkpoint ${checkpoint} ${_opts}
29 | utils/BENCHMARKS/run_somos_test.sh --conf ${conf} --seed ${seed} --checkpoint ${checkpoint} ${_opts}
30 | utils/BENCHMARKS/run_singmos_test.sh --conf ${conf} --seed ${seed} --checkpoint ${checkpoint} ${_opts}
31 | utils/BENCHMARKS/run_nisqa_test.sh --conf ${conf} --seed ${seed} --checkpoint ${checkpoint} ${_opts}
32 | utils/BENCHMARKS/run_tmhint_qi_test.sh --conf ${conf} --seed ${seed} --checkpoint ${checkpoint} ${_opts}
33 | utils/BENCHMARKS/run_vmc23_test.sh --conf ${conf} --seed ${seed} --checkpoint ${checkpoint} ${_opts}


--------------------------------------------------------------------------------
/egs/BENCHMARKS/run_all_bencmarks_np.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright 2024 Wen-Chin Huang (Nagoya University)
 4 | #  MIT License (https://opensource.org/licenses/MIT)
 5 | 
 6 | conf=
 7 | datadir=
 8 | 
 9 | # shellcheck disable=SC1091
10 | . utils/parse_options.sh || exit 1;
11 | 
12 | set -euo pipefail
13 | 
14 | if [ ! -z ${datadir} ]; then
15 |     utils/BENCHMARKS/run_bvcc_test.sh --stage 1 --stop_stage 1 --conf ${conf} --datadir ${datadir}
16 |     utils/BENCHMARKS/run_somos_test.sh --stage 1 --stop_stage 1  --conf ${conf} --datadir ${datadir}
17 |     utils/BENCHMARKS/run_singmos_test.sh --stage 1 --stop_stage 1  --conf ${conf} --datadir ${datadir}
18 |     utils/BENCHMARKS/run_nisqa_test.sh --stage 1 --stop_stage 1  --conf ${conf} --datadir ${datadir}
19 |     utils/BENCHMARKS/run_tmhint_qi_test.sh --stage 1 --stop_stage 1  --conf ${conf} --datadir ${datadir}
20 |     utils/BENCHMARKS/run_vmc23_test.sh --stage 2 --stop_stage 2  --conf ${conf}
21 | else
22 |     utils/BENCHMARKS/run_bvcc_test.sh --stage 1 --stop_stage 1 --conf ${conf}
23 |     utils/BENCHMARKS/run_somos_test.sh --stage 1 --stop_stage 1  --conf ${conf}
24 |     utils/BENCHMARKS/run_singmos_test.sh --stage 1 --stop_stage 1  --conf ${conf}
25 |     utils/BENCHMARKS/run_nisqa_test.sh --stage 1 --stop_stage 1  --conf ${conf}
26 |     utils/BENCHMARKS/run_tmhint_qi_test.sh --stage 1 --stop_stage 1  --conf ${conf}
27 |     utils/BENCHMARKS/run_vmc23_test.sh --stage 1 --stop_stage 1  --conf ${conf}
28 | fi


--------------------------------------------------------------------------------
/egs/BENCHMARKS/run_bc19_test.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # Copyright 2024 Wen-Chin Huang (Nagoya University)
  4 | #  MIT License (https://opensource.org/licenses/MIT)
  5 | 
  6 | . ./path.sh || exit 1;
  7 | . ./cmd.sh || exit 1;
  8 | 
  9 | # basic settings
 10 | stage=-1       # stage to start
 11 | stop_stage=100 # stage to stop
 12 | verbose=1      # verbosity level (lower is less info)
 13 | n_gpus=1       # number of gpus in training
 14 | seed=1337
 15 | 
 16 | conf=conf/ssl-mos-wav2vec2.yaml
 17 | meta_model_conf=conf/stacking_ridge.yaml
 18 | 
 19 | # dataset configuration
 20 | bc19_db_root=/data/group1/z44476r/Corpora/bc19/ood/DATA # change this to your dataset folder
 21 | datadir="../bc19/data"
 22 | domain_idx=0
 23 | target_sampling_rate=16000
 24 | 
 25 | # training related setting
 26 | tag=""     # tag for directory to save model
 27 |            
 28 | # decoding related setting
 29 | test_sets="bc19_test"
 30 | checkpoint=""               # checkpoint path to be used for decoding
 31 |                             # if not provided, the latest one will be used
 32 |                             # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
 33 | model_averaging="False"
 34 | use_stacking="False"
 35 | meta_model_checkpoint=""
 36 | np_inference_mode=
 37 |                                        
 38 | # shellcheck disable=SC1091
 39 | . utils/parse_options.sh || exit 1;
 40 | 
 41 | set -euo pipefail
 42 | 
 43 | if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 44 |     echo "stage -1: Data and Pretrained Model Download"
 45 | 
 46 |     ../bc19/local/data_download.sh ${bc19_db_root}
 47 | fi
 48 | 
 49 | 
 50 | mkdir -p "${datadir}"
 51 | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
 52 |     echo "stage 0: Data preparation"
 53 | 
 54 |     ../bc19/local/data_prep.py \
 55 |         --original-path "${bc19_db_root}/sets/TESTSET" --wavdir "${bc19_db_root}/wav" --out "${datadir}/bc19_test.csv" \
 56 |         --resample --target-sampling-rate "${target_sampling_rate}" --target-wavdir "${bc19_db_root}/wav_${target_sampling_rate}"
 57 | 
 58 | fi
 59 | 
 60 | if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
 61 |     echo "Stage 1: Inference"
 62 |     # Experiment directory is exp/<tag, or config basename without extension>-<seed>.
 63 | 
 64 |     if [ -z "${tag}" ]; then
 65 |         expname="$(basename "${conf%.*}")-${seed}"
 66 |     else
 67 |         expname="${tag}-${seed}"
 68 |     fi
 69 |     expdir="exp/${expname}"
 70 | 
 71 |     if [ "${use_stacking}" = "True" ]; then
 72 |         [ -z "${meta_model_checkpoint}" ] && meta_model_checkpoint="${expdir}/meta_model.pkl"
 73 |         outdir="${expdir}/results/stacking-model"
 74 |     elif [ "${model_averaging}" = "True" ]; then
 75 |         outdir="${expdir}/results/model-averaging"
 76 |     else
 77 |         [ -z "${checkpoint}" ] && checkpoint="${expdir}/checkpoint-best.pkl"
 78 |         outdir="${expdir}/results/$(basename "${checkpoint}" .pkl)"
 79 |     fi
 80 | 
 81 |     for name in ${test_sets}; do
 82 |         [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
 83 |         [ "${n_gpus}" -gt 1 ] && n_gpus=1  # inference is capped at one GPU
 84 |         echo "Inference start. See the progress via ${outdir}/${name}/inference.log."
 85 |         ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/inference.log" \
 86 |             inference.py \
 87 |                 --config "${expdir}/config.yml" \
 88 |                 --csv-path "${datadir}/${name}.csv" \
 89 |                 --checkpoint "${checkpoint}" \
 90 |                 --outdir "${outdir}/${name}" \
 91 |                 --model-averaging "${model_averaging}" \
 92 |                 --use-stacking "${use_stacking}" \
 93 |                 --meta-model-checkpoint "${meta_model_checkpoint}" \
 94 |                 --verbose "${verbose}"
 95 |         echo "Successfully finished inference of ${name} set."
 96 |         grep "UTT" "${outdir}/${name}/inference.log"
 97 |     done
 98 |     echo "Successfully finished inference."
 99 | fi
100 | 
101 | if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
102 |     echo "Stage 3: Non-parametric inference"
103 |     # NOTE(review): np_inference_mode defaults to empty and stop_stage to 100, so a default run reaches this stage with an empty mode -- confirm intended.
104 | 
105 |     if [ -z "${tag}" ]; then
106 |         expname="$(basename "${conf%.*}")-${seed}"
107 |     else
108 |         expname="${tag}-${seed}"
109 |     fi
110 |     expdir="exp/${expname}"
111 | 
112 |     [ -z "${checkpoint}" ] && checkpoint="${expdir}/checkpoint-best.pkl"
113 |     outdir="${expdir}/results/np_$(basename "${checkpoint}" .pkl)/${np_inference_mode}"
114 | 
115 |     for name in ${test_sets}; do
116 |         [ ! -e "${outdir}/${name}" ] && mkdir -p "${outdir}/${name}"
117 |         [ "${n_gpus}" -gt 1 ] && n_gpus=1  # inference is capped at one GPU
118 |         echo "Inference start. See the progress via ${outdir}/${name}/inference.log."
119 |         ${cuda_cmd} --gpu "${n_gpus}" "${outdir}/${name}/inference.log" \
120 |             nonparametric_inference.py \
121 |                 --config "${expdir}/config.yml" \
122 |                 --datastore "${expdir}/datastore/$(basename "${checkpoint}" .pkl)/datastore.h5" \
123 |                 --csv-path "${datadir}/${name}.csv" \
124 |                 --checkpoint "${checkpoint}" \
125 |                 --outdir "${outdir}/${name}" \
126 |                 --np-inference-mode "${np_inference_mode}" \
127 |                 --verbose "${verbose}"
128 |         echo "Successfully finished inference of ${name} set."
129 |         grep "UTT" "${outdir}/${name}/inference.log"
130 |     done
131 |     echo "Successfully finished inference."
132 | fi


--------------------------------------------------------------------------------
/egs/BENCHMARKS/utils:
--------------------------------------------------------------------------------
1 | ../../utils


--------------------------------------------------------------------------------
/egs/TEMPLATE/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20 | # These options are mapping to specific options for each backend and
21 | # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68 |     # To know the "partition" names, type "sinfo".
69 |     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # Assuming you can login them without any password, i.e. You have to set ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/TEMPLATE/conf/ldnet-ml.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: mag_sgram              # choices: wav, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "LDNet"
15 | model_params:
16 |     listener_emb_dim: 128
17 | 
18 |     activation: "ReLU"
19 |     encoder_type: "mobilenetv3"
20 |     encoder_bneck_configs:
21 |         - [16, 3, 16, 16, True, "RE", 3, 1]
22 |         - [16, 3, 72, 24, False, "RE", 3, 1]
23 |         - [24, 3, 88, 24, False, "RE", 1, 1]
24 |         - [24, 5, 96, 40, True, "HS", 3, 1]
25 |         - [40, 5, 240, 40, True, "HS", 1, 1]
26 |         - [40, 5, 240, 40, True, "HS", 1, 1]
27 |         - [40, 5, 120, 48, True, "HS", 1, 1]
28 |         - [48, 5, 144, 48, True, "HS", 1, 1]
29 |         - [48, 5, 288, 96, True, "HS", 3, 1] 
30 |         - [96, 5, 576, 96, True, "HS", 1, 1]
31 |         - [96, 5, 576, 96, True, "HS", 1, 1]
32 |     encoder_output_dim: 256
33 | 
34 |     decoder_type: "ffn"
35 |     decoder_dnn_dim: 64
36 |     output_type: "scalar"
37 |     range_clipping: True # this is needed if output_type is scalar
38 | 
39 |     use_mean_net: False
40 |     use_mean_listener: True
41 | 
42 | ###########################################################
43 | #                      LOSS SETTING                       #
44 | ###########################################################
45 | mean_score_criterion_type: null
46 | listener_score_criterion_type: "ScalarLoss"
47 | listener_score_criterion_params:
48 |     tau: 0.5
49 |     masked_loss: False
50 | listener_score_criterion_weight: 1.0
51 | 
52 | ###########################################################
53 | #                   INFERENCE SETTING                     #
54 | ###########################################################
55 | inference_mode: mean_listener   # this is used for
56 |                                 # (1) evaluation in the training loop
57 |                                 # (2) default inference mode
58 | 
59 | ###########################################################
60 | #                  DATA LOADER SETTING                    #
61 | ###########################################################
62 | train_batch_size: 60
63 | test_batch_size: 1
64 | padding_mode: "repetitive"  # repetitive, zero_padding
65 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
66 | num_workers: 0              # Number of workers in Pytorch DataLoader.
67 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
68 | 
69 | ###########################################################
70 | #             OPTIMIZER & SCHEDULER SETTING               #
71 | ###########################################################
72 | optimizer_type: RMSprop
73 | optimizer_params:
74 |     lr: 1.0e-3
75 |     # the following params come from
76 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
77 |     eps: 0.0316
78 |     alpha: 0.9
79 | grad_norm: 1.0              # Gradient norm.
80 | scheduler_type: stepLR
81 | scheduler_params:
82 |     step_size: 1000
83 |     gamma: 0.97
84 | 
85 | ###########################################################
86 | #                    INTERVAL SETTING                     #
87 | ###########################################################
88 | train_max_steps: 100000                 # Number of training steps.
89 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
90 | log_interval_steps: 100                 # Interval steps to record the training log.
91 | keep_nbest_models: 5                    # number of models to keep
92 | patience: 20                            # patience for early stopping
93 | best_model_criterion:                   # criterion to save the best models
94 |     key: sys_SRCC
95 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/TEMPLATE/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "SSLMOS"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     mean_net_dnn_dim: 64
22 |     mean_net_output_type: "scalar"
23 |     mean_net_range_clipping: True
24 | 
25 |     use_listener_modeling: False
26 |     use_mean_listener: False
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_net    # this is used for
44 |                             # (1) evaluation in the training loop
45 |                             # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | padding_mode: "repetitive"  # repetitive, zero_padding
53 | wav_only: True              # Reduce to the average only, even for the training data.
54 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
55 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
56 | num_workers: 0              # Number of workers in Pytorch DataLoader.
57 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
58 | 
59 | ###########################################################
60 | #             OPTIMIZER & SCHEDULER SETTING               #
61 | ###########################################################
62 | optimizer_type: SGD
63 | optimizer_params:
64 |     lr: 1.0e-3
65 |     # the following params come from
66 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
67 |     momentum: 0.9
68 | grad_norm: 1.0              # Gradient norm.
69 | scheduler_type: null
70 | 
71 | ###########################################################
72 | #                    INTERVAL SETTING                     #
73 | ###########################################################
74 | train_max_steps: 100000                 # Number of training steps.
75 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
76 | log_interval_steps: 100                 # Interval steps to record the training log.
77 | keep_nbest_models: 5                    # number of models to keep
78 | patience: 20                            # patience for early stopping
79 | best_model_criterion:                   # criterion to save the best models
80 |     key: sys_SRCC
81 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/TEMPLATE/local/data_prep.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | """Data preparation for BVCC."""
 8 | 
 9 | import argparse
10 | import csv
11 | import logging
12 | import os
13 | import sys
14 | 
15 | from sheet.utils import read_csv
16 | 
17 | 
18 | def main():
19 |     """Run training process."""
20 |     parser = argparse.ArgumentParser()
21 |     parser.add_argument(
22 |         "--original-path",
23 |         required=True,
24 |         type=str,
25 |         help=("original csv file path."),
26 |     )
27 |     parser.add_argument(
28 |         "--wavdir",
29 |         required=True,
30 |         type=str,
31 |         help=(
32 |             "directory of the waveform files. This is needed because wav paths in BVCC metadata files do not contain the wav directory."
33 |         ),
34 |     )
35 |     parser.add_argument(
36 |         "--out",
37 |         required=True,
38 |         type=str,
39 |         help=("output csv file path."),
40 |     )
41 |     parser.add_argument("--generate-listener-id", action="store_true")
42 |     args = parser.parse_args()
43 | 
44 |     # set logger
45 |     logging.basicConfig(
46 |         level=logging.INFO,
47 |         stream=sys.stdout,
48 |         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
49 |     )
50 | 
51 |     # read csv
52 |     logging.info("Reading original csv file.")
53 |     filelist, _ = read_csv(args.original_path)
54 | 
55 |     # prepare. each line looks like this:
56 |     # sys64e2f,sys64e2f-utt9c183cd.wav,4,VDP1ovyrBzg8_1,{}_30-39_bZPQE7w4Zl3g_Female_Valid_1_No
57 |     logging.info("Preparing metadata.")
58 |     metadata = []
59 |     listener_idxs, count = {}, 0
60 |     for line in filelist:
61 |         if len(line) == 0:
62 |             continue
63 |         system_id = line[0]
64 |         wav_path = line[1]
65 |         sample_id = os.path.splitext(wav_path.split("-")[1])[0]
66 |         score = int(line[2])
67 |         listener_id = line[4]
68 |         item = {
69 |             "wav_path": os.path.join(args.wavdir, wav_path),
70 |             "score": score,
71 |             "system_id": system_id,
72 |             "sample_id": sample_id,
73 |             "listener_id": listener_id,
74 |         }
75 |         if args.generate_listener_id:
76 |             if not listener_id in listener_idxs:
77 |                 listener_idxs[listener_id] = count
78 |                 count += 1
79 |             item["listener_idx"] = listener_idxs[listener_id]
80 |         metadata.append(item)
81 | 
82 |     # write csv
83 |     logging.info("Writing output csv file.")
84 |     fieldnames = ["wav_path", "score", "system_id", "sample_id", "listener_id"]
85 |     if args.generate_listener_id:
86 |         fieldnames.append("listener_idx")
87 |     with open(args.out, "w", newline="") as csvfile:
88 |         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
89 |         writer.writeheader()
90 |         for line in metadata:
91 |             writer.writerow(line)
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     main()
96 | 


--------------------------------------------------------------------------------
/egs/TEMPLATE/path.sh:
--------------------------------------------------------------------------------
 1 | # Project root: two levels above the current working directory.
 2 | export PRJ_ROOT="${PWD}/../.."
 3 | # Activate the project's virtualenv when it exists.
 4 | # shellcheck disable=SC1090
 5 | [ ! -e "${PRJ_ROOT}/tools/venv/bin/activate" ] || . "${PRJ_ROOT}/tools/venv/bin/activate"
 6 | 
 7 | # Put the scripts under sheet/bin on PATH.
 8 | MAIN_ROOT="${PWD}/../.."
 9 | export PATH="${MAIN_ROOT}/sheet/bin:${PATH}"
10 | 
11 | # Python-related environment.
12 | export OMP_NUM_THREADS=1
13 | export PYTHONIOENCODING=UTF-8
14 | export MPL_BACKEND=Agg
15 | 


--------------------------------------------------------------------------------
/egs/TEMPLATE/run.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # Copyright 2024 Wen-Chin Huang (Nagoya University)
  4 | #  MIT License (https://opensource.org/licenses/MIT)
  5 | 
  6 | . ./path.sh || exit 1;
  7 | . ./cmd.sh || exit 1;
  8 | 
  9 | # basic settings
 10 | stage=-1       # stage to start
 11 | stop_stage=100 # stage to stop
 12 | verbose=1      # verbosity level (lower is less info)
 13 | n_gpus=1       # number of gpus in training
 14 | n_jobs=16      # number of parallel jobs in feature extraction
 15 | seed=1337      # passed to train.py as --seed and appended to the experiment name
 16 | 
 17 | conf=conf/ssl-mos-wav2vec2.yaml  # training config; its base name is the default experiment name
 18 | 
 19 | # dataset configuration
 20 | db_root=/data/group1/z44476r/Corpora/BVCC/main/DATA  # must contain sets/ and wav/ (read in stage 0)
 21 | target_sampling_rate=16000
 22 | 
 23 | # training related setting
 24 | tag=""     # tag for directory to save model
 25 | resume=""  # checkpoint path to resume training
 26 |            # (e.g. <path>/<to>/checkpoint-10000steps.pkl)
 27 | 
 28 | # decoding related setting
 29 | test_sets="dev test"        # data/<name>.csv is decoded for each name
 30 | checkpoint=""               # checkpoint path to be used for decoding
 31 |                             # if not provided, ${expdir}/checkpoint-best.pkl is used
 32 |                             # (e.g. <path>/<to>/checkpoint-400000steps.pkl)
 33 | model_averaging="False"     # "True": results are written to results/model-averaging
 34 | use_stacking="False"        # "True": results are written to results/stacking-model
 35 | meta_model_checkpoint=""    # if empty and stacking is used: ${expdir}/meta_model.pkl
 36 | 
 37 | # shellcheck disable=SC1091
 38 | . utils/parse_options.sh || exit 1;
 39 | 
 40 | set -euo pipefail
 41 | 
 42 | if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
 43 |     echo "stage -1: Data and Pretrained Model Download"
 44 |     # Nothing is downloaded by this template yet.
 45 | fi
 46 | 
 47 | mkdir -p "data"
 48 | if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
 49 |     echo "stage 0: Data preparation"
 50 | 
 51 |     # Convert each original metadata file to the unified csv format; listener
 52 |     # indices are generated for the training set only.
 53 |     for split in TRAINSET:train DEVSET:dev TESTSET:test; do
 54 |         opts=""
 55 |         if [ "${split%%:*}" = "TRAINSET" ]; then opts="--generate-listener-id"; fi
 56 |         local/data_prep.py ${opts} \
 57 |             --original-path "${db_root}/sets/${split%%:*}" \
 58 |             --wavdir "${db_root}/wav" --out "data/${split##*:}.csv"
 59 |     done
 60 | fi
 61 | 
 62 | if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
 63 |     echo "stage 1: Feature extraction"
 64 |     echo "No feature extraction needed currently"
 65 | fi
 66 | 
 67 | if [ -z "${tag}" ]; then
 68 |     expname="$(basename "${conf%.*}")-${seed}"
 69 | else
 70 |     expname="${tag}-${seed}"
 71 | fi
 72 | expdir="exp/${expname}"
 73 | if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
 74 |     echo "Stage 2: Network training"
 75 |     mkdir -p "${expdir}"
 76 |     if [ "${n_gpus}" -gt 1 ]; then
 77 |         # Abort here: no multi-GPU launcher exists, and continuing would only
 78 |         # fail later with an unrelated "unbound variable" error under "set -u".
 79 |         echo "$0: Error: multi-GPU training is not implemented yet." 1>&2
 80 |         exit 1
 81 |     fi
 82 |     echo "Training start. See the progress via ${expdir}/train.log."
 83 |     ${cuda_cmd} --gpu "${n_gpus}" "${expdir}/train.log" \
 84 |         train.py \
 85 |             --config "${conf}" \
 86 |             --train-csv-path "data/train.csv" \
 87 |             --dev-csv-path "data/dev.csv" \
 88 |             --outdir "${expdir}" \
 89 |             --resume "${resume}" \
 90 |             --verbose "${verbose}" \
 91 |             --seed "${seed}"
 92 |     echo "Successfully finished training."
 93 | fi
 94 | 
 95 | if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
 96 |     echo "Stage 3: Inference"
 97 |     # Pick the result directory (and default checkpoints) per inference mode.
 98 |     if [ "${use_stacking}" = "True" ]; then
 99 |         meta_model_checkpoint="${meta_model_checkpoint:-${expdir}/meta_model.pkl}"
100 |         outdir="${expdir}/results/stacking-model"
101 |     elif [ "${model_averaging}" = "True" ]; then
102 |         outdir="${expdir}/results/model-averaging"
103 |     else
104 |         checkpoint="${checkpoint:-${expdir}/checkpoint-best.pkl}"
105 |         outdir="${expdir}/results/$(basename "${checkpoint}" .pkl)"
106 |     fi
107 | 
108 |     for name in ${test_sets}; do
109 |         setdir="${outdir}/${name}"
110 |         if [ ! -e "${setdir}" ]; then mkdir -p "${setdir}"; fi
111 |         if [ "${n_gpus}" -gt 1 ]; then n_gpus=1; fi
112 |         echo "Inference start. See the progress via ${setdir}/inference.log."
113 |         ${cuda_cmd} --gpu "${n_gpus}" "${setdir}/inference.log" \
114 |             inference.py \
115 |                 --config "${expdir}/config.yml" \
116 |                 --csv-path "data/${name}.csv" \
117 |                 --checkpoint "${checkpoint}" \
118 |                 --outdir "${setdir}" \
119 |                 --model-averaging "${model_averaging}" \
120 |                 --use-stacking "${use_stacking}" \
121 |                 --meta-model-checkpoint "${meta_model_checkpoint}" \
122 |                 --verbose "${verbose}"
123 |         echo "Successfully finished inference of ${name} set."
124 |         grep "UTT" "${setdir}/inference.log"
125 |     done
126 |     echo "Successfully finished inference."
127 | fi


--------------------------------------------------------------------------------
/egs/TEMPLATE/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/bc19/local/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | db=$1
 8 | 
 9 | # download dataset
10 | cwd=`pwd`
11 | if [ ! -e ${db}/bc19.done ]; then
12 |     mkdir -p ${db}
13 |     cd ${db}
14 |     wget https://zenodo.org/records/6572573/files/ood.tar.gz
15 |     tar zxvf ood.tar.gz
16 |     rm ood.tar.gz
17 |     cd $cwd
18 |     echo "Successfully finished download. Please follow the instructions."
19 |     touch ${db}/bc19.done
20 | else
21 |     echo "Already exists. Skip download."
22 | fi
23 | 


--------------------------------------------------------------------------------
/egs/bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
20 | # These options are mapped to backend-specific options, and the mapping
21 | # is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
68 |     # To know the "partion" names, type "sinfo".
69 |     # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # Assuming you can login them without any password, i.e. You have to set ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/conf/alignnet-wav2vec2-mdf.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                  FINE-TUNING SETTING                    #
 3 | ###########################################################
 4 | init-mods: ["ssl_model", "domain_embeddings", "decoder_rnn", "decoder_dnn"]
 5 | freeze-mods: null
 6 | 
 7 | ###########################################################
 8 | #              NETWORK ARCHITECTURE SETTING               #
 9 | ###########################################################
10 | model_type: "AlignNet"
11 | model_params:
12 |     ssl_module: "s3prl"
13 |     s3prl_name: "wav2vec2"
14 |     ssl_model_output_dim: 768
15 |     ssl_model_layer_idx: -1
16 | 
17 |     use_domain_modeling: True
18 |     domain_emb_dim: 128
19 | 
20 |     decoder_dnn_dim: 64
21 |     output_type: "scalar"
22 |     range_clipping: True # this is needed if output_type is scalar
23 | 
24 | ###########################################################
25 | #                      LOSS SETTING                       #
26 | ###########################################################
27 | mean_score_criterions:
28 |     - criterion_type: "ScalarLoss"
29 |       criterion_weight: 1.0
30 |       criterion_params:
31 |         order: 1
32 |         tau: 0.5
33 |         masked_loss: False
34 | listener_score_criterions: null
35 | 
36 | ###########################################################
37 | #                   INFERENCE SETTING                     #
38 | ###########################################################
39 | inference_mode: mean_listener   # this is used for
40 |                                 # (1) evaluation in the training loop
41 |                                 # (2) default inference mode
42 | 
43 | ###########################################################
44 | #                  DATA LOADER SETTING                    #
45 | ###########################################################
46 | dev_samples_per_eval_loop: 1000
47 | allow_cache: false           # Whether to allow cache in dataset. If true, it requires cpu memory.
48 | 
49 | ###########################################################
50 | #             OPTIMIZER & SCHEDULER SETTING               #
51 | ###########################################################
52 | optimizer_type: SGD
53 | optimizer_params:
54 |     lr: 1.0e-3
55 |     # the following params come from
56 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
57 |     momentum: 0.9
58 | grad_norm: 1.0              # Gradient norm.
59 | scheduler_type: null
60 | 
61 | ###########################################################
62 | #                    INTERVAL SETTING                     #
63 | ###########################################################
64 | train_max_steps: 100000                 # Number of training steps.
65 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
66 | log_interval_steps: 100                 # Interval steps to record the training log.
67 | keep_nbest_models: 5                    # number of models to keep
68 | patience: 20                            # patience for early stopping
69 | best_model_criterion:                   # criterion to save the best models
70 |     key: utt_SRCC
71 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/conf/alignnet-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "AlignNet"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     use_domain_modeling: True
22 |     domain_emb_dim: 128
23 | 
24 |     decoder_dnn_dim: 64
25 |     output_type: "scalar"
26 |     range_clipping: True # this is needed if output_type is scalar
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_listener   # this is used for
44 |                                 # (1) evaluation in the training loop
45 |                                 # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | dev_samples_per_eval_loop: 1000
53 | padding_mode: "repetitive"  # repetitive, zero_padding
54 | wav_only: True              # Reduce to the average only, even for the training data.
55 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
56 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
57 | num_workers: 0              # Number of workers in Pytorch DataLoader.
58 | allow_cache: false           # Whether to allow cache in dataset. If true, it requires cpu memory.
59 | 
60 | ###########################################################
61 | #             OPTIMIZER & SCHEDULER SETTING               #
62 | ###########################################################
63 | optimizer_type: SGD
64 | optimizer_params:
65 |     lr: 1.0e-3
66 |     # the following params come from
67 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
68 |     momentum: 0.9
69 | grad_norm: 1.0              # Gradient norm.
70 | scheduler_type: null
71 | 
72 | ###########################################################
73 | #                    INTERVAL SETTING                     #
74 | ###########################################################
75 | train_max_steps: 100000                 # Number of training steps.
76 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
77 | log_interval_steps: 100                 # Interval steps to record the training log.
78 | keep_nbest_models: 5                    # number of models to keep
79 | patience: 20                            # patience for early stopping
80 | best_model_criterion:                   # criterion to save the best models
81 |     key: utt_SRCC
82 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/conf/ssl-mos-wav2vec2-mdf.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                  FINE-TUNING SETTING                    #
 3 | ###########################################################
 4 | init-mods: ["ssl_model", "mean_net_dnn"]
 5 | freeze-mods: null
 6 | 
 7 | ###########################################################
 8 | #                      LOSS SETTING                       #
 9 | ###########################################################
10 | mean_score_criterions:
11 |     - criterion_type: "ScalarLoss"
12 |       criterion_weight: 1.0
13 |       criterion_params:
14 |         order: 1
15 |         tau: 0.5
16 |         masked_loss: False
17 | listener_score_criterions: null
18 | 
19 | ###########################################################
20 | #                  DATA LOADER SETTING                    #
21 | ###########################################################
22 | dev_samples_per_eval_loop: 1000
23 | allow_cache: false           # Whether to allow cache in dataset. If true, it requires cpu memory.
24 | 
25 | ###########################################################
26 | #             OPTIMIZER & SCHEDULER SETTING               #
27 | ###########################################################
28 | optimizer_type: SGD
29 | optimizer_params:
30 |     lr: 1.0e-3
31 |     # the following params come from
32 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
33 |     momentum: 0.9
34 | grad_norm: 1.0              # Gradient norm.
35 | scheduler_type: null
36 | 
37 | ###########################################################
38 | #                    INTERVAL SETTING                     #
39 | ###########################################################
40 | train_max_steps: 100000                 # Number of training steps.
41 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
42 | log_interval_steps: 100                 # Interval steps to record the training log.
43 | keep_nbest_models: 5                    # number of models to keep
44 | patience: 20                            # patience for early stopping
45 | best_model_criterion:                   # criterion to save the best models
46 |     key: utt_SRCC
47 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "SSLMOS"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     mean_net_dnn_dim: 64
22 |     mean_net_output_type: "scalar"
23 |     mean_net_range_clipping: True
24 | 
25 |     use_listener_modeling: False
26 |     use_mean_listener: False
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_net    # this is used for
44 |                             # (1) evaluation in the training loop
45 |                             # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | dev_samples_per_eval_loop: 1000
53 | padding_mode: "repetitive"  # repetitive, zero_padding
54 | wav_only: True              # Reduce to average only even for the training data.
55 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS)
56 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
57 | num_workers: 0              # Number of workers in Pytorch DataLoader.
58 | allow_cache: false           # Whether to allow cache in dataset. If true, it requires cpu memory.
59 | 
60 | ###########################################################
61 | #             OPTIMIZER & SCHEDULER SETTING               #
62 | ###########################################################
63 | optimizer_type: SGD
64 | optimizer_params:
65 |     lr: 1.0e-3
66 |     # the following params come from
67 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
68 |     momentum: 0.9
69 | grad_norm: 1.0              # Gradient norm.
70 | scheduler_type: null
71 | 
72 | ###########################################################
73 | #                    INTERVAL SETTING                     #
74 | ###########################################################
75 | train_max_steps: 100000                 # Number of training steps.
76 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
77 | log_interval_steps: 100                 # Interval steps to record the training log.
78 | keep_nbest_models: 5                    # number of models to keep
79 | patience: 20                            # patience for early stopping
80 | best_model_criterion:                   # criterion to save the best models
81 |     key: utt_SRCC
82 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/local/data_prep.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | """Data preparation for multiple datasets."""
 8 | 
 9 | import argparse
10 | import csv
11 | import logging
12 | import sys
13 | 
14 | from sheet.utils import read_csv
15 | 
16 | 
17 | def main():
18 |     """Merge several dataset csv files into one csv, tagging each row with a domain index."""
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument(
21 |         "--original-paths",
22 |         nargs="+",
23 |         required=True,
24 |         type=str,
25 |         help=("original csv file paths."),
26 |     )
27 |     parser.add_argument(
28 |         "--out",
29 |         required=True,
30 |         type=str,
31 |         help=("output csv file path."),
32 |     )
33 |     args = parser.parse_args()
34 | 
35 |     # set logger (INFO level, written to stdout)
36 |     logging.basicConfig(
37 |         level=logging.INFO,
38 |         stream=sys.stdout,
39 |         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
40 |     )
41 | 
42 |     # read every input csv; rows of the N-th file (0-based, in --original-paths order) get domain index N
43 |     logging.info("Reading original csv files.")
44 |     metadata = []
45 |     count = 0
46 |     for original_path in args.original_paths:
47 |         filelist, _ = read_csv(original_path, dict_reader=True)  # second return value is unused here
48 |         for line in filelist:
49 |             if len(line) == 0:  # skip empty rows
50 |                 continue
51 |             line["domain_idx"] = count  # position of the source file in --original-paths
52 |             line["system_id"] = f"{count}_{line['system_id']}"  # prefix ids with the domain index,
53 |             line["sample_id"] = f"{count}_{line['sample_id']}"  # presumably to keep them unique across datasets
54 |             metadata.append(line)
55 |         count += 1
56 | 
57 |     # write csv; extrasaction='ignore' drops any input columns that are not listed in fieldnames
58 |     logging.info("Writing output csv file.")
59 |     fieldnames = ["wav_path", "score", "system_id", "sample_id", "domain_idx"]
60 |     with open(args.out, "w", newline="") as csvfile:
61 |         writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
62 |         writer.writeheader()
63 |         for line in metadata:
64 |             writer.writerow(line)
65 | 
66 | if __name__ == "__main__":
67 |     main()
68 | 


--------------------------------------------------------------------------------
/egs/bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/path.sh:
--------------------------------------------------------------------------------
 1 | # path related
 2 | export PRJ_ROOT="${PWD}/../.."
 3 | if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
 4 |     # shellcheck disable=SC1090
 5 |     . "${PRJ_ROOT}/tools/venv/bin/activate"
 6 | fi
 7 | 
 8 | MAIN_ROOT=$PWD/../..
 9 | export PATH=$MAIN_ROOT/sheet/bin:$PATH
10 | 
11 | # python related
12 | export OMP_NUM_THREADS=1
13 | export PYTHONIOENCODING=UTF-8
14 | export MPL_BACKEND=Agg
15 | 


--------------------------------------------------------------------------------
/egs/bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/bvcc/README.md:
--------------------------------------------------------------------------------
 1 | # BVCC
 2 | 
 3 | ## Supported models
 4 | 
 5 | - LDNet (`conf/ldnet-ml.yaml`)
 6 | - SSL-MOS (`conf/ssl-mos-wav2vec2.yaml`)
 7 | - UTMOS Strong (`conf/utmos-strong.yaml`)
 8 | 
 9 | ## Notes
10 | 
11 | - By default, the phonemes and references provided by the UTMOS authors are always directly downloaded. Currently we have not supported transcribing datasets other than UTMOS.


--------------------------------------------------------------------------------
/egs/bvcc/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20 | # These options are mapping to specific options for each backend and
21 | # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68 |     # To know the "partition" names, type "sinfo".
69 |     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # Assuming you can login them without any password, i.e. You have to set ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/bvcc/conf/ldnet-ml.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: mag_sgram              # choices: waveform, mag_sgram
10 | use_phoneme: False                  # only True for UTMOS
11 | 
12 | ###########################################################
13 | #              NETWORK ARCHITECTURE SETTING               #
14 | ###########################################################
15 | model_type: "LDNet"
16 | model_params:
17 |     listener_emb_dim: 128
18 | 
19 |     activation: "ReLU"
20 |     encoder_type: "mobilenetv3"
21 |     encoder_bneck_configs:
22 |         - [16, 3, 16, 16, True, "RE", 3, 1]
23 |         - [16, 3, 72, 24, False, "RE", 3, 1]
24 |         - [24, 3, 88, 24, False, "RE", 1, 1]
25 |         - [24, 5, 96, 40, True, "HS", 3, 1]
26 |         - [40, 5, 240, 40, True, "HS", 1, 1]
27 |         - [40, 5, 240, 40, True, "HS", 1, 1]
28 |         - [40, 5, 120, 48, True, "HS", 1, 1]
29 |         - [48, 5, 144, 48, True, "HS", 1, 1]
30 |         - [48, 5, 288, 96, True, "HS", 3, 1] 
31 |         - [96, 5, 576, 96, True, "HS", 1, 1]
32 |         - [96, 5, 576, 96, True, "HS", 1, 1]
33 |     encoder_output_dim: 256
34 | 
35 |     decoder_type: "ffn"
36 |     decoder_dnn_dim: 64
37 |     output_type: "scalar"
38 |     range_clipping: True # this is needed if output_type is scalar
39 | 
40 |     use_mean_net: False
41 |     use_mean_listener: True
42 | 
43 | ###########################################################
44 | #                      LOSS SETTING                       #
45 | ###########################################################
46 | mean_score_criterions: null
47 | listener_score_criterions:
48 |     - criterion_type: "ScalarLoss"
49 |       criterion_weight: 1.0
50 |       criterion_params:
51 |         order: 2
52 |         tau: 0.5
53 |         masked_loss: False
54 | 
55 | ###########################################################
56 | #                   INFERENCE SETTING                     #
57 | ###########################################################
58 | inference_mode: mean_listener   # this is used for
59 |                                 # (1) evaluation in the training loop
60 |                                 # (2) default inference mode
61 | 
62 | ###########################################################
63 | #                  DATA LOADER SETTING                    #
64 | ###########################################################
65 | train_batch_size: 60
66 | test_batch_size: 1
67 | padding_mode: "repetitive"  # repetitive, zero_padding
68 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
69 | num_workers: 0              # Number of workers in Pytorch DataLoader.
70 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
71 | 
72 | ###########################################################
73 | #             OPTIMIZER & SCHEDULER SETTING               #
74 | ###########################################################
75 | optimizer_type: RMSprop
76 | optimizer_params:
77 |     lr: 1.0e-3
78 |     # the following params come from
79 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
80 |     eps: 0.0316
81 |     alpha: 0.9
82 | grad_norm: 1.0              # Gradient norm.
83 | scheduler_type: stepLR
84 | scheduler_params:
85 |     step_size: 1000
86 |     gamma: 0.97
87 | 
88 | ###########################################################
89 | #                    INTERVAL SETTING                     #
90 | ###########################################################
91 | train_max_steps: 100000                 # Number of training steps.
92 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
93 | log_interval_steps: 100                 # Interval steps to record the training log.
94 | keep_nbest_models: 5                    # number of models to keep
95 | patience: 20                            # patience for early stopping
96 | best_model_criterion:                   # criterion to save the best models
97 |     key: sys_SRCC
98 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/bvcc/conf/ssl-mos-wav2vec2-categorical.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | use_phoneme: False                  # only True for UTMOS
11 | 
12 | ###########################################################
13 | #              NETWORK ARCHITECTURE SETTING               #
14 | ###########################################################
15 | model_type: "SSLMOS"
16 | model_params:
17 |     ssl_module: "s3prl"
18 |     s3prl_name: "wav2vec2"
19 |     ssl_model_output_dim: 768
20 |     ssl_model_layer_idx: -1
21 | 
22 |     mean_net_dnn_dim: 64
23 |     mean_net_output_type: "categorical"
24 |     mean_net_output_dim: 17 # formula: (5 - 1) / categorical_step + 1
25 |     mean_net_output_step: 0.25
26 |     mean_net_range_clipping: True
27 | 
28 |     use_listener_modeling: False
29 |     use_mean_listener: False
30 |     
31 | ###########################################################
32 | #                      LOSS SETTING                       #
33 | ###########################################################
34 | mean_score_criterions:
35 |     - criterion_type: "CategoricalLoss"
36 |       criterion_weight: 1.0
37 |       criterion_params:
38 |         masked_loss: False
39 | listener_score_criterions: null
40 | 
41 | ###########################################################
42 | #                   INFERENCE SETTING                     #
43 | ###########################################################
44 | inference_mode: mean_net    # this is used for
45 |                             # (1) evaluation in the training loop
46 |                             # (2) default inference mode
47 | 
48 | ###########################################################
49 | #                  DATA LOADER SETTING                    #
50 | ###########################################################
51 | train_batch_size: 16
52 | test_batch_size: 1
53 | padding_mode: "repetitive"  # repetitive, zero_padding
54 | wav_only: True              # Reduce to average only even for the training data.
55 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS)
56 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
57 | num_workers: 0              # Number of workers in Pytorch DataLoader.
58 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
59 | categorical: true
60 | categorical_step: 0.25
61 | 
62 | ###########################################################
63 | #             OPTIMIZER & SCHEDULER SETTING               #
64 | ###########################################################
65 | optimizer_type: SGD
66 | optimizer_params:
67 |     lr: 1.0e-3
68 |     # the following params come from
69 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
70 |     momentum: 0.9
71 | grad_norm: 1.0              # Gradient norm.
72 | scheduler_type: null
73 | 
74 | ###########################################################
75 | #                    INTERVAL SETTING                     #
76 | ###########################################################
77 | train_max_steps: 100000                 # Number of training steps.
78 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
79 | log_interval_steps: 100                 # Interval steps to record the training log.
80 | keep_nbest_models: 5                    # number of models to keep
81 | patience: 20                            # patience for early stopping
82 | best_model_criterion:                   # criterion to save the best models
83 |     key: sys_SRCC
84 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/bvcc/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | use_phoneme: False                  # only True for UTMOS
11 | 
12 | ###########################################################
13 | #              NETWORK ARCHITECTURE SETTING               #
14 | ###########################################################
15 | model_type: "SSLMOS"
16 | model_params:
17 |     ssl_module: "s3prl"
18 |     s3prl_name: "wav2vec2"
19 |     ssl_model_output_dim: 768
20 |     ssl_model_layer_idx: -1
21 | 
22 |     mean_net_dnn_dim: 64
23 |     mean_net_output_type: "scalar"
24 |     mean_net_range_clipping: True
25 | 
26 |     use_listener_modeling: False
27 |     use_mean_listener: False
28 |     
29 | ###########################################################
30 | #                      LOSS SETTING                       #
31 | ###########################################################
32 | mean_score_criterions:
33 |     - criterion_type: "ScalarLoss"
34 |       criterion_weight: 1.0
35 |       criterion_params:
36 |         order: 1
37 |         tau: 0.5
38 |         masked_loss: False
39 | listener_score_criterions: null
40 | 
41 | ###########################################################
42 | #                   INFERENCE SETTING                     #
43 | ###########################################################
44 | inference_mode: mean_net    # this is used for
45 |                             # (1) evaluation in the training loop
46 |                             # (2) default inference mode
47 | 
48 | ###########################################################
49 | #                  DATA LOADER SETTING                    #
50 | ###########################################################
51 | train_batch_size: 16
52 | test_batch_size: 1
53 | padding_mode: "repetitive"  # repetitive, zero_padding
54 | wav_only: True              # Reduce to average only even for the training data.
55 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS)
56 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
57 | num_workers: 0              # Number of workers in Pytorch DataLoader.
58 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
59 | 
60 | ###########################################################
61 | #             OPTIMIZER & SCHEDULER SETTING               #
62 | ###########################################################
63 | optimizer_type: SGD
64 | optimizer_params:
65 |     lr: 1.0e-3
66 |     # the following params come from
67 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
68 |     momentum: 0.9
69 | grad_norm: 1.0              # Gradient norm.
70 | scheduler_type: null
71 | 
72 | ###########################################################
73 | #                    INTERVAL SETTING                     #
74 | ###########################################################
75 | train_max_steps: 100000                 # Number of training steps.
76 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
77 | log_interval_steps: 100                 # Interval steps to record the training log.
78 | keep_nbest_models: 5                    # number of models to keep
79 | patience: 20                            # patience for early stopping
80 | best_model_criterion:                   # criterion to save the best models
81 |     key: sys_SRCC
82 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/bvcc/conf/stacking_ridge.yaml:
--------------------------------------------------------------------------------
1 | meta_model_type: "Ridge"
2 | meta_model_params:
3 |   alpha: 1.0
4 | 


--------------------------------------------------------------------------------
/egs/bvcc/conf/utmos-strong.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | use_phoneme: True                   # only True for UTMOS
11 | symbols: >
12 |     _;:,.!?¡¿—…"«»“” ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ'̪'̃
13 | 
14 | ###########################################################
15 | #              NETWORK ARCHITECTURE SETTING               #
16 | ###########################################################
17 | model_type: "UTMOS"
18 | model_params:
19 |     ssl_module: "s3prl"
20 |     s3prl_name: "wav2vec2"
21 |     ssl_model_output_dim: 768
22 |     ssl_model_layer_idx: -1
23 | 
24 |     use_phoneme: True
25 |     phoneme_encoder_dim: 256
26 |     phoneme_encoder_emb_dim: 256
27 |     phoneme_encoder_out_dim: 256
28 |     phoneme_encoder_n_lstm_layers: 3
29 |     phoneme_encoder_vocab_size: 300 # this is a dummy number that is set to be much larger than the actual vocab
30 |     use_reference: True
31 | 
32 |     use_listener_modeling: True
33 |     listener_emb_dim: 128
34 |     use_mean_listener: True
35 | 
36 |     decoder_dnn_dim: 2048
37 |     output_type: "scalar"
38 |     range_clipping: True # this is needed if output_type is scalar
39 |     
40 |     
41 | ###########################################################
42 | #                      LOSS SETTING                       #
43 | ###########################################################
44 | mean_score_criterions: null
45 | listener_score_criterions:
46 |     - criterion_type: "ScalarLoss"
47 |       criterion_weight: 1.0
48 |       criterion_params:
49 |         order: 2
50 |         tau: 0.25
51 |         masked_loss: False
52 |     - criterion_type: "ContrastiveLoss"
53 |       criterion_weight: 0.5
54 |       criterion_params:
55 |         margin: 0.1
56 | 
57 | ###########################################################
58 | #                   INFERENCE SETTING                     #
59 | ###########################################################
60 | inference_mode: mean_listener   # this is used for
61 |                                 # (1) evaluation in the training loop
62 |                                 # (2) default inference mode
63 | 
64 | ###########################################################
65 | #                  DATA LOADER SETTING                    #
66 | ###########################################################
67 | train_batch_size: 24
68 | test_batch_size: 1
69 | padding_mode: "repetitive"  # repetitive, zero_padding
70 | wav_only: False              # Reduce to average only even for the training data.
71 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS)
72 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
73 | num_workers: 0              # Number of workers in Pytorch DataLoader.
74 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
75 | 
76 | ###########################################################
77 | #             OPTIMIZER & SCHEDULER SETTING               #
78 | ###########################################################
79 | optimizer_type: Adam
80 | optimizer_params:
81 |     lr: 2.0e-5
82 | grad_norm: 1.0              # Gradient norm.
83 | scheduler_type: stepLR
84 | scheduler_params:
85 |     step_size: 4000
86 |     gamma: 0.97
87 | 
88 | ###########################################################
89 | #                    INTERVAL SETTING                     #
90 | ###########################################################
91 | train_max_steps: 100000                 # Number of training steps.
92 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
93 | log_interval_steps: 100                 # Interval steps to record the training log.
94 | keep_nbest_models: 5                    # number of models to keep
95 | patience: 20                            # patience for early stopping
96 | best_model_criterion:                   # criterion to save the best models
97 |     key: sys_SRCC
98 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/bvcc/local/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | db="${1:?Usage: $0 <db_root>}"  # download destination; abort with a usage message when it is missing
 8 | 
 9 | # download dataset (skipped when the ${db}/main.done marker already exists)
10 | cwd="$(pwd)"
11 | if [ ! -e "${db}/main.done" ]; then
12 |     mkdir -p "${db}"
13 |     cd "${db}"
14 |     wget https://zenodo.org/records/6572573/files/main.tar.gz
15 |     tar zxvf main.tar.gz
16 |     rm main.tar.gz
17 |     cd "${cwd}"  # go back first so that a relative ${db} still resolves for the marker below
18 |     echo "Successfully finished download. Please follow the instructions."
19 |     touch "${db}/main.done"
20 | else
21 |     echo "Already exists. Skip download."
22 | fi
23 | 


--------------------------------------------------------------------------------
/egs/bvcc/path.sh:
--------------------------------------------------------------------------------
 1 | # Environment setup. Paths are resolved relative to ${PWD}, and the exports
 2 | # only reach the caller when this file is sourced.
 3 | 
 4 | # path related
 5 | export PRJ_ROOT="${PWD}/../.."
 6 | # Activate the project virtualenv only when it exists.
 7 | if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
 8 |     # shellcheck disable=SC1090
 9 |     . "${PRJ_ROOT}/tools/venv/bin/activate"
10 | fi
11 | 
12 | MAIN_ROOT=$PWD/../..
13 | export PATH=$MAIN_ROOT/sheet/bin:$PATH
14 | 
15 | # python related
16 | export OMP_NUM_THREADS=1
17 | export PYTHONIOENCODING=UTF-8
18 | # matplotlib selects its backend from MPLBACKEND; MPL_BACKEND is kept for
19 | # anything that may still read the old name.
20 | export MPL_BACKEND=Agg
21 | export MPLBACKEND=Agg
22 | 


--------------------------------------------------------------------------------
/egs/bvcc/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/nisqa/README.md:
--------------------------------------------------------------------------------
1 | # NISQA


--------------------------------------------------------------------------------
/egs/nisqa/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
20 | # These options are mapped to backend-specific options, and
21 | # the mapping is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68 |     # To know the "partition" names, type "sinfo".
69 |     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # This assumes you can log in to them without a password, i.e. you have to set up ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1  # "return" at top level is only valid when this file is sourced
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/nisqa/conf/alignnet-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | num_domains: 7
11 | 
12 | ###########################################################
13 | #              NETWORK ARCHITECTURE SETTING               #
14 | ###########################################################
15 | model_type: "AlignNet"
16 | model_params:
17 |     ssl_module: "s3prl"
18 |     s3prl_name: "wav2vec2"
19 |     ssl_model_output_dim: 768
20 |     ssl_model_layer_idx: -1
21 | 
22 |     use_domain_modeling: True
23 |     domain_emb_dim: 128
24 | 
25 |     decoder_dnn_dim: 64
26 |     output_type: "scalar"
27 |     range_clipping: True # this is needed if output_type is scalar
28 |     
29 | ###########################################################
30 | #                      LOSS SETTING                       #
31 | ###########################################################
32 | mean_score_criterions:
33 |     - criterion_type: "ScalarLoss"
34 |       criterion_weight: 1.0
35 |       criterion_params:
36 |         order: 1
37 |         tau: 0.5
38 |         masked_loss: False
39 | listener_score_criterions: null
40 | 
41 | ###########################################################
42 | #                   INFERENCE SETTING                     #
43 | ###########################################################
44 | inference_mode: mean_listener   # this is used for
45 |                                 # (1) evaluation in the training loop
46 |                                 # (2) default inference mode
47 | 
48 | ###########################################################
49 | #                  DATA LOADER SETTING                    #
50 | ###########################################################
51 | train_batch_size: 16
52 | test_batch_size: 1
53 | padding_mode: "repetitive"  # repetitive, zero_padding
54 | wav_only: True              # Reduce to the average only, even for the training data.
55 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
56 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
57 | num_workers: 0              # Number of workers in Pytorch DataLoader.
58 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
59 | 
60 | ###########################################################
61 | #             OPTIMIZER & SCHEDULER SETTING               #
62 | ###########################################################
63 | optimizer_type: SGD
64 | optimizer_params:
65 |     lr: 1.0e-3
66 |     # the following params come from
67 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
68 |     momentum: 0.9
69 | grad_norm: 1.0              # Gradient norm.
70 | scheduler_type: null
71 | 
72 | ###########################################################
73 | #                    INTERVAL SETTING                     #
74 | ###########################################################
75 | train_max_steps: 100000                 # Number of training steps.
76 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
77 | log_interval_steps: 100                 # Interval steps to record the training log.
78 | keep_nbest_models: 5                    # number of models to keep
79 | patience: 20                            # patience for early stopping
80 | best_model_criterion:                   # criterion to save the best models
81 |     key: utt_SRCC
82 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/nisqa/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "SSLMOS"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     mean_net_dnn_dim: 64
22 |     mean_net_output_type: "scalar"
23 |     mean_net_range_clipping: True
24 | 
25 |     use_listener_modeling: False
26 |     use_mean_listener: False
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_net    # this is used for
44 |                             # (1) evaluation in the training loop
45 |                             # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | padding_mode: "repetitive"  # repetitive, zero_padding
53 | wav_only: True              # Reduce to the average only, even for the training data.
54 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
55 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
56 | num_workers: 0              # Number of workers in Pytorch DataLoader.
57 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
58 | 
59 | ###########################################################
60 | #             OPTIMIZER & SCHEDULER SETTING               #
61 | ###########################################################
62 | optimizer_type: SGD
63 | optimizer_params:
64 |     lr: 1.0e-3
65 |     # the following params come from
66 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
67 |     momentum: 0.9
68 | grad_norm: 1.0              # Gradient norm.
69 | scheduler_type: null
70 | 
71 | ###########################################################
72 | #                    INTERVAL SETTING                     #
73 | ###########################################################
74 | train_max_steps: 100000                 # Number of training steps.
75 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
76 | log_interval_steps: 100                 # Interval steps to record the training log.
77 | keep_nbest_models: 5                    # number of models to keep
78 | patience: 20                            # patience for early stopping
79 | best_model_criterion:                   # criterion to save the best models
80 |     key: utt_SRCC
81 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/nisqa/local/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | db=$1
 8 | 
 9 | # download dataset
10 | cwd=`pwd`
11 | if [ ! -e ${db}/nisqa.done ]; then
12 |     mkdir -p ${db}
13 |     cd ${db}
14 |     wget https://depositonce.tu-berlin.de/bitstream/11303/13012.5/9/NISQA_Corpus.zip
15 |     unzip NISQA_Corpus.zip
16 |     rm -f NISQA_Corpus.zip
17 |     mv NISQA_Corpus/* .
18 |     rm -rf NISQA_Corpus/
19 |     cd $cwd
20 |     echo "Successfully finished download."
21 |     touch ${db}/nisqa.done
22 | else
23 |     echo "Already exists. Skip download."
24 | fi
25 | 


--------------------------------------------------------------------------------
/egs/nisqa/path.sh:
--------------------------------------------------------------------------------
 1 | # Environment setup. Paths are resolved relative to ${PWD}, and the exports
 2 | # only reach the caller when this file is sourced.
 3 | 
 4 | # path related
 5 | export PRJ_ROOT="${PWD}/../.."
 6 | # Activate the project virtualenv only when it exists.
 7 | if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
 8 |     # shellcheck disable=SC1090
 9 |     . "${PRJ_ROOT}/tools/venv/bin/activate"
10 | fi
11 | 
12 | MAIN_ROOT=$PWD/../..
13 | export PATH=$MAIN_ROOT/sheet/bin:$PATH
14 | 
15 | # python related
16 | export OMP_NUM_THREADS=1
17 | export PYTHONIOENCODING=UTF-8
18 | # matplotlib selects its backend from MPLBACKEND; MPL_BACKEND is kept for
19 | # anything that may still read the old name.
20 | export MPL_BACKEND=Agg
21 | export MPLBACKEND=Agg
22 | 


--------------------------------------------------------------------------------
/egs/nisqa/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/pstn/README.md:
--------------------------------------------------------------------------------
1 | # PSTN


--------------------------------------------------------------------------------
/egs/pstn/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
20 | # These options are mapped to backend-specific options, and
21 | # the mapping is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68 |     # To know the "partition" names, type "sinfo".
69 |     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # This assumes you can log in to them without a password, i.e. you have to set up ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1  # "return" at top level is only valid when this file is sourced
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/pstn/conf/ldnet-ml.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: mag_sgram              # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "LDNet"
15 | model_params:
16 |     listener_emb_dim: 128
17 | 
18 |     activation: "ReLU"
19 |     encoder_type: "mobilenetv3"
20 |     encoder_bneck_configs:
21 |         - [16, 3, 16, 16, True, "RE", 3, 1]
22 |         - [16, 3, 72, 24, False, "RE", 3, 1]
23 |         - [24, 3, 88, 24, False, "RE", 1, 1]
24 |         - [24, 5, 96, 40, True, "HS", 3, 1]
25 |         - [40, 5, 240, 40, True, "HS", 1, 1]
26 |         - [40, 5, 240, 40, True, "HS", 1, 1]
27 |         - [40, 5, 120, 48, True, "HS", 1, 1]
28 |         - [48, 5, 144, 48, True, "HS", 1, 1]
29 |         - [48, 5, 288, 96, True, "HS", 3, 1] 
30 |         - [96, 5, 576, 96, True, "HS", 1, 1]
31 |         - [96, 5, 576, 96, True, "HS", 1, 1]
32 |     encoder_output_dim: 256
33 | 
34 |     decoder_type: "ffn"
35 |     decoder_dnn_dim: 64
36 |     output_type: "scalar"
37 |     range_clipping: True # this is needed if output_type is scalar
38 | 
39 |     use_mean_net: False
40 |     use_mean_listener: True
41 | 
42 | ###########################################################
43 | #                      LOSS SETTING                       #
44 | ###########################################################
45 | mean_score_criterion_type: null
46 | listener_score_criterion_type: "ScalarLoss"
47 | listener_score_criterion_params:
48 |     tau: 0.5
49 |     masked_loss: False
50 | listener_score_criterion_weight: 1.0
51 | 
52 | ###########################################################
53 | #                   INFERENCE SETTING                     #
54 | ###########################################################
55 | inference_mode: mean_listener   # this is used for
56 |                                 # (1) evaluation in the training loop
57 |                                 # (2) default inference mode
58 | 
59 | ###########################################################
60 | #                  DATA LOADER SETTING                    #
61 | ###########################################################
62 | train_batch_size: 60
63 | test_batch_size: 1
64 | padding_mode: "repetitive"  # repetitive, zero_padding
65 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
66 | num_workers: 0              # Number of workers in Pytorch DataLoader.
67 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
68 | 
69 | ###########################################################
70 | #             OPTIMIZER & SCHEDULER SETTING               #
71 | ###########################################################
72 | optimizer_type: RMSprop
73 | optimizer_params:
74 |     lr: 1.0e-3
75 |     # the following params come from
76 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
77 |     eps: 0.0316
78 |     alpha: 0.9
79 | grad_norm: 1.0              # Gradient norm.
80 | scheduler_type: stepLR
81 | scheduler_params:
82 |     step_size: 1000
83 |     gamma: 0.97
84 | 
85 | ###########################################################
86 | #                    INTERVAL SETTING                     #
87 | ###########################################################
88 | train_max_steps: 100000                 # Number of training steps.
89 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
90 | log_interval_steps: 100                 # Interval steps to record the training log.
91 | keep_nbest_models: 5                    # number of models to keep
92 | patience: 20                            # patience for early stopping
93 | best_model_criterion:                   # criterion to save the best models
94 |     key: sys_SRCC
95 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/pstn/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "SSLMOS"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     mean_net_dnn_dim: 64
22 |     mean_net_output_type: "scalar"
23 |     mean_net_range_clipping: True
24 | 
25 |     use_listener_modeling: False
26 |     use_mean_listener: False
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_net    # this is used for
44 |                             # (1) evaluation in the training loop
45 |                             # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | padding_mode: "repetitive"  # repetitive, zero_padding
53 | wav_only: True              # Reduce to the average only, even for the training data.
54 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
55 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
56 | num_workers: 0              # Number of workers in Pytorch DataLoader.
57 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
58 | 
59 | ###########################################################
60 | #             OPTIMIZER & SCHEDULER SETTING               #
61 | ###########################################################
62 | optimizer_type: SGD
63 | optimizer_params:
64 |     lr: 1.0e-3
65 |     # the following params come from
66 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
67 |     momentum: 0.9
68 | grad_norm: 1.0              # Gradient norm.
69 | scheduler_type: null
70 | 
71 | ###########################################################
72 | #                    INTERVAL SETTING                     #
73 | ###########################################################
74 | train_max_steps: 100000                 # Number of training steps.
75 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
76 | log_interval_steps: 100                 # Interval steps to record the training log.
77 | keep_nbest_models: 5                    # number of models to keep
78 | patience: 20                            # patience for early stopping
79 | best_model_criterion:                   # criterion to save the best models
80 |     key: utt_SRCC
81 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/pstn/local/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | db=$1
 8 | 
 9 | # download dataset
10 | cwd=`pwd`
11 | if [ ! -e ${db}/pstn.done ]; then
12 |     mkdir -p ${db}
13 |     cd ${db}
14 |     wget https://challenge.blob.core.windows.net/pstn/train.zip
15 |     unzip train.zip
16 |     rm train.zip
17 |     cd $cwd
18 |     echo "Successfully finished download. Please follow the instructions."
19 |     touch ${db}/pstn.done
20 | else
21 |     echo "Already exists. Skip download."
22 | fi
23 | 


--------------------------------------------------------------------------------
/egs/pstn/path.sh:
--------------------------------------------------------------------------------
 1 | # Environment setup. Paths are resolved relative to ${PWD}, and the exports
 2 | # only reach the caller when this file is sourced.
 3 | 
 4 | # path related
 5 | export PRJ_ROOT="${PWD}/../.."
 6 | # Activate the project virtualenv only when it exists.
 7 | if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
 8 |     # shellcheck disable=SC1090
 9 |     . "${PRJ_ROOT}/tools/venv/bin/activate"
10 | fi
11 | 
12 | MAIN_ROOT=$PWD/../..
13 | export PATH=$MAIN_ROOT/sheet/bin:$PATH
14 | 
15 | # python related
16 | export OMP_NUM_THREADS=1
17 | export PYTHONIOENCODING=UTF-8
18 | # matplotlib selects its backend from MPLBACKEND; MPL_BACKEND is kept for
19 | # anything that may still read the old name.
20 | export MPL_BACKEND=Agg
21 | export MPLBACKEND=Agg
22 | 


--------------------------------------------------------------------------------
/egs/pstn/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/singmos/README.md:
--------------------------------------------------------------------------------
1 | # SingMOS


--------------------------------------------------------------------------------
/egs/singmos/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
20 | # These options are mapped to backend-specific options, and
21 | # the mapping is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68 |     # To know the "partition" names, type "sinfo".
69 |     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # Assuming you can login them without any password, i.e. You have to set ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/singmos/conf/ldnet-ml.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: mag_sgram              # choices: wav, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "LDNet"
15 | model_params:
16 |     listener_emb_dim: 128
17 | 
18 |     activation: "ReLU"
19 |     encoder_type: "mobilenetv3"
20 |     encoder_bneck_configs:
21 |         - [16, 3, 16, 16, True, "RE", 3, 1]
22 |         - [16, 3, 72, 24, False, "RE", 3, 1]
23 |         - [24, 3, 88, 24, False, "RE", 1, 1]
24 |         - [24, 5, 96, 40, True, "HS", 3, 1]
25 |         - [40, 5, 240, 40, True, "HS", 1, 1]
26 |         - [40, 5, 240, 40, True, "HS", 1, 1]
27 |         - [40, 5, 120, 48, True, "HS", 1, 1]
28 |         - [48, 5, 144, 48, True, "HS", 1, 1]
29 |         - [48, 5, 288, 96, True, "HS", 3, 1] 
30 |         - [96, 5, 576, 96, True, "HS", 1, 1]
31 |         - [96, 5, 576, 96, True, "HS", 1, 1]
32 |     encoder_output_dim: 256
33 | 
34 |     decoder_type: "ffn"
35 |     decoder_dnn_dim: 64
36 |     output_type: "scalar"
37 |     range_clipping: True # this is needed if output_type is scalar
38 | 
39 |     use_mean_net: False
40 |     use_mean_listener: True
41 | 
42 | ###########################################################
43 | #                      LOSS SETTING                       #
44 | ###########################################################
45 | mean_score_criterion_type: null
46 | listener_score_criterion_type: "ScalarLoss"
47 | listener_score_criterion_params:
48 |     tau: 0.5
49 |     masked_loss: False
50 | listener_score_criterion_weight: 1.0
51 | 
52 | ###########################################################
53 | #                   INFERENCE SETTING                     #
54 | ###########################################################
55 | inference_mode: mean_listener   # this is used for
56 |                                 # (1) evaluation in the training loop
57 |                                 # (2) default inference mode
58 | 
59 | ###########################################################
60 | #                  DATA LOADER SETTING                    #
61 | ###########################################################
62 | train_batch_size: 60
63 | test_batch_size: 1
64 | padding_mode: "repetitive"  # repetitive, zero_padding
65 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
66 | num_workers: 0              # Number of workers in Pytorch DataLoader.
67 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
68 | 
69 | ###########################################################
70 | #             OPTIMIZER & SCHEDULER SETTING               #
71 | ###########################################################
72 | optimizer_type: RMSprop
73 | optimizer_params:
74 |     lr: 1.0e-3
75 |     # the following params come from
76 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
77 |     eps: 0.0316
78 |     alpha: 0.9
79 | grad_norm: 1.0              # Gradient norm.
80 | scheduler_type: stepLR
81 | scheduler_params:
82 |     step_size: 1000
83 |     gamma: 0.97
84 | 
85 | ###########################################################
86 | #                    INTERVAL SETTING                     #
87 | ###########################################################
88 | train_max_steps: 100000                 # Number of training steps.
89 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
90 | log_interval_steps: 100                 # Interval steps to record the training log.
91 | keep_nbest_models: 5                    # number of models to keep
92 | patience: 20                            # patience for early stopping
93 | best_model_criterion:                   # criterion to save the best models
94 |     key: sys_SRCC
95 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/singmos/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "SSLMOS"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     mean_net_dnn_dim: 64
22 |     mean_net_output_type: "scalar"
23 |     mean_net_range_clipping: True
24 | 
25 |     use_listener_modeling: False
26 |     use_mean_listener: False
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_net    # this is used for
44 |                             # (1) evaluation in the training loop
45 |                             # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | padding_mode: "repetitive"  # repetitive, zero_padding
53 | wav_only: True              # Reduce to average only even for the training data.
54 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
55 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
56 | num_workers: 0              # Number of workers in Pytorch DataLoader.
57 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
58 | 
59 | ###########################################################
60 | #             OPTIMIZER & SCHEDULER SETTING               #
61 | ###########################################################
62 | optimizer_type: SGD
63 | optimizer_params:
64 |     lr: 1.0e-3
65 |     # the following params come from
66 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
67 |     momentum: 0.9
68 | grad_norm: 1.0              # Gradient norm.
69 | scheduler_type: null
70 | 
71 | ###########################################################
72 | #                    INTERVAL SETTING                     #
73 | ###########################################################
74 | train_max_steps: 100000                 # Number of training steps.
75 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
76 | log_interval_steps: 100                 # Interval steps to record the training log.
77 | keep_nbest_models: 5                    # number of models to keep
78 | patience: 20                            # patience for early stopping
79 | best_model_criterion:                   # criterion to save the best models
80 |     key: sys_SRCC
81 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/singmos/local/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | db=$1
 8 | 
 9 | # download dataset
10 | cwd=`pwd`
11 | if [ ! -e ${db}/singmos.done ]; then
12 |     mkdir -p ${db}
13 |     cd ${db}
14 |     gdown 1DtzZhk3M_jsxUxirPcFRoBhq-dsinOWN
15 |     gdown 1sO4xPUMJvGAjC8lmO6uXCwgz7s7Ruhpv
16 |     unzip voicemos2024-track2-train-phase.zip
17 |     unzip voicemos2024-track2-eval-phase.zip
18 |     rm voicemos2024-track2-train-phase.zip
19 |     rm voicemos2024-track2-eval-phase.zip
20 |     cd $cwd
21 |     echo "Successfully finished download."
22 |     touch ${db}/singmos.done
23 | else
24 |     echo "Already exists. Skip download."
25 | fi
26 | 


--------------------------------------------------------------------------------
/egs/singmos/local/data_prep.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2024 Wen-Chin Huang
  5 | #  MIT License (https://opensource.org/licenses/MIT)
  6 | 
  7 | """Data preparation for SingMOS."""
  8 | 
  9 | import argparse
 10 | from collections import defaultdict
 11 | import csv
 12 | import logging
 13 | import os
 14 | import sys
 15 | 
 16 | import numpy as np
 17 | 
 18 | # The following function(s) is(are) the same as in sheet.utils.utils
 19 | # copied here for installation-free data preparation
 20 | def read_csv(path, dict_reader=False, lazy=False):
 21 |     with open(path, newline="") as csvfile:
 22 |         if dict_reader:
 23 |             reader = csv.DictReader(csvfile)
 24 |             fieldnames = reader.fieldnames
 25 |         else:
 26 |             reader = csv.reader(csvfile)
 27 |             fieldnames = None
 28 | 
 29 |         if lazy:
 30 |             contents = reader
 31 |         else:
 32 |             contents = [line for line in reader]
 33 | 
 34 |     return contents, fieldnames
 35 | 
 36 | 
 37 | def main():
 38 |     """Run data preprocessing."""
 39 |     parser = argparse.ArgumentParser()
 40 |     parser.add_argument(
 41 |         "--original-path",
 42 |         required=True,
 43 |         type=str,
 44 |         help=("original csv file path."),
 45 |     )
 46 |     parser.add_argument(
 47 |         "--wavdir",
 48 |         required=True,
 49 |         type=str,
 50 |         help=(
 51 |             "directory of the waveform files. This is needed because wav paths in BVCC metadata files do not contain the wav directory."
 52 |         ),
 53 |     )
 54 |     parser.add_argument(
 55 |         "--out",
 56 |         required=True,
 57 |         type=str,
 58 |         help=("output csv file path."),
 59 |     )
 60 |     parser.add_argument(
 61 |         "--domain-idx",
 62 |         type=int,
 63 |         default=None,
 64 |         help=("domain ID.")
 65 |     )
 66 |     parser.add_argument(
 67 |         "--avg-score-only",
 68 |         action="store_true",
 69 |         help=("generate average score only. set for test set preparation.")
 70 |     )
 71 |     args = parser.parse_args()
 72 | 
 73 |     # set logger
 74 |     logging.basicConfig(
 75 |         level=logging.INFO,
 76 |         stream=sys.stdout,
 77 |         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
 78 |     )
 79 | 
 80 |     # read csv
 81 |     logging.info("Reading original csv file.")
 82 |     filelist, _ = read_csv(args.original_path)
 83 | 
 84 |     # prepare. each line looks like this:
 85 |     # voicemos2024-track2-sys0001-utt0001,4.000000
 86 |     logging.info("Preparing metadata.")
 87 |     metadata = []
 88 |     listener_idxs, count = {}, 0
 89 |     for line in filelist:
 90 |         if len(line) == 0:
 91 |             continue
 92 |         sample_id = line[0]
 93 |         score = float(line[1])
 94 |         system_id = sample_id.split("-")[2]
 95 |         wav_path = os.path.join(args.wavdir, sample_id + ".wav")
 96 |         item = {
 97 |             "wav_path": wav_path,
 98 |             "score": score,
 99 |             "system_id": system_id,
100 |             "sample_id": sample_id,
101 |         }
102 |         # append domain ID if given
103 |         if args.domain_idx is not None:
104 |             item["domain_idx"] = args.domain_idx
105 |         metadata.append(item)
106 | 
107 |     # average score
108 |     if args.avg_score_only:
109 |         # take average score
110 |         sample_scores = defaultdict(list)
111 |         for item in metadata: # loop through metadata
112 |             sample_scores[item["sample_id"]].append(float(item["score"]))
113 |         sample_avg_score = {
114 |             sample_id: np.mean(np.array(scores))
115 |             for sample_id, scores in sample_scores.items()
116 |         } # take average
117 |         for i, item in enumerate(metadata): # fill back into metadata
118 |             metadata[i]["avg_score"] = sample_avg_score[item["sample_id"]]
119 |         
120 |         new_metadata = {}  # {sample_id: item}
121 |         for item in metadata:
122 |             sample_id = item["sample_id"]
123 |             if not sample_id in new_metadata:
124 |                 new_metadata[sample_id] = {
125 |                     k: v
126 |                     for k, v in item.items()
127 |                     if k not in ["listener_id", "listener_idx", "score"]
128 |                 }
129 | 
130 |         metadata = list(new_metadata.values())
131 | 
132 |     # write csv
133 |     logging.info("Writing output csv file.")
134 |     fieldnames = ["wav_path", "system_id", "sample_id"]
135 |     if args.avg_score_only:
136 |         fieldnames.append("avg_score")
137 |     else:
138 |         fieldnames.append("score")
139 |     if args.domain_idx is not None:
140 |         fieldnames.append("domain_idx")
141 |     with open(args.out, "w", newline="") as csvfile:
142 |         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
143 |         writer.writeheader()
144 |         for line in metadata:
145 |             writer.writerow(line)
146 | 
147 | 
148 | if __name__ == "__main__":
149 |     main()
150 | 


--------------------------------------------------------------------------------
/egs/singmos/path.sh:
--------------------------------------------------------------------------------
 1 | # path related
 2 | export PRJ_ROOT="${PWD}/../.."
 3 | if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
 4 |     # shellcheck disable=SC1090
 5 |     . "${PRJ_ROOT}/tools/venv/bin/activate"
 6 | fi
 7 | 
 8 | MAIN_ROOT=$PWD/../..
 9 | export PATH=$MAIN_ROOT/sheet/bin:$PATH
10 | 
11 | # python related
12 | export OMP_NUM_THREADS=1
13 | export PYTHONIOENCODING=UTF-8
14 | export MPL_BACKEND=Agg
15 | 


--------------------------------------------------------------------------------
/egs/singmos/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/somos/README.md:
--------------------------------------------------------------------------------
1 | # SOMOS


--------------------------------------------------------------------------------
/egs/somos/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20 | # These options are mapping to specific options for each backend and
21 | # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68 |     # To know the "partition" names, type "sinfo".
69 |     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # Assuming you can login them without any password, i.e. You have to set ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/somos/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "SSLMOS"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     mean_net_dnn_dim: 64
22 |     mean_net_output_type: "scalar"
23 |     mean_net_range_clipping: True
24 | 
25 |     use_listener_modeling: False
26 |     use_mean_listener: False
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_net    # this is used for
44 |                             # (1) evaluation in the training loop
45 |                             # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | padding_mode: "repetitive"  # repetitive, zero_padding
53 | wav_only: True              # Reduce to average only even for the training data.
54 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
55 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
56 | num_workers: 0              # Number of workers in Pytorch DataLoader.
57 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
58 | 
59 | ###########################################################
60 | #             OPTIMIZER & SCHEDULER SETTING               #
61 | ###########################################################
62 | optimizer_type: SGD
63 | optimizer_params:
64 |     lr: 1.0e-3
65 |     # the following params come from
66 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
67 |     momentum: 0.9
68 | grad_norm: 1.0              # Gradient norm.
69 | scheduler_type: null
70 | 
71 | ###########################################################
72 | #                    INTERVAL SETTING                     #
73 | ###########################################################
74 | train_max_steps: 100000                 # Number of training steps.
75 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
76 | log_interval_steps: 100                 # Interval steps to record the training log.
77 | keep_nbest_models: 5                    # number of models to keep
78 | patience: 20                            # patience for early stopping
79 | best_model_criterion:                   # criterion to save the best models
80 |     key: sys_SRCC
81 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/somos/local/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | db=$1
 8 | 
 9 | # download dataset
10 | cwd=`pwd`
11 | if [ ! -e ${db}/somos.done ]; then
12 |     mkdir -p ${db}
13 |     cd ${db}
14 |     wget https://zenodo.org/records/7378801/files/somos.zip
15 |     unzip somos.zip
16 |     unzip audios.zip
17 |     rm somos.zip
18 |     rm audios.zip
19 |     cd $cwd
20 |     echo "Successfully finished download."
21 |     touch ${db}/somos.done
22 | else
23 |     echo "Already exists. Skip download."
24 | fi
25 | 


--------------------------------------------------------------------------------
/egs/somos/path.sh:
--------------------------------------------------------------------------------
 1 | # path related
 2 | export PRJ_ROOT="${PWD}/../.."
 3 | if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
 4 |     # shellcheck disable=SC1090
 5 |     . "${PRJ_ROOT}/tools/venv/bin/activate"
 6 | fi
 7 | 
 8 | MAIN_ROOT=$PWD/../..
 9 | export PATH=$MAIN_ROOT/sheet/bin:$PATH
10 | 
11 | # python related
12 | export OMP_NUM_THREADS=1
13 | export PYTHONIOENCODING=UTF-8
14 | export MPL_BACKEND=Agg
15 | 


--------------------------------------------------------------------------------
/egs/somos/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/tencent/README.md:
--------------------------------------------------------------------------------
1 | # Tencent


--------------------------------------------------------------------------------
/egs/tencent/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20 | # These options are mapping to specific options for each backend and
21 | # it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68 |     # To know the "partition" names, type "sinfo".
69 |     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # This assumes you can log in to them without a password, i.e., you have to set up ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/tencent/conf/ldnet-ml.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: mag_sgram              # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "LDNet"
15 | model_params:
16 |     listener_emb_dim: 128
17 | 
18 |     activation: "ReLU"
19 |     encoder_type: "mobilenetv3"
20 |     encoder_bneck_configs:
21 |         - [16, 3, 16, 16, True, "RE", 3, 1]
22 |         - [16, 3, 72, 24, False, "RE", 3, 1]
23 |         - [24, 3, 88, 24, False, "RE", 1, 1]
24 |         - [24, 5, 96, 40, True, "HS", 3, 1]
25 |         - [40, 5, 240, 40, True, "HS", 1, 1]
26 |         - [40, 5, 240, 40, True, "HS", 1, 1]
27 |         - [40, 5, 120, 48, True, "HS", 1, 1]
28 |         - [48, 5, 144, 48, True, "HS", 1, 1]
29 |         - [48, 5, 288, 96, True, "HS", 3, 1] 
30 |         - [96, 5, 576, 96, True, "HS", 1, 1]
31 |         - [96, 5, 576, 96, True, "HS", 1, 1]
32 |     encoder_output_dim: 256
33 | 
34 |     decoder_type: "ffn"
35 |     decoder_dnn_dim: 64
36 |     output_type: "scalar"
37 |     range_clipping: True # this is needed if output_type is scalar
38 | 
39 |     use_mean_net: False
40 |     use_mean_listener: True
41 | 
42 | ###########################################################
43 | #                      LOSS SETTING                       #
44 | ###########################################################
45 | mean_score_criterion_type: null
46 | listener_score_criterion_type: "ScalarLoss"
47 | listener_score_criterion_params:
48 |     tau: 0.5
49 |     masked_loss: False
50 | listener_score_criterion_weight: 1.0
51 | 
52 | ###########################################################
53 | #                   INFERENCE SETTING                     #
54 | ###########################################################
55 | inference_mode: mean_listener   # this is used for
56 |                                 # (1) evaluation in the training loop
57 |                                 # (2) default inference mode
58 | 
59 | ###########################################################
60 | #                  DATA LOADER SETTING                    #
61 | ###########################################################
62 | train_batch_size: 60
63 | test_batch_size: 1
64 | padding_mode: "repetitive"  # repetitive, zero_padding
65 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
66 | num_workers: 0              # Number of workers in Pytorch DataLoader.
67 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
68 | 
69 | ###########################################################
70 | #             OPTIMIZER & SCHEDULER SETTING               #
71 | ###########################################################
72 | optimizer_type: RMSprop
73 | optimizer_params:
74 |     lr: 1.0e-3
75 |     # the following params come from
76 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
77 |     eps: 0.0316
78 |     alpha: 0.9
79 | grad_norm: 1.0              # Gradient norm.
80 | scheduler_type: stepLR
81 | scheduler_params:
82 |     step_size: 1000
83 |     gamma: 0.97
84 | 
85 | ###########################################################
86 | #                    INTERVAL SETTING                     #
87 | ###########################################################
88 | train_max_steps: 100000                 # Number of training steps.
89 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
90 | log_interval_steps: 100                 # Interval steps to record the training log.
91 | keep_nbest_models: 5                    # number of models to keep
92 | patience: 20                            # patience for early stopping
93 | best_model_criterion:                   # criterion to save the best models
94 |     key: sys_SRCC
95 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/tencent/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "SSLMOS"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     mean_net_dnn_dim: 64
22 |     mean_net_output_type: "scalar"
23 |     mean_net_range_clipping: True
24 | 
25 |     use_listener_modeling: False
26 |     use_mean_listener: False
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_net    # this is used for
44 |                             # (1) evaluation in the training loop
45 |                             # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | padding_mode: "repetitive"  # repetitive, zero_padding
53 | wav_only: True              # Reduce the data to averaged scores only, even for the training data.
54 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
55 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
56 | num_workers: 0              # Number of workers in Pytorch DataLoader.
57 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
58 | 
59 | ###########################################################
60 | #             OPTIMIZER & SCHEDULER SETTING               #
61 | ###########################################################
62 | optimizer_type: SGD
63 | optimizer_params:
64 |     lr: 1.0e-3
65 |     # the following params come from
66 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
67 |     momentum: 0.9
68 | grad_norm: 1.0              # Gradient norm.
69 | scheduler_type: null
70 | 
71 | ###########################################################
72 | #                    INTERVAL SETTING                     #
73 | ###########################################################
74 | train_max_steps: 100000                 # Number of training steps.
75 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
76 | log_interval_steps: 100                 # Interval steps to record the training log.
77 | keep_nbest_models: 5                    # number of models to keep
78 | patience: 20                            # patience for early stopping
79 | best_model_criterion:                   # criterion to save the best models
80 |     key: utt_SRCC
81 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/tencent/local/data_prep.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2024 Wen-Chin Huang
  5 | #  MIT License (https://opensource.org/licenses/MIT)
  6 | 
  7 | """Data preparation for the Tencent corpus."""
  8 | 
  9 | import argparse
 10 | import csv
 11 | import logging
 12 | import os
 13 | import random
 14 | import sys
 15 | 
 16 | from sheet.utils import read_csv
 17 | 
 18 | 
 19 | def main():
 20 |     """Run training process."""
 21 |     parser = argparse.ArgumentParser()
 22 |     parser.add_argument(
 23 |         "--original-path",
 24 |         required=True,
 25 |         nargs="+",
 26 |         help=("original csv file paths. For the Tencent corpus we take two."),
 27 |     )
 28 |     parser.add_argument(
 29 |         "--wavdir",
 30 |         required=True,
 31 |         type=str,
 32 |         help=(
 33 |             "directory of the waveform files. This is needed because wav paths in BVCC metadata files do not contain the wav directory."
 34 |         ),
 35 |     )
 36 |     parser.add_argument(
 37 |         "--out",
 38 |         required=True,
 39 |         type=str,
 40 |         help=("output csv file path."),
 41 |     )
 42 |     parser.add_argument(
 43 |         "--setname",
 44 |         required=True,
 45 |         type=str,
 46 |         choices=["train", "dev", "test"],
 47 |         help=(
 48 |             "setname. Since there is no dev set, we need to randomly sample dev set on our own."
 49 |         ),
 50 |     )
 51 |     parser.add_argument(
 52 |         "--dev_ratio",
 53 |         default=0.1,
 54 |         type=float,
 55 |         help=("The ratio of the dev set. Default: 0.1"),
 56 |     )
 57 |     parser.add_argument(
 58 |         "--seed",
 59 |         default=1337,
 60 |         type=int,
 61 |         help=("Random seed. This is used to get consistent random sampling results."),
 62 |     )
 63 |     args = parser.parse_args()
 64 | 
 65 |     # set logger
 66 |     logging.basicConfig(
 67 |         level=logging.INFO,
 68 |         stream=sys.stdout,
 69 |         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
 70 |     )
 71 | 
 72 |     # read csv
 73 |     metadata = []
 74 |     for original_path in args.original_path:
 75 |         logging.info(f"Reading original csv file {original_path}")
 76 |         filelist, _ = read_csv(original_path, dict_reader=True)
 77 | 
 78 |         # prepare. each line looks like this:
 79 |         # deg_wav,mos
 80 |         current_metadata = []
 81 |         for line in filelist:
 82 |             if len(line) == 0:
 83 |                 continue
 84 |             wav_path = line["deg_wav"].replace("./", "")
 85 |             score = float(line["mos"])
 86 |             sample_id = wav_path.replace(".wav", "").replace(os.sep, "_")
 87 |             system_id = sample_id  # no system ID information
 88 |             item = {
 89 |                 "wav_path": os.path.join(args.wavdir, wav_path),
 90 |                 "score": score,
 91 |                 "system_id": system_id,
 92 |                 "sample_id": sample_id,
 93 |             }
 94 |             current_metadata.append(item)
 95 | 
 96 |         # shuffle and split
 97 |         random.shuffle(current_metadata)
 98 |         dev_num = int(len(current_metadata) * args.dev_ratio)
 99 |         if args.setname == "train":
100 |             current_metadata = current_metadata[dev_num:]
101 |         elif args.setname == "dev":
102 |             current_metadata = current_metadata[:dev_num]
103 | 
104 |         metadata.extend(current_metadata)
105 |     metadata.sort(key=lambda x: x["wav_path"])
106 | 
107 |     # write csv
108 |     logging.info("Writing output csv file.")
109 |     fieldnames = ["wav_path", "score", "system_id", "sample_id"]
110 |     with open(args.out, "w", newline="") as csvfile:
111 |         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
112 |         writer.writeheader()
113 |         for line in metadata:
114 |             writer.writerow(line)
115 | 
116 | 
117 | if __name__ == "__main__":
118 |     main()
119 | 


--------------------------------------------------------------------------------
/egs/tencent/path.sh:
--------------------------------------------------------------------------------
 1 | # path related
 2 | export PRJ_ROOT="${PWD}/../.."
 3 | if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
 4 |     # shellcheck disable=SC1090
 5 |     . "${PRJ_ROOT}/tools/venv/bin/activate"
 6 | fi
 7 | 
 8 | MAIN_ROOT=$PWD/../..
 9 | export PATH=$MAIN_ROOT/sheet/bin:$PATH
10 | 
11 | # python related
12 | export OMP_NUM_THREADS=1
13 | export PYTHONIOENCODING=UTF-8
14 | export MPL_BACKEND=Agg
15 | 


--------------------------------------------------------------------------------
/egs/tencent/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/tmhint-qi/README.md:
--------------------------------------------------------------------------------
1 | # TMHINT-QI


--------------------------------------------------------------------------------
/egs/tmhint-qi/cmd.sh:
--------------------------------------------------------------------------------
 1 | # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
 2 | # Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
 3 | # e.g.
 4 | #   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
 5 | #
 6 | # Options:
 7 | #   --time <time>: Limit the maximum time to execute.
 8 | #   --mem <mem>: Limit the maximum memory usage.
 9 | #   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
10 | #   --num-threads <nthreads>: Specify the number of CPU cores.
11 | #   --gpu <ngpu>: Specify the number of GPU devices.
12 | #   --config: Change the configuration file from default.
13 | #
14 | # "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
15 | # The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
16 | # e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
17 | # Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
18 | #
19 | # run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
20 | # These options are mapped to backend-specific options, and the mapping
21 | # is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
22 | # If jobs failed, your configuration might be wrong for your environment.
23 | #
24 | #
25 | # The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
26 | #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
27 | # =========================================================~
28 | 
29 | 
30 | # Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
31 | cmd_backend="local"
32 | 
33 | # Local machine, without any Job scheduling system
34 | if [ "${cmd_backend}" = local ]; then
35 | 
36 |     # The other usage
37 |     export train_cmd="utils/run.pl"
38 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
39 |     export cuda_cmd="utils/run.pl"
40 |     # Used for "*_recog.py"
41 |     export decode_cmd="utils/run.pl"
42 | 
43 | # Local machine, without any Job scheduling system
44 | elif [ "${cmd_backend}" = stdout ]; then
45 | 
46 |     # The other usage
47 |     export train_cmd="utils/stdout.pl"
48 |     # Used for "*_train.py": "--gpu" is appended optionally by run.sh
49 |     export cuda_cmd="utils/stdout.pl"
50 |     # Used for "*_recog.py"
51 |     export decode_cmd="utils/stdout.pl"
52 | 
53 | # "qsub" (SGE, Torque, PBS, etc.)
54 | elif [ "${cmd_backend}" = sge ]; then
55 |     # The default setting is written in conf/queue.conf.
56 |     # You must change "-q g.q" for the "queue" for your environment.
57 |     # To know the "queue" names, type "qhost -q"
58 |     # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
59 | 
60 |     export train_cmd="utils/queue.pl"
61 |     export cuda_cmd="utils/queue.pl"
62 |     export decode_cmd="utils/queue.pl"
63 | 
64 | # "sbatch" (Slurm)
65 | elif [ "${cmd_backend}" = slurm ]; then
66 |     # The default setting is written in conf/slurm.conf.
67 |     # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
68 |     # To know the "partition" names, type "sinfo".
69 |     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
70 |     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
71 | 
72 |     export train_cmd="utils/slurm.pl"
73 |     export cuda_cmd="utils/slurm.pl"
74 |     export decode_cmd="utils/slurm.pl"
75 | 
76 | elif [ "${cmd_backend}" = ssh ]; then
77 |     # You have to create ".queue/machines" to specify the host to execute jobs.
78 |     # e.g. .queue/machines
79 |     #   host1
80 |     #   host2
81 |     #   host3
82 |     # This assumes you can log in to them without a password, i.e., you have to set up ssh keys.
83 | 
84 |     export train_cmd="utils/ssh.pl"
85 |     export cuda_cmd="utils/ssh.pl"
86 |     export decode_cmd="utils/ssh.pl"
87 | 
88 | else
89 |     echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
90 |     return 1
91 | fi
92 | 


--------------------------------------------------------------------------------
/egs/tmhint-qi/conf/ldnet-ml.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: mag_sgram              # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "LDNet"
15 | model_params:
16 |     listener_emb_dim: 128
17 | 
18 |     activation: "ReLU"
19 |     encoder_type: "mobilenetv3"
20 |     encoder_bneck_configs:
21 |         - [16, 3, 16, 16, True, "RE", 3, 1]
22 |         - [16, 3, 72, 24, False, "RE", 3, 1]
23 |         - [24, 3, 88, 24, False, "RE", 1, 1]
24 |         - [24, 5, 96, 40, True, "HS", 3, 1]
25 |         - [40, 5, 240, 40, True, "HS", 1, 1]
26 |         - [40, 5, 240, 40, True, "HS", 1, 1]
27 |         - [40, 5, 120, 48, True, "HS", 1, 1]
28 |         - [48, 5, 144, 48, True, "HS", 1, 1]
29 |         - [48, 5, 288, 96, True, "HS", 3, 1] 
30 |         - [96, 5, 576, 96, True, "HS", 1, 1]
31 |         - [96, 5, 576, 96, True, "HS", 1, 1]
32 |     encoder_output_dim: 256
33 | 
34 |     decoder_type: "ffn"
35 |     decoder_dnn_dim: 64
36 |     output_type: "scalar"
37 |     range_clipping: True # this is needed if output_type is scalar
38 | 
39 |     use_mean_net: False
40 |     use_mean_listener: True
41 | 
42 | ###########################################################
43 | #                      LOSS SETTING                       #
44 | ###########################################################
45 | mean_score_criterion_type: null
46 | listener_score_criterion_type: "ScalarLoss"
47 | listener_score_criterion_params:
48 |     tau: 0.5
49 |     masked_loss: False
50 | listener_score_criterion_weight: 1.0
51 | 
52 | ###########################################################
53 | #                   INFERENCE SETTING                     #
54 | ###########################################################
55 | inference_mode: mean_listener   # this is used for
56 |                                 # (1) evaluation in the training loop
57 |                                 # (2) default inference mode
58 | 
59 | ###########################################################
60 | #                  DATA LOADER SETTING                    #
61 | ###########################################################
62 | train_batch_size: 60
63 | test_batch_size: 1
64 | padding_mode: "repetitive"  # repetitive, zero_padding
65 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
66 | num_workers: 0              # Number of workers in Pytorch DataLoader.
67 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
68 | 
69 | ###########################################################
70 | #             OPTIMIZER & SCHEDULER SETTING               #
71 | ###########################################################
72 | optimizer_type: RMSprop
73 | optimizer_params:
74 |     lr: 1.0e-3
75 |     # the following params come from
76 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
77 |     eps: 0.0316
78 |     alpha: 0.9
79 | grad_norm: 1.0              # Gradient norm.
80 | scheduler_type: stepLR
81 | scheduler_params:
82 |     step_size: 1000
83 |     gamma: 0.97
84 | 
85 | ###########################################################
86 | #                    INTERVAL SETTING                     #
87 | ###########################################################
88 | train_max_steps: 100000                 # Number of training steps.
89 | eval_and_save_interval_steps: 1000      # Interval steps to do evaluation and save checkpoint.
90 | log_interval_steps: 100                 # Interval steps to record the training log.
91 | keep_nbest_models: 5                    # number of models to keep
92 | patience: 20                            # patience for early stopping
93 | best_model_criterion:                   # criterion to save the best models
94 |     key: sys_SRCC
95 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/tmhint-qi/conf/ssl-mos-wav2vec2.yaml:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #                DATA AND TASK SETTING                    #
 3 | ###########################################################
 4 | # task: NonIntrusiveAbsoluteRating
 5 | dataset_type: NonIntrusiveDataset
 6 | collater_type: NonIntrusiveCollater
 7 | trainer_type: NonIntrusiveEstimatorTrainer
 8 | sampling_rate: 16000                # Sampling rate.
 9 | model_input: waveform               # choices: waveform, mag_sgram
10 | 
11 | ###########################################################
12 | #              NETWORK ARCHITECTURE SETTING               #
13 | ###########################################################
14 | model_type: "SSLMOS"
15 | model_params:
16 |     ssl_module: "s3prl"
17 |     s3prl_name: "wav2vec2"
18 |     ssl_model_output_dim: 768
19 |     ssl_model_layer_idx: -1
20 | 
21 |     mean_net_dnn_dim: 64
22 |     mean_net_output_type: "scalar"
23 |     mean_net_range_clipping: True
24 | 
25 |     use_listener_modeling: False
26 |     use_mean_listener: False
27 |     
28 | ###########################################################
29 | #                      LOSS SETTING                       #
30 | ###########################################################
31 | mean_score_criterions:
32 |     - criterion_type: "ScalarLoss"
33 |       criterion_weight: 1.0
34 |       criterion_params:
35 |         order: 1
36 |         tau: 0.5
37 |         masked_loss: False
38 | listener_score_criterions: null
39 | 
40 | ###########################################################
41 | #                   INFERENCE SETTING                     #
42 | ###########################################################
43 | inference_mode: mean_net    # this is used for
44 |                             # (1) evaluation in the training loop
45 |                             # (2) default inference mode
46 | 
47 | ###########################################################
48 | #                  DATA LOADER SETTING                    #
49 | ###########################################################
50 | train_batch_size: 16
51 | test_batch_size: 1
52 | padding_mode: "repetitive"  # repetitive, zero_padding
53 | wav_only: True              # Reduce the data to averaged scores only, even for the training data.
54 |                             # Usually set to True when listener modeling is not used at all (e.g., SSL-MOS).
55 | pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
56 | num_workers: 0              # Number of workers in Pytorch DataLoader.
57 | allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.
58 | 
59 | ###########################################################
60 | #             OPTIMIZER & SCHEDULER SETTING               #
61 | ###########################################################
62 | optimizer_type: SGD
63 | optimizer_params:
64 |     lr: 1.0e-3
65 |     # the following params come from
66 |     # https://github.com/pytorch/vision/blob/c2ab0c59f42babf9ad01aa616cd8a901daac86dd/references/classification/train.py#L172-L173
67 |     momentum: 0.9
68 | grad_norm: 1.0              # Gradient norm.
69 | scheduler_type: null
70 | 
71 | ###########################################################
72 | #                    INTERVAL SETTING                     #
73 | ###########################################################
74 | train_max_steps: 100000                 # Number of training steps.
75 | eval_and_save_interval_steps: 100       # Interval steps to do evaluation and save checkpoint.
76 | log_interval_steps: 100                 # Interval steps to record the training log.
77 | keep_nbest_models: 5                    # number of models to keep
78 | patience: 20                            # patience for early stopping
79 | best_model_criterion:                   # criterion to save the best models
80 |     key: utt_SRCC
81 |     order: highest                      # choices: lowest, highest


--------------------------------------------------------------------------------
/egs/tmhint-qi/local/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | db=$1
 8 | 
 9 | # download dataset
10 | cwd=`pwd`
11 | if [ ! -e ${db}/tmhint-qi.done ]; then
12 |     mkdir -p ${db}
13 |     cd ${db}
14 |     gdown 1TMDiz6dnS76hxyeAcCQxeSqqEOH4UDN0
15 |     unzip TMHINTQI.zip
16 |     rm TMHINTQI.zip
17 |     rm -rf __MACOSX/
18 |     mv TMHINTQI/* .
19 |     rm -rf TMHINTQI
20 |     cd $cwd
21 |     echo "Successfully finished download. Please follow the instructions."
22 |     touch ${db}/main.done
23 | else
24 |     echo "Already exists. Skip download."
25 | fi
26 | 


--------------------------------------------------------------------------------
/egs/tmhint-qi/path.sh:
--------------------------------------------------------------------------------
 1 | # path related
 2 | export PRJ_ROOT="${PWD}/../.."
 3 | if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
 4 |     # shellcheck disable=SC1090
 5 |     . "${PRJ_ROOT}/tools/venv/bin/activate"
 6 | fi
 7 | 
 8 | MAIN_ROOT=$PWD/../..
 9 | export PATH=$MAIN_ROOT/sheet/bin:$PATH
10 | 
11 | # python related
12 | export OMP_NUM_THREADS=1
13 | export PYTHONIOENCODING=UTF-8
14 | export MPL_BACKEND=Agg
15 | 


--------------------------------------------------------------------------------
/egs/tmhint-qi/utils:
--------------------------------------------------------------------------------
1 | ../../utils/


--------------------------------------------------------------------------------
/egs/vmc23/local/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | db=$1
 8 | 
 9 | # download dataset
10 | cwd=`pwd`
11 | if [ ! -e ${db}/vmc23.done ]; then
12 |     mkdir -p ${db}
13 |     cd ${db}
14 |     mkdir track1
15 |     mkdir track2
16 |     mkdir track3
17 | 
18 |     # track 1
19 |     cd track1
20 |     wget https://www.dropbox.com/s/c83l67bkeh9p49k/VoiceMOS2023Track1.zip
21 |     unzip VoiceMOS2023Track1.zip
22 |     rm VoiceMOS2023Track1.zip
23 |     rm -rf __MACOSX/
24 |     mv VoiceMOS2023Track1/* .
25 |     rm -rf VoiceMOS2023Track1/
26 |     cd ..
27 | 
28 |     # track 2
29 |     cd track2
30 |     gdown 188FJiCBT0RSI6-q4ICJPfGqmR22_9Kd2
31 |     unzip VoiceMOS_2023_track2.zip
32 |     rm VoiceMOS_2023_track2.zip
33 |     mv VoiceMOS_2023_track2/* .
34 |     rm -rf VoiceMOS_2023_track2/
35 |     cd ..
36 | 
37 |     # track 3
38 |     cd track3
39 |     gdown 10_1JbEsxKPYZJLDXMeMkcjLHkbDQt84w
40 |     unzip VoiceMOS_2023_track3.zip
41 |     rm VoiceMOS_2023_track3.zip
42 |     mv VoiceMOS_2023_track3/* .
43 |     rm -rf VoiceMOS_2023_track3/
44 |     cd ..
45 | 
46 |     cd $cwd
47 |     echo "Successfully finished download."
48 |     touch ${db}/vmc23.done
49 | else
50 |     echo "Already exists. Skip download."
51 | fi
52 | 


--------------------------------------------------------------------------------
/hubconf.py:
--------------------------------------------------------------------------------
  1 | """torch.hub configuration."""
  2 | 
  3 | dependencies = ["torch", "torchaudio"]
  4 | 
  5 | import os
  6 | import torch
  7 | import torch.nn.functional as F
  8 | import torchaudio
  9 | import yaml
 10 | 
 11 | from sheet.utils.download import _urls_to_filepaths
 12 | 
# Sample rate that read_wav resamples every loaded waveform to.
FS = 16000
# Cache of torchaudio Resample transforms, keyed by "<source_rate>-<FS>".
resamplers = {}
# Waveforms with fewer samples than this are zero-padded by read_wav.
MIN_REQUIRED_WAV_LENGTH = 1040

# Download locations, per hub entry point, of the config file ("conf") and
# the model checkpoint ("model").
URLS = {
    "default": {
        "conf": "https://github.com/unilight/sheet/releases/download/v0.1.0/all7-sslmos-mdf-2337-config.yml",
        "model": "https://github.com/unilight/sheet/releases/download/v0.1.0/all7-sslmos-mdf-2337-checkpoint-86000steps.pkl",
    }
}
 23 | 
 24 | def read_wav(wav_path):
 25 |     # read waveform
 26 |     waveform, sample_rate = torchaudio.load(
 27 |         wav_path, channels_first=False
 28 |     )  # waveform: [T, 1]
 29 | 
 30 |     # resample if needed
 31 |     if sample_rate != FS:
 32 |         resampler_key = f"{sample_rate}-{FS}"
 33 |         if resampler_key not in resamplers:
 34 |             resamplers[resampler_key] = torchaudio.transforms.Resample(
 35 |                 sample_rate, FS, dtype=waveform.dtype
 36 |             )
 37 |         waveform = resamplers[resampler_key](waveform)
 38 | 
 39 |     waveform = waveform.squeeze(-1)
 40 | 
 41 |     # always pad to a minumum length
 42 |     if waveform.shape[0] < MIN_REQUIRED_WAV_LENGTH:
 43 |         to_pad = (MIN_REQUIRED_WAV_LENGTH - waveform.shape[0]) // 2
 44 |         waveform = F.pad(waveform, (to_pad, to_pad), "constant", 0)
 45 | 
 46 |     return waveform, sample_rate
 47 | 
 48 | class Predictor:
 49 |     """Wrapper class for unified waveform reading"""
 50 |     def __init__(self, model, config):
 51 |         self.model = model
 52 |         self.config = config
 53 |     
 54 |     def predict(self, wav_path=None, wav=None):
 55 |         """
 56 |         Args:
 57 |             wav: must be torch tensor
 58 |         """
 59 |         if wav is None:
 60 |             if wav_path is None:
 61 |                 raise ValueError("Either wav_path or wav must be set. Please provide one.")
 62 |             else:
 63 |                 wav, _ = read_wav(wav_path)
 64 |         else:
 65 |             if wav_path is not None:
 66 |                 raise ValueError("Either wav_path or wav can be set. Please choose one.")
 67 |         
 68 |         if type(wav) is not torch.Tensor:
 69 |             raise ValueError("wav must be torch.tensor")
 70 |         if len(wav.shape) > 1:
 71 |             raise ValueError("wav must be of an 1d tensor of shape [num_samples]")
 72 | 
 73 |         # set up model input
 74 |         model_input = wav.unsqueeze(0)
 75 |         model_lengths = model_input.new_tensor([model_input.size(1)]).long()
 76 |         inputs = {
 77 |             self.config["model_input"]: model_input,
 78 |             self.config["model_input"] + "_lengths": model_lengths,
 79 |         }
 80 | 
 81 |         with torch.no_grad():
 82 |             # model forward
 83 |             if self.config["inference_mode"] == "mean_listener":
 84 |                 outputs = self.model.mean_listener_inference(inputs)
 85 |             elif self.config["inference_mode"] == "mean_net":
 86 |                 outputs = self.model.mean_net_inference(inputs)
 87 | 
 88 |         pred_mean_scores = outputs["scores"].cpu().detach().numpy()[0]
 89 |         return pred_mean_scores
 90 | 
 91 | def default(progress: bool = True):
 92 |     """
 93 |     The default model is the SSL-MOS model with MDF trained with all seven training sets in MOS-Bench.
 94 | 
 95 |     Args:
 96 |         progress - Whether to show model checkpoint load progress
 97 |     """
 98 | 
 99 |     # get config
100 |     config_dst = os.path.join(torch.hub.get_dir(), "configs", os.path.basename(URLS["default"]["conf"]))
101 |     os.makedirs(os.path.join(torch.hub.get_dir(), "configs"), exist_ok=True)
102 |     torch.hub.download_url_to_file(URLS["default"]["conf"], dst=config_dst)
103 |     with open(config_dst) as f:
104 |         config = yaml.load(f, Loader=yaml.Loader)
105 | 
106 |     # init model
107 |     if config["model_type"] == "SSLMOS":
108 |         from sheet.models.sslmos import SSLMOS
109 |         model = SSLMOS(
110 |             config["model_input"],
111 |             **config["model_params"],
112 |         )
113 | 
114 |     # load model
115 |     state_dict = torch.hub.load_state_dict_from_url(url=URLS["default"]["model"], map_location="cpu", progress=progress)
116 |     model.load_state_dict(state_dict)
117 |     model.eval()
118 | 
119 |     # send model to a Predictor wrapper
120 |     predictor = Predictor(model, config)
121 | 
122 |     return predictor


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [options]
 2 | packages = find:
 3 | install_requires =
 4 |     librosa >= 0.8.0
 5 |     soundfile>=0.10.2
 6 |     tensorboardX
 7 |     matplotlib>=3.1.0
 8 |     pyyaml
 9 |     tqdm>=4.26.1
10 |     kaldiio>=2.14.1
11 |     h5py>=2.9.0
12 |     yq>=2.10.0
13 |     gdown
14 |     filelock
15 |     protobuf<=3.20.1
16 |     scipy
17 |     pysptk
18 |     s3prl
19 |     humanfriendly
20 |     prettytable
21 |     faiss-cpu
22 | 
23 | [options.entry_points]
24 | # console_scripts =
25 | 
[metadata]
name = sheet
version = 0.1.0
author = Wen-Chin Huang
author_email = wen.chinhuang@g.sp.m.is.nagoya-u.ac.jp
description = Speech Human Evaluation Estimation Toolkit (SHEET)
keywords = speech quality assessment
license = MIT
# "file:" makes setuptools read the README content; without it the long
# description is the literal string "README.md".
long_description = file: README.md
long_description_content_type = text/markdown
classifiers =
    License :: OSI Approved :: MIT License
    Programming Language :: Python :: 3


--------------------------------------------------------------------------------
/sheet/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | __version__ = "0.1.0"
4 | 


--------------------------------------------------------------------------------
/sheet/collaters/__init__.py:
--------------------------------------------------------------------------------
1 | from .non_intrusive import *  # NOQA
2 | 


--------------------------------------------------------------------------------
/sheet/collaters/non_intrusive.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | import numpy as np
 8 | import torch
 9 | from torch.nn.utils.rnn import pad_sequence
10 | 
FEAT_TYPES = ["waveform", "mag_sgram"]


class NonIntrusiveCollater(object):
    """Collate function object for a PyTorch DataLoader (non-intrusive setting).

    Samples are dicts. Every sample needs ``"waveform"`` (used for sorting),
    ``"score"``, ``"avg_score"``, ``"system_id"`` and ``"sample_id"``; the
    optional keys are looked up on the first sample after sorting.
    """

    def __init__(self, padding_mode):
        """Remember how features are padded.

        Args:
            padding_mode: ``"zero_padding"`` or ``"repetitive"``. Any other
                value makes ``__call__`` raise ``NotImplementedError`` as
                soon as a feature has to be padded.
        """
        self.padding_mode = padding_mode

    def __call__(self, batch):
        """Turn a list of sample dicts into a dict of batched items."""

        # longest waveform first
        ordered = sorted(batch, key=lambda sample: -sample["waveform"].shape[0])
        present_keys = list(ordered[0].keys())

        def column(key):
            # values of `key` for every sample, in sorted order
            return [sample[key] for sample in ordered]

        def length_tensor(tensors):
            # first-axis sizes as an integer tensor
            return torch.from_numpy(np.array([t.size(0) for t in tensors]))

        collated = {}

        # score & listener id
        collated["scores"] = torch.tensor(column("score"), dtype=torch.float)
        collated["avg_scores"] = torch.tensor(column("avg_score"), dtype=torch.float)
        if "listener_id" in present_keys:
            collated["listener_ids"] = column("listener_id")
        if "listener_idx" in present_keys:
            collated["listener_idxs"] = torch.tensor(
                column("listener_idx"), dtype=torch.long
            )
        if "domain_idx" in present_keys:
            collated["domain_idxs"] = torch.tensor(
                column("domain_idx"), dtype=torch.long
            )

        # phoneme and reference index sequences (zero-padded to the longest)
        for seq_key, len_key in (
            ("phoneme_idxs", "phoneme_lengths"),
            ("reference_idxs", "reference_lengths"),
        ):
            if seq_key not in present_keys:
                continue
            sequences = [torch.LongTensor(seq) for seq in column(seq_key)]
            collated[len_key] = length_tensor(sequences)
            collated[seq_key] = pad_sequence(sequences, batch_first=True)

        # ids
        collated["system_ids"] = column("system_id")
        collated["sample_ids"] = column("sample_id")

        # pad input features (only those in FEAT TYPES)
        for feat_type in FEAT_TYPES:
            if feat_type not in ordered[0]:
                continue

            feats = column(feat_type)
            feat_lengths = length_tensor(feats)

            if self.padding_mode == "zero_padding":
                feats_padded = pad_sequence(feats, batch_first=True)
            elif self.padding_mode == "repetitive":
                # tile each feature until it reaches the length of the first
                # (longest) one, cutting the last repetition short
                longest = feat_lengths[0]
                tiled = []
                for feat in feats:
                    own_len = feat.shape[0]
                    whole = longest // own_len
                    rest = longest - own_len * whole
                    pieces = [feat for _ in range(whole)]
                    pieces.append(feat[:rest])
                    tiled.append(torch.Tensor(np.concatenate(pieces, axis=0)))
                feats_padded = torch.stack(tiled, dim=0)
            else:
                raise NotImplementedError

            collated[feat_type] = feats_padded
            collated[feat_type + "_lengths"] = feat_lengths

        return collated
99 | 


--------------------------------------------------------------------------------
/sheet/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .non_intrusive import *  # NOQA
2 | 


--------------------------------------------------------------------------------
/sheet/evaluation/metrics.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2024 Wen-Chin Huang
 4 | #  MIT License (https://opensource.org/licenses/MIT)
 5 | 
 6 | """Script to calculate metrics."""
 7 | 
 8 | import numpy as np
 9 | import scipy
10 | 
11 | 
def _level_metrics(prefix, true_scores, predicted_scores):
    """Return MSE / LCC / SRCC / KTAU for one level, keyed as ``<prefix>_<NAME>``."""
    # `import scipy` alone does not guarantee that the `scipy.stats`
    # submodule is loaded, so import it explicitly.
    from scipy import stats

    # accept plain sequences as well as ndarrays
    true_scores = np.asarray(true_scores)
    predicted_scores = np.asarray(predicted_scores)

    return {
        prefix + "_MSE": np.mean((true_scores - predicted_scores) ** 2),
        prefix + "_LCC": np.corrcoef(true_scores, predicted_scores)[0][1],
        prefix + "_SRCC": stats.spearmanr(true_scores, predicted_scores)[0],
        prefix + "_KTAU": stats.kendalltau(true_scores, predicted_scores)[0],
    }


def calculate(
    true_mean_scores, predict_mean_scores, true_sys_mean_scores, predict_sys_mean_scores
):
    """Compute utterance-level and system-level evaluation metrics.

    Args:
        true_mean_scores: true utterance-level scores (array-like).
        predict_mean_scores: predicted utterance-level scores, same length.
        true_sys_mean_scores: true system-level scores (array-like).
        predict_sys_mean_scores: predicted system-level scores, same length.

    Returns:
        Dict with the keys ``utt_MSE``, ``utt_LCC``, ``utt_SRCC``,
        ``utt_KTAU``, ``sys_MSE``, ``sys_LCC``, ``sys_SRCC`` and ``sys_KTAU``:
        mean squared error, Pearson correlation (``np.corrcoef``), Spearman
        correlation and Kendall's tau of each pair of inputs.
    """
    results = _level_metrics("utt", true_mean_scores, predict_mean_scores)
    results.update(
        _level_metrics("sys", true_sys_mean_scores, predict_sys_mean_scores)
    )
    return results
35 | 


--------------------------------------------------------------------------------
/sheet/evaluation/plot.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Wen-Chin Huang
  4 | #  MIT License (https://opensource.org/licenses/MIT)
  5 | 
  6 | """Script to plot figures."""
  7 | 
  8 | import matplotlib
  9 | import numpy as np
 10 | 
 11 | # Force matplotlib to not use any Xwindows backend.
 12 | matplotlib.use("Agg")
 13 | import matplotlib.pyplot as plt
 14 | 
 15 | STYLE = "seaborn-v0_8-deep"
 16 | 
 17 | 
 18 | def plot_utt_level_hist(true_mean_scores, predict_mean_scores, filename):
 19 |     """Plot utterance-level histrogram.
 20 | 
 21 |     Args:
 22 |         true_mean_scores: ndarray of true scores
 23 |         predict_mean_scores: ndarray of predicted scores
 24 |         filename: name of the saved figure
 25 |     """
 26 |     plt.style.use(STYLE)
 27 |     bins = np.linspace(1, 5, 40)
 28 |     plt.figure(2)
 29 |     plt.hist(
 30 |         [true_mean_scores, predict_mean_scores], bins, label=["true_mos", "predict_mos"]
 31 |     )
 32 |     plt.legend(loc="upper right")
 33 |     plt.xlabel("MOS")
 34 |     plt.ylabel("number")
 35 |     plt.show()
 36 |     plt.savefig(filename, dpi=150)
 37 |     plt.close()
 38 | 
 39 | 
 40 | def plot_utt_level_scatter(
 41 |     true_mean_scores, predict_mean_scores, filename, LCC, SRCC, MSE, KTAU
 42 | ):
 43 |     """Plot utterance-level scatter plot
 44 | 
 45 |     Args:
 46 |         true_mean_scores: ndarray of true scores
 47 |         predict_mean_scores: ndarray of predicted scores
 48 |         filename: name of the saved figure
 49 |         LCC, SRCC, MSE, KTAU: metrics to be shown on the figure
 50 |     """
 51 |     M = np.max([np.max(predict_mean_scores), 5])
 52 |     plt.figure(3)
 53 |     plt.scatter(
 54 |         true_mean_scores,
 55 |         predict_mean_scores,
 56 |         s=15,
 57 |         color="b",
 58 |         marker="o",
 59 |         edgecolors="b",
 60 |         alpha=0.20,
 61 |     )
 62 |     plt.xlim([0.5, M])
 63 |     plt.ylim([0.5, M])
 64 |     plt.xlabel("True MOS")
 65 |     plt.ylabel("Predicted MOS")
 66 |     plt.title(
 67 |         "Utt level LCC= {:.4f}, SRCC= {:.4f}, MSE= {:.4f}, KTAU= {:.4f}".format(
 68 |             LCC, SRCC, MSE, KTAU
 69 |         )
 70 |     )
 71 |     plt.show()
 72 |     plt.savefig(filename, dpi=150)
 73 |     plt.close()
 74 | 
 75 | 
 76 | def plot_sys_level_scatter(
 77 |     true_sys_mean_scores, predict_sys_mean_scores, filename, LCC, SRCC, MSE, KTAU
 78 | ):
 79 |     """Plot system-level scatter plot
 80 | 
 81 |     Args:
 82 |         true_sys_mean_scores: ndarray of true system level scores
 83 |         predict_sys_mean_scores: ndarray of predicted system level scores
 84 |         filename: name of the saved figure
 85 |         LCC, SRCC, MSE, KTAU: metrics to be shown on the figure
 86 |     """
 87 |     M = np.max([np.max(predict_sys_mean_scores), 5])
 88 |     plt.figure(3)
 89 |     plt.scatter(
 90 |         true_sys_mean_scores,
 91 |         predict_sys_mean_scores,
 92 |         s=15,
 93 |         color="b",
 94 |         marker="o",
 95 |         edgecolors="b",
 96 |     )
 97 |     plt.xlim([0.5, M])
 98 |     plt.ylim([0.5, M])
 99 |     plt.xlabel("True MOS")
100 |     plt.ylabel("Predicted MOS")
101 |     plt.title(
102 |         "Sys level LCC= {:.4f}, SRCC= {:.4f}, MSE= {:.4f}, KTAU= {:.4f}".format(
103 |             LCC, SRCC, MSE, KTAU
104 |         )
105 |     )
106 |     plt.show()
107 |     plt.savefig(filename, dpi=150)
108 |     plt.close()
109 | 


--------------------------------------------------------------------------------
/sheet/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from .basic_losses import *  # NOQA
2 | from .contrastive_loss import *  # NOQA
3 | 


--------------------------------------------------------------------------------
/sheet/losses/basic_losses.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | """Basic losses."""
 8 | 
 9 | import torch
10 | import torch.nn as nn
11 | from sheet.modules.utils import make_non_pad_mask
12 | 
13 | 
class ScalarLoss(nn.Module):
    """Frame-level clipped regression loss for scalar outputs.

    The element-wise MSE (``order=2``) or L1 (``order=1``) error is kept only
    where the absolute error is larger than ``tau``; every other element
    contributes zero. The result is the mean over all (selected) elements.

    Args:
        tau: clipping threshold on the absolute error.
        order: 2 for MSE, 1 for L1; anything else raises NotImplementedError.
        masked_loss: when True, ``forward`` restricts the loss to the frames
            selected by ``make_non_pad_mask(lens)``.
    """

    def __init__(self, tau, order=2, masked_loss=False):
        super(ScalarLoss, self).__init__()
        self.tau = tau
        self.masked_loss = masked_loss
        if order == 2:
            self.criterion = torch.nn.MSELoss(reduction="none")
        elif order == 1:
            self.criterion = torch.nn.L1Loss(reduction="none")
        else:
            raise NotImplementedError

    def forward_criterion(self, y_hat, label, criterion_module, masks=None):
        """Apply ``criterion_module`` with clipping and optional masking.

        Args:
            y_hat: predictions, ``[batch, time, 1]`` or ``[batch, time]``.
            label: targets, ``[batch, time]``.
            criterion_module: element-wise loss (``reduction="none"``).
            masks: optional boolean mask with the shape of ``label``.

        Returns:
            Scalar tensor with the mean clipped loss.
        """
        # Drop the trailing singleton axis BEFORE masking so predictions and
        # mask have the same shape; only when y_hat really has one more axis
        # than the label, so a [batch, 1] prediction is not flattened.
        if y_hat.dim() == label.dim() + 1:
            y_hat = y_hat.squeeze(-1)

        # might investigate how to combine masked loss with categorical output
        if masks is not None:
            y_hat = y_hat.masked_select(masks)
            label = label.masked_select(masks)

        loss = criterion_module(y_hat, label)
        # True where the error is large enough to be penalised
        threshold = torch.abs(y_hat - label) > self.tau
        loss = torch.mean(threshold * loss)
        return loss

    def forward(self, pred_score, gt_score, device, lens=None):
        """Compute the loss of frame-level predictions against utterance scores.

        Note that ``device`` comes before ``lens`` here, unlike in
        ``CategoricalLoss.forward``.

        Args:
            pred_score: ``[batch, time, 1]`` frame-level predictions.
            gt_score: ``[batch]`` target scores, repeated along time.
            device: device the mask is moved to.
            lens: lengths passed to ``make_non_pad_mask``; required when
                ``masked_loss`` is True.

        Raises:
            ValueError: if ``masked_loss`` is True and ``lens`` is None.
        """
        # make mask
        if self.masked_loss:
            if lens is None:
                raise ValueError("lens must be given when masked_loss is True")
            masks = make_non_pad_mask(lens).to(device)
        else:
            masks = None

        # repeat for frame level loss
        time = pred_score.shape[1]
        gt_score = gt_score.unsqueeze(1).repeat(1, time)

        loss = self.forward_criterion(pred_score, gt_score, self.criterion, masks)
        return loss
60 | 
61 | 
class CategoricalLoss(nn.Module):
    """Frame-level cross-entropy loss for categorical score outputs.

    Args:
        masked_loss: when True, ``forward`` restricts the loss to the frames
            selected by ``make_non_pad_mask(lens)``.
    """

    def __init__(self, masked_loss=False):
        super(CategoricalLoss, self).__init__()
        self.masked_loss = masked_loss
        self.criterion = nn.CrossEntropyLoss(reduction="none")

    def ce(self, y_hat, label, criterion, masks=None):
        """Mean cross entropy over all frames, or over the masked-in frames.

        Args:
            y_hat: logits of shape ``[batch, time, num_classes]``.
            label: class indices of shape ``[batch, time]``.
            criterion: element-wise cross entropy (``reduction="none"``).
            masks: optional boolean mask of shape ``[batch, time]``.

        Returns:
            Scalar tensor.
        """
        if masks is not None:
            # Boolean indexing keeps the class axis: [n_valid, num_classes]
            # and [n_valid]. (masked_select would flatten the logits and make
            # them unusable for cross entropy.)
            y_hat = y_hat[masks]
            label = label[masks]
        else:
            # y_hat must have shape (batch_size, num_classes, ...)
            y_hat = y_hat.permute(0, 2, 1)

        ce = criterion(y_hat, label)
        return torch.mean(ce)

    def forward(self, pred_score, gt_score, lens, device):
        """Compute the loss of frame-level logits against utterance labels.

        Args:
            pred_score: ``[batch, time, num_classes]`` logits.
            gt_score: ``[batch]`` targets; they are repeated along time and
                cast to ``long``, so they are used directly as class indices.
            lens: lengths passed to ``make_non_pad_mask`` when ``masked_loss``
                is True (presumably yielding a ``[batch, time]`` boolean
                mask -- confirm against that helper).
            device: device the mask is moved to.
        """
        # make mask
        if self.masked_loss:
            masks = make_non_pad_mask(lens).to(device)
        else:
            masks = None

        # repeat for frame level loss
        time = pred_score.shape[1]
        gt_score = gt_score.unsqueeze(1).repeat(1, time).type(torch.long)

        score_ce = self.ce(pred_score, gt_score, self.criterion, masks)
        return score_ce
92 | 


--------------------------------------------------------------------------------
/sheet/losses/contrastive_loss.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | """Contrastive loss proposed in UTMOS."""
 8 | 
 9 | import torch
10 | import torch.nn as nn
11 | 
12 | 
class ContrastiveLoss(nn.Module):
    """Pairwise contrastive loss on score differences.

    For every pair of items in the batch, the difference between their
    predicted scores is compared with the difference between their target
    scores; deviations up to ``margin`` are not penalised.

    Args:
        margin: non-negative tolerance; smaller values make the loss
            stricter. Default: 0.2
    """

    def __init__(self, margin=0.2):
        super().__init__()
        self.margin = margin

    def forward(self, pred_score, gt_score, lens, device):
        """Return the mean hinge over all ordered pairs, divided by two.

        ``lens`` and ``device`` are accepted but not used.
        """
        # collapse frame-level predictions [batch, time, 1] to [batch]
        if pred_score.dim() > 2:
            pred_score = pred_score.mean(dim=1).squeeze(1)

        # [batch, batch] matrices of pairwise differences
        target_gap = gt_score.unsqueeze(1) - gt_score.unsqueeze(0)
        predicted_gap = pred_score.unsqueeze(1) - pred_score.unsqueeze(0)

        excess = torch.abs(predicted_gap - target_gap) - self.margin
        floor = torch.zeros(target_gap.shape, device=target_gap.device)
        hinge = torch.maximum(floor, excess)

        return hinge.mean().div(2)
40 | 


--------------------------------------------------------------------------------
/sheet/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .ldnet import *  # NOQA
2 | from .sslmos import *  # NOQA
3 | from .utmos import *  # NOQA
4 | from .alignnet import *  # NOQA
5 | # from .ramp_simple import *  # NOQA


--------------------------------------------------------------------------------
/sheet/modules/ldnet/modules.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2024 Wen-Chin Huang
  4 | #  MIT License (https://opensource.org/licenses/MIT)
  5 | 
  6 | # LDNet modules
  7 | # taken from: https://github.com/unilight/LDNet/blob/main/models/modules.py (written by myself)
  8 | 
  9 | from functools import partial
 10 | from typing import List
 11 | 
 12 | import torch
 13 | from sheet.modules.ldnet.mobilenetv2 import ConvBNActivation
 14 | from sheet.modules.ldnet.mobilenetv3 import InvertedResidual as InvertedResidualV3
 15 | from sheet.modules.ldnet.mobilenetv3 import InvertedResidualConfig
 16 | from torch import nn
 17 | 
 18 | STRIDE = 3
 19 | 
 20 | 
 21 | class Projection(nn.Module):
 22 |     def __init__(
 23 |         self,
 24 |         in_dim,
 25 |         hidden_dim,
 26 |         activation,
 27 |         output_type,
 28 |         _output_dim,
 29 |         output_step=1.0,
 30 |         range_clipping=False,
 31 |     ):
 32 |         super(Projection, self).__init__()
 33 |         self.output_type = output_type
 34 |         self.range_clipping = range_clipping
 35 |         if output_type == "scalar":
 36 |             output_dim = 1
 37 |             if range_clipping:
 38 |                 self.proj = nn.Tanh()
 39 |         elif output_type == "categorical":
 40 |             output_dim = _output_dim
 41 |             self.output_step = output_step
 42 |         else:
 43 |             raise NotImplementedError("wrong output_type: {}".format(output_type))
 44 | 
 45 |         self.net = nn.Sequential(
 46 |             nn.Linear(in_dim, hidden_dim),
 47 |             activation(),
 48 |             nn.Dropout(0.3),
 49 |             nn.Linear(hidden_dim, output_dim),
 50 |         )
 51 | 
 52 |     def forward(self, x, inference=False):
 53 |         output = self.net(x)
 54 | 
 55 |         # scalar / categorical
 56 |         if self.output_type == "scalar":
 57 |             # range clipping
 58 |             if self.range_clipping:
 59 |                 return self.proj(output) * 2.0 + 3
 60 |             else:
 61 |                 return output
 62 |         else:
 63 |             if inference:
 64 |                 return torch.argmax(output, dim=-1) * self.output_step + 1
 65 |             else:
 66 |                 return output
 67 | 
 68 | 
 69 | class MobileNetV3ConvBlocks(nn.Module):
 70 |     def __init__(self, bneck_confs, output_dim):
 71 |         super(MobileNetV3ConvBlocks, self).__init__()
 72 | 
 73 |         bneck_conf = partial(InvertedResidualConfig, width_mult=1)
 74 |         inverted_residual_setting = [bneck_conf(*b_conf) for b_conf in bneck_confs]
 75 | 
 76 |         block = InvertedResidualV3
 77 | 
 78 |         # Never tested if a different eps and momentum is needed
 79 |         # norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)
 80 |         norm_layer = nn.BatchNorm2d
 81 | 
 82 |         layers: List[nn.Module] = []
 83 | 
 84 |         # building first layer
 85 |         firstconv_output_channels = inverted_residual_setting[0].input_channels
 86 |         layers.append(
 87 |             ConvBNActivation(
 88 |                 1,
 89 |                 firstconv_output_channels,
 90 |                 kernel_size=3,
 91 |                 stride=STRIDE,
 92 |                 norm_layer=norm_layer,
 93 |                 activation_layer=nn.Hardswish,
 94 |             )
 95 |         )
 96 | 
 97 |         # building inverted residual blocks
 98 |         for cnf in inverted_residual_setting:
 99 |             layers.append(block(cnf, norm_layer))
100 | 
101 |         # building last several layers
102 |         lastconv_input_channels = inverted_residual_setting[-1].out_channels
103 |         lastconv_output_channels = output_dim
104 |         layers.append(
105 |             ConvBNActivation(
106 |                 lastconv_input_channels,
107 |                 lastconv_output_channels,
108 |                 kernel_size=1,
109 |                 norm_layer=norm_layer,
110 |                 activation_layer=nn.Hardswish,
111 |             )
112 |         )
113 |         self.features = nn.Sequential(*layers)
114 | 
115 |         for m in self.modules():
116 |             if isinstance(m, nn.Conv2d):
117 |                 nn.init.kaiming_normal_(m.weight, mode="fan_out")
118 |                 if m.bias is not None:
119 |                     nn.init.zeros_(m.bias)
120 |             elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
121 |                 nn.init.ones_(m.weight)
122 |                 nn.init.zeros_(m.bias)
123 |             elif isinstance(m, nn.Linear):
124 |                 nn.init.normal_(m.weight, 0, 0.01)
125 |                 nn.init.zeros_(m.bias)
126 | 
127 |     def forward(self, x):
128 |         time = x.shape[2]
129 |         x = self.features(x)
130 |         x = nn.functional.adaptive_avg_pool2d(x, (time, 1))
131 |         x = x.squeeze(-1).transpose(1, 2)
132 |         return x
133 | 


--------------------------------------------------------------------------------
/sheet/nonparametric/datastore.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | """datastore related"""
 8 | 
 9 | import faiss
10 | import h5py
11 | import numpy as np
12 | from scipy.special import softmax
13 | 
14 | 
class Datastore():
    """Nearest-neighbour store of (embedding, score) pairs backed by a faiss L2 index."""

    def __init__(
        self,
        datastore_path,
        embed_dim,
        device,
    ):
        """Load the HDF5 datastore and index its embeddings.

        Entries are enumerated from the keys of the ``"scores"`` group; the
        embedding of each entry is read from the ``"embeds"`` group under the
        same key.

        Args:
            datastore_path (str): path to the datastore.
            embed_dim (int): dimension of the embed in the datastore
            device: object with a ``type`` attribute; when it equals
                ``"cuda"`` the index is moved to GPU with
                ``faiss.index_cpu_to_all_gpus(index, ngpu=1)``.
        """
        with h5py.File(datastore_path, "r") as store:
            keys = list(store["scores"].keys())
            embed_list = [store["embeds"][key][()] for key in keys]
            score_list = [store["scores"][key][()] for key in keys]

        self.embeds = np.stack(embed_list, axis=0)
        self.scores = np.array(score_list)
        self.paths = keys

        # build index
        flat_index = faiss.IndexFlatL2(embed_dim)
        if device.type == 'cuda':
            flat_index = faiss.index_cpu_to_all_gpus(flat_index, ngpu=1)
        flat_index.add(self.embeds)
        self.index = flat_index

    def knn(self, query, k, search_only=False):
        """Search the ``k`` nearest entries of ``query``.

        Args:
            query: array passed to the index's ``search``.
            k: number of neighbours.
            search_only: when True, only distances and scores are returned.

        Returns:
            Dict with ``"distances"`` and ``"scores"`` (one row per query).
            Unless ``search_only`` is set it also holds ``"final_score"``,
            the neighbour scores of the FIRST query averaged with softmax
            weights over the inverse distances, and ``"ids"``, the datastore
            keys of the neighbours for every query.
        """
        distances, neighbours = self.index.search(query, k)
        neighbour_scores = np.stack([self.scores[row] for row in neighbours])
        result = {
            "distances": distances,
            "scores": neighbour_scores,
        }

        if not search_only:
            # closer neighbours get larger weights; 1e-8 guards against
            # division by zero on exact matches
            weights = softmax(1 / (distances + 1e-8), axis=1)
            weighted = np.multiply(weights, neighbour_scores)
            result["final_score"] = np.sum(weighted, axis=1)[0]
            result["ids"] = [[self.paths[e] for e in row] for row in neighbours]

        return result


--------------------------------------------------------------------------------
/sheet/schedulers/__init__.py:
--------------------------------------------------------------------------------
1 | from .schedulers import get_scheduler  # NOQA
2 | 


--------------------------------------------------------------------------------
/sheet/schedulers/schedulers.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | 
 3 | from torch.optim.lr_scheduler import MultiStepLR, StepLR
 4 | 
 5 | # Reference: https://github.com/s3prl/s3prl/blob/master/s3prl/schedulers.py
 6 | 
 7 | 
 8 | def get_scheduler(optimizer, scheduler_name, total_steps, scheduler_config):
 9 |     scheduler_config = copy.deepcopy(scheduler_config)
10 |     scheduler = eval(f"get_{scheduler_name}")(
11 |         optimizer, num_training_steps=total_steps, **scheduler_config
12 |     )
13 |     return scheduler
14 | 
15 | 
16 | def get_multistep(optimizer, num_training_steps, milestones, gamma):
17 |     return MultiStepLR(optimizer, milestones, gamma)
18 | 
19 | 
20 | def get_stepLR(optimizer, num_training_steps, step_size, gamma):
21 |     return StepLR(optimizer, step_size, gamma)
22 | 


--------------------------------------------------------------------------------
/sheet/trainers/__init__.py:
--------------------------------------------------------------------------------
1 | from .non_intrusive import *  # NOQA
2 | 


--------------------------------------------------------------------------------
/sheet/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *  # NOQA
2 | 


--------------------------------------------------------------------------------
/sheet/utils/types.py:
--------------------------------------------------------------------------------
  1 | from distutils.util import strtobool
  2 | from typing import Optional, Tuple, Union
  3 | 
  4 | 
  5 | def str2bool(value: str) -> bool:
  6 |     return bool(strtobool(value))
  7 | 
  8 | 
  9 | def remove_parenthesis(value: str):
 10 |     value = value.strip()
 11 |     if value.startswith("(") and value.endswith(")"):
 12 |         value = value[1:-1]
 13 |     elif value.startswith("[") and value.endswith("]"):
 14 |         value = value[1:-1]
 15 |     return value
 16 | 
 17 | 
 18 | def remove_quotes(value: str):
 19 |     value = value.strip()
 20 |     if value.startswith('"') and value.endswith('"'):
 21 |         value = value[1:-1]
 22 |     elif value.startswith("'") and value.endswith("'"):
 23 |         value = value[1:-1]
 24 |     return value
 25 | 
 26 | 
 27 | def int_or_none(value: str) -> Optional[int]:
 28 |     """int_or_none.
 29 | 
 30 |     Examples:
 31 |         >>> import argparse
 32 |         >>> parser = argparse.ArgumentParser()
 33 |         >>> _ = parser.add_argument('--foo', type=int_or_none)
 34 |         >>> parser.parse_args(['--foo', '456'])
 35 |         Namespace(foo=456)
 36 |         >>> parser.parse_args(['--foo', 'none'])
 37 |         Namespace(foo=None)
 38 |         >>> parser.parse_args(['--foo', 'null'])
 39 |         Namespace(foo=None)
 40 |         >>> parser.parse_args(['--foo', 'nil'])
 41 |         Namespace(foo=None)
 42 | 
 43 |     """
 44 |     if value.strip().lower() in ("none", "null", "nil"):
 45 |         return None
 46 |     return int(value)
 47 | 
 48 | 
 49 | def float_or_none(value: str) -> Optional[float]:
 50 |     """float_or_none.
 51 | 
 52 |     Examples:
 53 |         >>> import argparse
 54 |         >>> parser = argparse.ArgumentParser()
 55 |         >>> _ = parser.add_argument('--foo', type=float_or_none)
 56 |         >>> parser.parse_args(['--foo', '4.5'])
 57 |         Namespace(foo=4.5)
 58 |         >>> parser.parse_args(['--foo', 'none'])
 59 |         Namespace(foo=None)
 60 |         >>> parser.parse_args(['--foo', 'null'])
 61 |         Namespace(foo=None)
 62 |         >>> parser.parse_args(['--foo', 'nil'])
 63 |         Namespace(foo=None)
 64 | 
 65 |     """
 66 |     if value.strip().lower() in ("none", "null", "nil"):
 67 |         return None
 68 |     return float(value)
 69 | 
 70 | 
 71 | def str_or_int(value: str) -> Union[str, int]:
 72 |     try:
 73 |         return int(value)
 74 |     except ValueError:
 75 |         return value
 76 | 
 77 | 
 78 | def str_or_none(value: str) -> Optional[str]:
 79 |     """str_or_none.
 80 | 
 81 |     Examples:
 82 |         >>> import argparse
 83 |         >>> parser = argparse.ArgumentParser()
 84 |         >>> _ = parser.add_argument('--foo', type=str_or_none)
 85 |         >>> parser.parse_args(['--foo', 'aaa'])
 86 |         Namespace(foo='aaa')
 87 |         >>> parser.parse_args(['--foo', 'none'])
 88 |         Namespace(foo=None)
 89 |         >>> parser.parse_args(['--foo', 'null'])
 90 |         Namespace(foo=None)
 91 |         >>> parser.parse_args(['--foo', 'nil'])
 92 |         Namespace(foo=None)
 93 | 
 94 |     """
 95 |     if value.strip().lower() in ("none", "null", "nil"):
 96 |         return None
 97 |     return value
 98 | 
 99 | 
def str2pair_str(value: str) -> Tuple[str, str]:
    """Split a comma-separated pair into a 2-tuple of strings.

    An enclosing ``()`` or ``[]`` pair is removed first; each item is then
    trimmed and unquoted. A ``ValueError`` is raised by the unpacking when
    the text does not contain exactly two comma-separated items.

    Examples:
        >>> import argparse
        >>> str2pair_str('abc,def ')
        ('abc', 'def')
        >>> parser = argparse.ArgumentParser()
        >>> _ = parser.add_argument('--foo', type=str2pair_str)
        >>> parser.parse_args(['--foo', 'abc,def'])
        Namespace(foo=('abc', 'def'))

    """
    first, second = remove_parenthesis(value).split(",")

    # Workaround for configargparse: list values coming from a yaml file
    # reach type() formatted like a Python list, e.g. ['a', 'b', 'c'],
    # so the quotes around each item have to be removed.
    return remove_quotes(first), remove_quotes(second)
122 | 
123 | 
def str2triple_str(value: str) -> Tuple[str, str, str]:
    """Split a comma-separated triple into a 3-tuple of strings.

    An enclosing ``()`` or ``[]`` pair is removed first; each item is then
    trimmed and unquoted. A ``ValueError`` is raised by the unpacking when
    the text does not contain exactly three comma-separated items.

    Examples:
        >>> str2triple_str('abc,def ,ghi')
        ('abc', 'def', 'ghi')
    """
    first, second, third = remove_parenthesis(value).split(",")

    # Workaround for configargparse: list values coming from a yaml file
    # reach type() formatted like a Python list, e.g. ['a', 'b', 'c'],
    # so the quotes around each item have to be removed.
    return remove_quotes(first), remove_quotes(second), remove_quotes(third)
140 | 


--------------------------------------------------------------------------------
/sheet/utils/utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2019 Tomoki Hayashi
  4 | #  MIT License (https://opensource.org/licenses/MIT)
  5 | 
  6 | """Utility functions."""
  7 | 
  8 | import csv
  9 | import fnmatch
 10 | import logging
 11 | import os
 12 | import sys
 13 | 
 14 | import h5py
 15 | import numpy as np
 16 | 
 17 | 
 18 | def get_basename(path):
 19 |     return os.path.splitext(os.path.split(path)[-1])[0]
 20 | 
 21 | 
 22 | def read_csv(path, dict_reader=False, lazy=False):
 23 |     """
 24 | 
 25 |     If `dict_reader` is set to True, then return <list, fieldnames>.
 26 |     If `dict_reader` is set to False, then return <list>.
 27 |     """
 28 | 
 29 |     """Read the csv file.
 30 | 
 31 |         Args:
 32 |             path (str): path to the csv file
 33 |             dict_reader (bool): whether to use dict reader. This should be set to true when the csv file has header.
 34 |             lazy (bool): whether to read the file in this funcion.
 35 |         
 36 |         Return:
 37 |             contents: reader or line of contents
 38 |             fieldnames (list): header. If dict_reader is False, then return None.
 39 | 
 40 |     """
 41 | 
 42 |     with open(path, newline="") as csvfile:
 43 |         if dict_reader:
 44 |             reader = csv.DictReader(csvfile)
 45 |             fieldnames = reader.fieldnames
 46 |         else:
 47 |             reader = csv.reader(csvfile)
 48 |             fieldnames = None
 49 | 
 50 |         if lazy:
 51 |             contents = reader
 52 |         else:
 53 |             contents = [line for line in reader]
 54 | 
 55 |     return contents, fieldnames
 56 | 
 57 | def write_csv(data, path):
 58 |     """Write data to the output path.
 59 | 
 60 |     Args:
 61 |         path (str): path to the output csv file
 62 |         data (list): a list of dicts
 63 | 
 64 |     """
 65 |     fieldnames = list(data[0].keys())
 66 |     with open(path, "w", newline="") as csvfile:
 67 |         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
 68 |         writer.writeheader()
 69 |         for line in data:
 70 |             writer.writerow(line)
 71 | 
 72 | def find_files(root_dir, query="*.wav", include_root_dir=True):
 73 |     """Find files recursively.
 74 | 
 75 |     Args:
 76 |         root_dir (str): Root root_dir to find.
 77 |         query (str): Query to find.
 78 |         include_root_dir (bool): If False, root_dir name is not included.
 79 | 
 80 |     Returns:
 81 |         list: List of found filenames.
 82 | 
 83 |     """
 84 |     files = []
 85 |     for root, dirnames, filenames in os.walk(root_dir, followlinks=True):
 86 |         for filename in fnmatch.filter(filenames, query):
 87 |             files.append(os.path.join(root, filename))
 88 |     if not include_root_dir:
 89 |         files = [file_.replace(root_dir + "/", "") for file_ in files]
 90 | 
 91 |     return files
 92 | 
 93 | 
 94 | def read_hdf5(hdf5_name, hdf5_path):
 95 |     """Read hdf5 dataset.
 96 | 
 97 |     Args:
 98 |         hdf5_name (str): Filename of hdf5 file.
 99 |         hdf5_path (str): Dataset name in hdf5 file.
100 | 
101 |     Return:
102 |         any: Dataset values.
103 | 
104 |     """
105 |     if not os.path.exists(hdf5_name):
106 |         logging.error(f"There is no such a hdf5 file ({hdf5_name}).")
107 |         sys.exit(1)
108 | 
109 |     hdf5_file = h5py.File(hdf5_name, "r")
110 | 
111 |     if hdf5_path not in hdf5_file:
112 |         logging.error(f"There is no such a data in hdf5 file. ({hdf5_path})")
113 |         sys.exit(1)
114 | 
115 |     hdf5_data = hdf5_file[hdf5_path][()]
116 |     hdf5_file.close()
117 | 
118 |     return hdf5_data
119 | 
120 | 
def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True):
    """Write dataset to hdf5.

    The parent directory of ``hdf5_name`` is created when needed. If the
    dataset already exists it is replaced when ``is_overwrite`` is True;
    otherwise an error is logged and the process exits with status 1.

    Args:
        hdf5_name (str): Hdf5 dataset filename.
        hdf5_path (str): Dataset path in hdf5.
        write_data (ndarray): Data to write.
        is_overwrite (bool): Whether to overwrite dataset.

    """
    # convert to numpy array
    write_data = np.array(write_data)

    # make sure the destination folder exists
    folder_name = os.path.dirname(hdf5_name)
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    # open an existing file for update, otherwise create a new one
    mode = "r+" if os.path.exists(hdf5_name) else "w"

    # The context manager closes the file even if writing fails or the
    # process exits below.
    with h5py.File(hdf5_name, mode) as hdf5_file:
        if hdf5_path in hdf5_file:
            if not is_overwrite:
                logging.error(
                    "Dataset in hdf5 file already exists. "
                    "if you want to overwrite, please set is_overwrite = True."
                )
                sys.exit(1)
            logging.warning(
                "Dataset in hdf5 file already exists. recreate dataset in hdf5."
            )
            del hdf5_file[hdf5_path]

        # write data to hdf5
        hdf5_file.create_dataset(hdf5_path, data=write_data)
165 | 


--------------------------------------------------------------------------------
/sheet/warmup_lr.py:
--------------------------------------------------------------------------------
 1 | """Warm up learning rate scheduler module."""
 2 | 
 3 | from abc import ABC, abstractmethod
 4 | from typing import Union
 5 | 
 6 | import torch
 7 | from torch.optim.lr_scheduler import _LRScheduler
 8 | 
 9 | 
class AbsBatchStepScheduler(ABC):
    """Abstract scheduler interface.

    Concrete subclasses must provide ``step``, ``state_dict`` and
    ``load_state_dict``; the base implementations do nothing.
    """

    @abstractmethod
    def step(self, epoch: int = None):
        """Advance the scheduler (no-op in the base class)."""

    @abstractmethod
    def state_dict(self):
        """Return the scheduler state (the base class returns None)."""

    @abstractmethod
    def load_state_dict(self, state):
        """Restore the scheduler from ``state`` (no-op in the base class)."""
22 | 
23 | 
class WarmupLR(_LRScheduler, AbsBatchStepScheduler):
    """Warm-up learning rate scheduler.

    Nearly identical to the NoamLR scheduler; the only difference is the
    constant factor applied to the base learning rate:

    NoamLR:
        lr = optimizer.lr * model_size ** -0.5
             * min(step ** -0.5, step * warmup_step ** -1.5)
    WarmupLR:
        lr = optimizer.lr * warmup_step ** 0.5
             * min(step ** -0.5, step * warmup_step ** -1.5)

    With this scaling the peak learning rate equals optimizer.lr.

    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        warmup_steps: Union[int, float] = 4000,
        last_epoch: int = -1,
    ):
        # warmup_steps has to be stored BEFORE the base constructor runs:
        # as the original author notes, step() is already invoked inside
        # __init__(), and get_lr() below reads this attribute.
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch)

    def __repr__(self):
        return f"{type(self).__name__}(warmup_steps={self.warmup_steps})"

    def get_lr(self):
        """Return the scheduled learning rate for every base learning rate."""
        current_step = self.last_epoch + 1
        warmup_root = self.warmup_steps**0.5
        # Linear ramp (second term) until warmup_steps, then step ** -0.5.
        shape = min(current_step**-0.5, current_step * self.warmup_steps**-1.5)
        return [base_lr * warmup_root * shape for base_lr in self.base_lrs]
63 | 


--------------------------------------------------------------------------------
/tools/Makefile:
--------------------------------------------------------------------------------
# Build a local virtualenv under ./venv and install PyTorch and this package
# into it. Every step leaves a "*.done" stamp file so it is not repeated.

# Interpreter used to create the virtualenv.
PYTHON:= python
# CUDA version used to pick the PyTorch wheel page; when set to an empty
# value the CPU wheel page is used instead (see pytorch.done).
CUDA_VERSION:= 12.1
# PyTorch version installed by pytorch.done.
PYTORCH_VERSION:= 2.3.0
# A literal dot, used with $(subst ...) below to turn e.g. 12.1 into 121.
DOT:= .
.PHONY: all clean show_variables

# Default target: print the settings, then run every installation step.
all: show_variables virtualenv.done pytorch.done sheet.done

# Convenience alias for sheet.done.
sheet: sheet.done

# Print the variables that control the installation.
show_variables:
	@echo PYTHON=$(PYTHON)
	@echo CUDA_VERSION=$(CUDA_VERSION)
	@echo PYTORCH_VERSION=$(PYTORCH_VERSION)

# Create ./venv if it does not exist yet, upgrade pip and install numpy.
virtualenv.done: show_variables
	test -d venv || $(PYTHON) -m venv venv
	. venv/bin/activate; cd ../; pip install -U pip
	# install numpy here since python3.6 is not supported in > 1.20
	. venv/bin/activate; cd ../; pip install numpy
	touch virtualenv.done

# Install the pinned PyTorch version. The -f option points pip at the CPU
# wheel page when CUDA_VERSION is empty, otherwise at the cu<version> page
# (dots removed from CUDA_VERSION).
pytorch.done: virtualenv.done
ifeq ($(CUDA_VERSION),)
	. venv/bin/activate; pip install torch==$(PYTORCH_VERSION) \
		-f https://download.pytorch.org/whl/cpu/stable.html
else
	. venv/bin/activate; pip install torch==$(PYTORCH_VERSION) \
		-f https://download.pytorch.org/whl/cu$(subst $(DOT),,$(CUDA_VERSION))/torch_stable.html
endif
	touch pytorch.done

# Install the package found in the parent directory in editable mode.
sheet.done: virtualenv.done pytorch.done
	. venv/bin/activate; cd ../; pip install -e .
	touch sheet.done

# Optional target (not part of "all"): clone NVIDIA apex and build it with
# its C++ and CUDA extensions.
# NOTE(review): the clone presumably fails if ./apex already exists - confirm.
apex.done: virtualenv.done pytorch.done
	git clone https://github.com/NVIDIA/apex.git
	. venv/bin/activate; cd apex; \
		pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
	touch apex.done

# Remove the virtualenv, the apex checkout, all stamp files and *.pyc files.
# NOTE(review): find is called without a start path; presumably this relies
# on find defaulting to the current directory - confirm on non-GNU systems.
clean:
	rm -fr venv apex *.done
	find -iname "*.pyc" -delete
46 | 


--------------------------------------------------------------------------------
/utils/BENCHMARKS:
--------------------------------------------------------------------------------
1 | ../egs/BENCHMARKS/


--------------------------------------------------------------------------------
/utils/calculate_metrics.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | import argparse
 8 | from collections import defaultdict
 9 | import csv
10 | import numpy as np
11 | import scipy
12 | 
def calculate(
    true_mean_scores, predict_mean_scores, true_sys_mean_scores, predict_sys_mean_scores
):
    """Compute utterance-level and system-level evaluation metrics.

    For each level the mean squared error (MSE), the linear correlation
    coefficient (LCC), Spearman's rank correlation (SRCC) and Kendall's tau
    (KTAU) between the true and predicted scores are computed.

    Args:
        true_mean_scores (array-like): True scores, one per utterance.
        predict_mean_scores (array-like): Predicted scores, one per utterance.
        true_sys_mean_scores (array-like): True scores, one per system.
        predict_sys_mean_scores (array-like): Predicted scores, one per system.

    Returns:
        dict: Metrics keyed by ``utt_MSE``, ``utt_LCC``, ``utt_SRCC``,
            ``utt_KTAU``, ``sys_MSE``, ``sys_LCC``, ``sys_SRCC`` and
            ``sys_KTAU``.

    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded on every SciPy release.
    from scipy import stats

    def _metrics(prefix, true_scores, predicted_scores):
        # Accept plain sequences as well as arrays.
        true_scores = np.asarray(true_scores)
        predicted_scores = np.asarray(predicted_scores)
        return {
            f"{prefix}_MSE": np.mean((true_scores - predicted_scores) ** 2),
            f"{prefix}_LCC": np.corrcoef(true_scores, predicted_scores)[0][1],
            f"{prefix}_SRCC": stats.spearmanr(true_scores, predicted_scores)[0],
            f"{prefix}_KTAU": stats.kendalltau(true_scores, predicted_scores)[0],
        }

    results = _metrics("utt", true_mean_scores, predict_mean_scores)
    results.update(_metrics("sys", true_sys_mean_scores, predict_sys_mean_scores))
    return results
36 | 
def get_parser():
    """Build the command-line parser of the metric calculation script."""
    option_table = (
        ("--csv", {"required": True, "type": str, "help": "input csv file"}),
        (
            "--answer_column",
            {
                "default": "answer",
                "type": str,
                "help": "the column that stores predicted scores. default to be `answer`",
            },
        ),
        (
            "--gt_column",
            {
                "default": "avg_score",
                "type": str,
                "help": "the column that stores GT scores. default to be `avg_score`",
            },
        ),
    )

    parser = argparse.ArgumentParser(
        description="calculate metrics given an input csv file."
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
43 | 
def main():
    """Read a result csv file and print utterance/system-level metrics.

    Every row must provide ``system_id`` plus the prediction and ground-truth
    columns selected on the command line. System-level scores are the mean
    of the utterance scores that share a ``system_id``.
    """
    args = get_parser().parse_args()

    with open(args.csv, newline="") as csvfile:
        rows = list(csv.DictReader(csvfile))

    utt_true, utt_pred = [], []
    sys_true, sys_pred = defaultdict(list), defaultdict(list)

    for row in rows:
        system = row["system_id"]
        predicted = float(row[args.answer_column])
        target = float(row[args.gt_column])

        utt_pred.append(predicted)
        utt_true.append(target)
        sys_pred[system].append(predicted)
        sys_true[system].append(target)

    # Both per-system dicts are filled in the same loop, so their values are
    # iterated in the same system order.
    sys_true_means = np.array([np.mean(scores) for scores in sys_true.values()])
    sys_pred_means = np.array([np.mean(scores) for scores in sys_pred.values()])

    # calculate metrics
    results = calculate(
        np.array(utt_true),
        np.array(utt_pred),
        sys_true_means,
        sys_pred_means,
    )
    print(
        f'[UTT][ MSE = {results["utt_MSE"]:.3f} | LCC = {results["utt_LCC"]:.3f}'
        f' | SRCC = {results["utt_SRCC"]:.3f} | KTAU = {results["utt_KTAU"]:.3f} ]'
        f' [SYS][ MSE = {results["sys_MSE"]:.3f} | LCC = {results["sys_LCC"]:.4f}'
        f' | SRCC = {results["sys_SRCC"]:.4f}  | KTAU = {results["sys_KTAU"]:.3f} ]'
    )
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     main()
87 | 


--------------------------------------------------------------------------------
/utils/combine_datasets.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | """Combine multiple datasets (represented with csv files) into one."""
 8 | 
 9 | import argparse
10 | import csv
11 | import logging
12 | import sys
13 | 
14 | from sheet.utils import read_csv
15 | 
16 | 
def main():
    """Combine several csv datasets into a single csv file.

    The output header is the union of the input headers, kept in the order
    in which the columns are first seen, so repeated runs produce the same
    column order. Cells for columns a dataset lacks are left empty.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--original-paths",
        required=True,
        type=str,
        nargs="+",
        help=("original csv file paths."),
    )
    parser.add_argument(
        "--out",
        required=True,
        type=str,
        help=("output csv file path."),
    )
    args = parser.parse_args()

    # set logger
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stdout,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    # read csv
    logging.info("Reading original csv files.")
    loaded = [
        read_csv(original_path, dict_reader=True)
        for original_path in args.original_paths
    ]

    # Take the union of all headers. A list plus a "seen" set is used rather
    # than a bare set so that the column order is deterministic.
    fieldnames = []
    seen = set()
    for rows, header in loaded:
        # Fall back to the parsed header when a file has no data rows
        # (rows[0] used to raise IndexError in that case).
        keys = rows[0].keys() if rows else (header or [])
        for key in keys:
            if key not in seen:
                seen.add(key)
                fieldnames.append(key)

    # write csv
    logging.info("Writing output csv file.")
    with open(args.out, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for rows, _header in loaded:
            writer.writerows(rows)
65 | 
66 | if __name__ == "__main__":
67 |     main()
68 | 


--------------------------------------------------------------------------------
/utils/hf_download.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2023 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | import argparse
 8 | from huggingface_hub import hf_hub_download
 9 | 
def get_parser():
    """Build the command-line parser of the download script."""
    parser = argparse.ArgumentParser(description="download files from huggingface hub.")
    required_options = (
        ("--repo_id", "id of the huggingface repo"),
        ("--filename", "file name to download"),
        ("--outdir", "directory to save the downloaded file"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, type=str, help=help_text)
    return parser
16 | 
def main():
    """Download one file from a huggingface repo into the given directory."""
    options = get_parser().parse_args()
    download_kwargs = {
        "repo_id": options.repo_id,
        "filename": options.filename,
        "local_dir": options.outdir,
    }
    hf_hub_download(**download_kwargs)
23 | 
24 | if __name__ == "__main__":
25 |     main()


--------------------------------------------------------------------------------
/utils/parse_options.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
 4 | #                 Arnab Ghoshal, Karel Vesely
 5 | 
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #  http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
13 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
14 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
15 | # MERCHANTABLITY OR NON-INFRINGEMENT.
16 | # See the Apache 2 License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | 
20 | # Parse command-line options.
21 | # To be sourced by another script (as in ". parse_options.sh").
22 | # Option format is: --option-name arg
23 | # and shell variable "option_name" gets set to value "arg."
24 | # The exception is --help, which takes no arguments, but prints the 
25 | # $help_message variable (if defined).
26 | 
27 | 
###
### The --config file options have lower priority than command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order
# The loop stops one short of $# (argpos<$#), so the last argument is never
# tested for being "--config": it would have no value following it.
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # Abort if the config file is not readable.
    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
    . $config  # source the config file.
  fi
done


###
### Now we process the command line options
###
# Each pass handles one "--name value" pair and removes it with "shift 2".
# The loop ends at the first empty argument or the first argument that does
# not start with "--"; those remaining arguments are left in "$@".
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help 
    # message and exit.  Scripts should put help messages in $help_message
  --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
	  else printf "$help_message\n" 1>&2 ; fi; 
	  exit 0 ;; 
    # The "--name=value" form is rejected; only "--name value" is accepted.
  --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
       exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar), 
    # then work out the variable name as $name, which will equal "foo_bar".
  --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 
    # Next we test whether the variable in question is undefined-- if so it's 
    # an invalid option and we die.  Note: $0 evaluates to the name of the 
    # enclosing script.
    # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
    # is undefined.  We then have to wrap this test inside "eval" because 
    # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
      
    # Remember the current (default) value before it is overwritten below.
      oldval="`eval echo \\$$name`";
    # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 
	was_bool=true;
      else 
	was_bool=false;
      fi

    # Set the variable to the right value-- the escaped quotes make it work if
    # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\"; 
        
    # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    # Anything else (a positional argument) ends option processing.
  *) break;
  esac
done


# Check for an empty argument to the --cmd option, which can easily occur as a 
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;


true; # so this script returns exit code 0.
98 | 


--------------------------------------------------------------------------------
/utils/subsample.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2024 Wen-Chin Huang
 5 | #  MIT License (https://opensource.org/licenses/MIT)
 6 | 
 7 | """Subsampling a csv file"""
 8 | 
 9 | import argparse
10 | import csv
11 | import logging
12 | import sys
13 | import random
14 | 
15 | from sheet.utils import read_csv
16 | 
17 | 
def main():
    """Randomly sub-sample the rows of a csv file.

    When ``--num-samples`` is positive, that many rows are drawn without
    replacement using the given seed; otherwise (``<= 0``, as stated in the
    option's help text) every row is written unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--original-path",
        required=True,
        type=str,
        help=("original csv file path."),
    )
    parser.add_argument(
        "--out",
        required=True,
        type=str,
        help=("output csv file path."),
    )
    parser.add_argument(
        "--num-samples",
        type=int,
        default=-1,
        help=("num of total samples to sub-sample. if <=0, then use whole dataset."),
    )
    parser.add_argument(
        "--seed",
        default=1337,
        type=int,
        help=("Random seed. This is used to get consistent random sampling results."),
    )
    args = parser.parse_args()

    # set seed
    random.seed(args.seed)

    # set logger
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stdout,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    # read csv
    logging.info("Reading original csv file.")
    filelist, header = read_csv(args.original_path, dict_reader=True)
    # Fall back to the parsed header when the file has no data rows
    # (filelist[0] used to raise IndexError in that case).
    fieldnames = list(filelist[0].keys()) if filelist else list(header or [])

    # Randomly subsample. The comparison is strict so that 0 keeps the whole
    # dataset, matching the help text (">= 0" used to write an empty file).
    if args.num_samples > 0:
        filelist = random.sample(filelist, args.num_samples)

    # write csv
    logging.info("Writing output csv file.")
    with open(args.out, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(filelist)
73 | 
74 | if __name__ == "__main__":
75 |     main()
76 | 


--------------------------------------------------------------------------------