├── .DS_Store
├── README.md
├── asr1
│   ├── espnet_cmn
│   │   ├── asr.sh
│   │   ├── cmd.sh
│   │   ├── conf
│   │   │   ├── .DS_Store
│   │   │   ├── decode_asr.yaml
│   │   │   ├── decode_asr_lm.yaml
│   │   │   ├── fbank.conf
│   │   │   ├── pbs.conf
│   │   │   ├── pitch.conf
│   │   │   ├── queue.conf
│   │   │   ├── slurm.conf
│   │   │   ├── train.yaml
│   │   │   └── tuning
│   │   │       ├── .DS_Store
│   │   │       ├── train_asr_conformer.yaml
│   │   │       ├── train_asr_conformer_seame.yaml
│   │   │       ├── train_lm_conf.yaml
│   │   │       ├── train_lm_lstm.yaml
│   │   │       ├── train_lm_lstm2.yaml
│   │   │       └── train_lm_transformer.yaml
│   │   ├── db.sh
│   │   ├── filter.sh
│   │   ├── local
│   │   │   ├── __pycache__
│   │   │   │   └── preprocess.cpython-39.pyc
│   │   │   ├── add_lid.py
│   │   │   ├── add_lid_seame.py
│   │   │   ├── add_lid_seame_v2.py
│   │   │   ├── cmi.py
│   │   │   ├── cmi2.py
│   │   │   ├── data.sh
│   │   │   ├── path.sh
│   │   │   ├── preprocess.py
│   │   │   ├── score.sh
│   │   │   ├── split_lang_trn.py
│   │   │   ├── subset_seame_cs.py
│   │   │   └── subset_seame_mono.py
│   │   ├── path.sh
│   │   ├── pyscripts
│   │   ├── run.sh
│   │   ├── run_bigram.sh
│   │   ├── run_bigram_subset.sh
│   │   ├── run_bigram_subset2.sh
│   │   ├── run_mono.sh
│   │   ├── run_uni.sh
│   │   ├── run_uni_imp.sh
│   │   ├── sample.sh
│   │   ├── scripts
│   │   ├── seperate_mono.sh
│   │   ├── steps
│   │   └── utils
│   └── kaldi_cmn
│       ├── align.sh
│       ├── cmd.sh
│       ├── conf
│       │   ├── cmu2pinyin
│       │   ├── decode.config
│       │   ├── decode_dnn.config
│       │   ├── g2p_model
│       │   ├── mfcc.conf
│       │   ├── mfcc_hires.conf
│       │   ├── online_cmvn.conf
│       │   ├── pinyin2cmu
│       │   ├── slurm.conf
│       │   └── vad.conf
│       ├── decode.sh
│       ├── decode_test.sh
│       ├── fix_spk_pref.sh
│       ├── local
│       │   ├── format_data.sh
│       │   ├── format_data2.sh
│       │   ├── prep_dict_en_zh.sh
│       │   ├── prepare_dict.sh
│       │   ├── prepare_dict2.sh
│       │   ├── prepare_grammar.sh
│       │   ├── sample_data.sh
│       │   ├── score.sh
│       │   ├── train_lms.sh
│       │   └── train_lms_extra.sh
│       ├── path.sh
│       ├── results.sh
│       ├── run.sh
│       ├── sample.sh
│       ├── steps
│       └── utils
├── conf
│   └── slurm.conf
├── environment.yml
├── images
│   └── high-level.png
├── run.sh
├── run_cmn.sh
├── src
│   ├── __pycache__
│   │   ├── splice_bigram_random.cpython-38.pyc
│   │   ├── splice_unigram.cpython-38.pyc
│   │   ├── splice_unigram.cpython-39.pyc
│   │   ├── splice_unigram_improved.cpython-37.pyc
│   │   ├── splice_unigram_improved.cpython-38.pyc
│   │   ├── splice_unigram_improved.cpython-39.pyc
│   │   ├── utils.cpython-37.pyc
│   │   └── utils.cpython-38.pyc
│   ├── generate_bigram.py
│   ├── generate_unigram.py
│   ├── generate_unigram_improved.py
│   ├── seg2rec_ctm.py
│   ├── setup_recording_dict.py
│   ├── setup_supervision_bigram_dict.py
│   ├── setup_supervision_dict.py
│   ├── setup_supervision_improved_dict.py
│   ├── splice_bigram_random.py
│   ├── splice_unigram.py
│   ├── splice_unigram_improved.py
│   └── utils.py
├── test
│   ├── bigram_supervisions.pkl
│   ├── recording_dict.pkl
│   └── supervisions.pkl
└── utils
    ├── make_utt2spk.py
    └── make_wav_scp.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# **Speech Collage**

This repository contains the code used for the paper ["SPEECH COLLAGE: CODE-SWITCHED AUDIO GENERATION BY COLLAGING MONOLINGUAL CORPORA"](https://arxiv.org/pdf/2309.15674.pdf).

🔹 **Dataset Samples:** You can listen to a sample of the generated audio [here]().

---

## **High-Level Approach Description**

![Proposed Approach](images/high-level.png)

---

## Requirements

### Python Environment

- Python version `3.8.12`
- To create an Anaconda environment, run the following command:

```bash
conda env create -f environment.yml
```

### Install Necessary Toolkits

1. Install ESPnet and Kaldi by following the instructions provided [here](https://espnet.github.io/espnet/installation.html).

2. Install the SoX format libraries:

```bash
sudo apt-get install libsox-fmt-all
```

## Steps to Generate Audio from Monolingual Data

1. Train a standard HMM-GMM ASR system following the standard Kaldi recipes for your monolingual data. You can also follow the provided monolingual Chinese-English (Aishell + TED-LIUM 3) recipe in `asr1/kaldi_cmn/`.

2. Generate the alignment (ctm) file using the Kaldi script `steps/get_train_ctm.sh` and save it in your `data_dir`. Additionally, copy the `text` file (in this case, the code-switched text) used for generation. Note that you can use any text as long as you have monolingual audio for that text.

To generate the ctm using Kaldi:

```bash
steps/get_train_ctm.sh --use-segments false data/train data/lang exp/tri3_ali data_dir/ctm.mono
```

If the first column of the `ctm` file contains segment IDs, run:

```bash
python src/seg2rec_ctm.py data_dir
```

This will convert the segment IDs to the names of the audio recordings from `wav.scp`.

### Note: From this step onward, you can follow `run.sh` for an automated execution of the procedures below.

3. Following the Kaldi style, copy the `wav.scp` file containing the monolingual utterances to `data_dir`. Generate a recording dictionary as follows:

```bash
python src/setup_recording_dict.py ${indir}/wav.scp outdir
```

4. With the ctm file for the monolingual utterances and the recording dictionary, create a supervision dictionary. Choose one of the following options based on your requirements:

- For randomly generated utterances with unigram units and no signal enhancement:

```bash
python src/setup_supervision_dict.py data_dir/ctm.mono outdir/recording_dict.pkl outdir
```

- For randomly generated utterances with unigram units and signal enhancement:

```bash
python src/setup_supervision_improved_dict.py data_dir/ctm.mono outdir/recording_dict.pkl outdir
```

- For randomly generated utterances with bigram units and signal enhancement:

```bash
python src/setup_supervision_bigram_dict.py data_dir/ctm.mono outdir/recording_dict.pkl outdir
```

5. Run the audio generation. Below is an example for bigram-based generation:

```bash
./src/generate_bigram.py \
    --input text \
    --output outdir/bigrams \
    --data outdir \
    --jobs $nj
```

6. Once the audio files have been generated, run `make_wav_scp.py` to create the `wav.scp` file (a sketch of what this step produces is shown after the command):

```bash
python utils/make_wav_scp.py --audio-dir outdir/bigrams --out-dir data_dir_mode
```
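
Each line of a Kaldi-style `wav.scp` maps an utterance ID to the path of its audio file. The snippet below is a minimal, hypothetical sketch of such a helper: it assumes the generated audio files are `.wav` files named after their utterance IDs, and it is not necessarily identical to this repository's `utils/make_wav_scp.py`.

```python
#!/usr/bin/env python3
# Hypothetical sketch: build a Kaldi-style wav.scp from a directory of
# generated .wav files. Not necessarily identical to utils/make_wav_scp.py.
import argparse
from pathlib import Path


def main():
    parser = argparse.ArgumentParser(description="Write wav.scp for generated audio")
    parser.add_argument("--audio-dir", required=True,
                        help="directory containing the generated .wav files")
    parser.add_argument("--out-dir", required=True,
                        help="Kaldi-style data directory to write wav.scp into")
    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    with open(out_dir / "wav.scp", "w", encoding="utf-8") as scp:
        for wav in sorted(Path(args.audio_dir).glob("*.wav")):
            # Each line: <utterance-id> <absolute-path-to-audio>
            scp.write(f"{wav.stem} {wav.resolve()}\n")


if __name__ == "__main__":
    main()
```

Whatever tool you use, the utterance IDs written here must match the ones used in `text`, `utt2spk`, and `spk2utt` created in the next step.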
7. Create the rest of the necessary files: `text`, `utt2spk`, and `spk2utt`:

```bash
cp outdir/bigrams/transcripts.txt data_dir_mode/text
cat data_dir_mode/wav.scp | awk '{print $1 " " $1}' > data_dir_mode/utt2spk
cp data_dir_mode/utt2spk data_dir_mode/spk2utt
```

8. Use the resulting `data_dir_mode` data folder for ESPnet training.


## Cite the Paper

If you use this code for your work, please consider citing the paper:

```bibtex
@INPROCEEDINGS{10446857,
  author={Hussein, Amir and Zeinali, Dorsa and Klejch, Ondřej and Wiesner, Matthew and Yan, Brian and Chowdhury, Shammur and Ali, Ahmed and Watanabe, Shinji and Khudanpur, Sanjeev},
  booktitle={ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={Speech Collage: Code-Switched Audio Generation by Collaging Monolingual Corpora},
  year={2024},
  pages={12006-12010},
  keywords={Training;Speech coding;Zero-shot learning;Splicing;Signal processing;Data augmentation;Data models;Code-switching;ASR;data augmentation;end-to-end;zero-shot learning},
  doi={10.1109/ICASSP48485.2024.10446857}
}
```

--------------------------------------------------------------------------------
/asr1/espnet_cmn/cmd.sh:
--------------------------------------------------------------------------------
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#  run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#    --time