├── Makefile ├── environment.yml ├── .gitignore ├── data ├── breadcrumbs.md ├── HA │ └── keywords.txt ├── eval_spk.list ├── dev_spk.list └── train_spk.list ├── paths.py ├── install_local.sh ├── samediff ├── readme.md ├── get_npz_keys.py ├── create_labels.py ├── create_speakers.py ├── run_samediff.sh ├── run_local.py └── run_calcdists.sh ├── downsample ├── readme.md └── downsample.py ├── src ├── print_dict.py └── plotting.py ├── qbe ├── combine_model_output.py ├── .ipynb_checkpoints │ └── sandbox-checkpoint.ipynb ├── readme.md ├── extract_queries_link_search.py ├── dense_seg_mvn.py ├── get_dtw_costs.py ├── data_prep_dense_seg.py ├── get_dense_seg_costs.py ├── sandbox.ipynb ├── apply_model_dense.py └── eval_qbe.py ├── blackbox ├── npz_to_tsv.py ├── readme.md ├── logreg_speaker.py ├── hierarchical_clustering.py ├── logreg_pronlength.py ├── dp_align.py ├── extract_analysis_features.py └── analyse_pairs.py ├── embeddings ├── readme.md ├── eval_samediff.py ├── link_mfcc.py ├── apply_model_to_npz.py ├── data_io.py ├── apply_model.py └── analyse_embeds.py ├── readme.md ├── notebooks └── sandbox_splitnet.ipynb └── features ├── analyse_utd_pairs.py ├── utils.py └── features.py /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | nosetests -v 3 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: tf1.13 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.7 7 | - numpy 8 | - tensorflow-gpu=1.13.1 9 | - tqdm 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !*/ 3 | !.* 4 | !*.py 5 | !*.pyx 6 | !*.md 7 | !*.sh 8 | !*.yml 9 | !*.ipynb 10 | !*.conf 11 | !Makefile 12 | !docker/Dockerfile* 13 | !data/* 14 | !data/*/* 15 | notebooks/.ipynb_checkpoints/* 16 | -------------------------------------------------------------------------------- /data/breadcrumbs.md: -------------------------------------------------------------------------------- 1 | Breadcrumbs 2 | ----------- 3 | - Wordpairs obtained from `/disk/scratch/s1680167/zero/data/word_pairs/`. 4 | - Speaker lists obtained from the GlobalPhone Kaldi recipe; the training list 5 | was generated. 
6 | -------------------------------------------------------------------------------- /paths.py: -------------------------------------------------------------------------------- 1 | # gp_data_dir = "/group/corporapublic/global_phone/" 2 | # gp_alignments_dir = "/disk/scratch/v1hkampe/endgame/datasets/globalphone_alignments/" 3 | gp_data_dir = "/home/kamperh/endgame/datasets/globalphone/" 4 | gp_alignments_dir = "/home/kamperh/endgame/datasets/globalphone_alignments/" 5 | -------------------------------------------------------------------------------- /data/HA/keywords.txt: -------------------------------------------------------------------------------- 1 | amfani 2 | amurka 3 | arziki 4 | babban 5 | bayan 6 | bayyana 7 | birnin 8 | daban 9 | daular 10 | domin 11 | duniya 12 | hankali 13 | hanyar 14 | harkokin 15 | kasar 16 | kasashe 17 | kasashen 18 | lokacin 19 | majalisar 20 | mutane 21 | samun 22 | sarki 23 | sosai 24 | tattalin 25 | tsakanin 26 | wajen 27 | wanda 28 | wannan 29 | zaman 30 | zamanin 31 | -------------------------------------------------------------------------------- /install_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | if [ ! -d ../src ]; then 5 | mkdir ../src/ 6 | fi 7 | cd ../src/ 8 | 9 | # Install speech_dtw 10 | if [ ! -d speech_dtw ]; then 11 | git clone https://github.com/kamperh/speech_dtw.git 12 | cd speech_dtw 13 | make 14 | make test 15 | cd - 16 | fi 17 | 18 | # Install shorten 19 | if [ ! -d shorten-3.6.1 ]; then 20 | wget https://download.tuxfamily.org/xcfaudio/PROG_ABS_FRUGALWARE/SHORTEN/shorten-3.6.1.tar.gz 21 | # wget http://etree.org/shnutils/shorten/dist/src/shorten-3.6.1.tar.gz 22 | tar -zxf shorten-3.6.1.tar.gz 23 | cd shorten-3.6.1 24 | ./configure --prefix=`pwd` 25 | make 26 | make install 27 | fi 28 | 29 | set +e 30 | -------------------------------------------------------------------------------- /data/eval_spk.list: -------------------------------------------------------------------------------- 1 | AR TBA 2 | BG 040 059 063 068 095 109 110 3 | CR 037 038 039 040 041 042 043 044 045 047 4 | CZ 084 086 088 090 092 094 096 098 100 102 5 | FR 091 092 093 094 095 096 097 098 6 | GE 018 020 021 026 029 073 7 | HA 002 014 025 028 030 052 053 062 070 088 8 | JA XXX 9 | KO 019 029 032 042 051 064 069 080 082 088 10 | CH 080 081 082 083 084 085 086 087 088 089 11 | PO 135 137 138 139 142 143 312 12 | PL 050 001 031 043 023 004 098 009 044 033 13 | RU 002 027 036 063 069 092 102 104 109 112 14 | WU TBA 15 | SP 011 012 013 014 015 016 017 018 16 | SW 040 041 042 043 044 060 061 062 063 064 17 | TH 101 102 103 104 105 106 107 108 18 | TA TBA 19 | TU 025 030 031 032 037 039 041 046 056 063 20 | VN 092 094 096 098 102 103 106 110 113 21 | -------------------------------------------------------------------------------- /data/dev_spk.list: -------------------------------------------------------------------------------- 1 | AR TBA 2 | BG 051 055 058 084 090 100 106 3 | CR 033 034 035 036 046 048 051 053 054 057 4 | CZ 083 085 087 089 091 093 095 097 099 101 5 | FR 082 083 084 085 086 087 088 089 6 | GE 001 002 003 004 008 010 7 | HA 018 031 034 038 046 047 050 055 058 072 8 | JA XXX 9 | KO 006 012 025 040 045 061 084 086 091 098 10 | CH 028 029 030 031 032 039 040 041 042 043 044 11 | PO 064 072 102 103 104 132 133 134 12 | PL 097 046 041 005 012 063 040 030 090 011 13 | RU 005 033 042 065 078 097 103 106 110 122 14 | WU TBA 15 | SP 001 002 003 004 005 006 007 008 009 010 16 | SW 
045 046 047 048 049 066 067 068 069 17 | TH 023 025 028 037 045 061 073 085 18 | TA TBA 19 | TU 001 002 003 005 006 008 013 014 015 016 019 20 | VN 200 201 202 203 204 205 206 207 208 21 | -------------------------------------------------------------------------------- /samediff/readme.md: -------------------------------------------------------------------------------- 1 | Same-Different Evaluation 2 | ========================= 3 | 4 | Overview 5 | -------- 6 | Performs same-different evaluation on frame-level features using dynamic time 7 | warping (DTW) alignment. 8 | 9 | 10 | Evaluation 11 | ---------- 12 | This needs to be run on a multi-core machine. Change the `n_cpus` variable in 13 | `run_calcdists.sh` and `run_samediff.sh` to the number of CPUs on the machine. 14 | 15 | As an example, to evaluate the Spanish development MFCCs: 16 | 17 | ./run_calcdists.sh ../features/mfcc/KO/ko.dev.gt_words.npz # finish first 18 | ./run_samediff.sh ../features/mfcc/KO/ko.dev.gt_words.npz 19 | 20 | 21 | Results 22 | ------- 23 | *(Deprecated)* SWDP average precision: 24 | 25 | - CH dev: 0.15380600 26 | - CR dev: 0.13270483 27 | - HA dev: 0.21368697 28 | - SP dev: 0.19288643 29 | - SW dev: 0.10928384 30 | - TU dev: 0.18624635 31 | 32 | - GE dev: 0.22482616 33 | - KO dev: 0.15748395 34 | 35 | - SP eval: 0.29650854 36 | -------------------------------------------------------------------------------- /downsample/readme.md: -------------------------------------------------------------------------------- 1 | Downsampled Acoustic Word Embeddings 2 | ==================================== 3 | 4 | Overview 5 | -------- 6 | MFCCs are downsampled to obtain acoustic word embeddings. These are evaluated 7 | using same-different evaluation. 8 | 9 | 10 | Downsampling 11 | ------------ 12 | Perform downsampling on MFCCs without deltas: 13 | 14 | mkdir -p exp/SP 15 | ./downsample.py --technique resample --frame_dims 13 \ 16 | ../features/mfcc/CH/ch.eval.gt_words.npz \ 17 | exp/CH/mfcc.eval.gt_words.downsample_10.npz 10 18 | 19 | 20 | Evaluation 21 | ---------- 22 | Evaluate and analyse downsampled MFCCs without deltas: 23 | 24 | ../embeddings/eval_samediff.py --mvn \ 25 | exp/SP/mfcc.dev.gt_words.downsample_10.npz 26 | ../embeddings/analyse_embeds.py --normalize --word_type \ 27 | guatemala,presidente,autoridades,candidatos,asesinato,presupuesto,vicepresidente,negociaciones,netanyahu,social,explotaciones \ 28 | exp/SP/mfcc.dev.gt_words.downsample_10.npz 29 | 30 | 31 | Results 32 | ------- 33 | SWDP average precision: 34 | 35 | - SP dev: 0.14567458 36 | 37 | 38 | *(Deprecated)* SWDP average precision: 39 | 40 | - CH dev: 0.11420457 41 | - CR dev: 0.11620668 42 | - HA dev: 0.11831970 43 | - SP dev: 0.12301926 44 | - SW dev: 0.06808896 45 | - TU dev: 0.13914600 46 | 47 | - GE dev: 0.08031011 48 | - KO dev: 0.13563458 49 | - TH dev: 0.08781202 50 | - VN dev: 0.02734849 51 | 52 | - SP eval: 0.19438775 53 | -------------------------------------------------------------------------------- /src/print_dict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Print the contents of a pickled dictionary. 
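The dictionary entries are printed in sorted key order, one `key : value` pair per line.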
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2020 9 | """ 10 | 11 | from os import path 12 | import argparse 13 | import pickle 14 | import sys 15 | 16 | 17 | #-----------------------------------------------------------------------------# 18 | # UTILITY FUNCTIONS # 19 | #-----------------------------------------------------------------------------# 20 | 21 | def check_argv(): 22 | """Check the command line arguments.""" 23 | parser = argparse.ArgumentParser( 24 | description=__doc__.strip().split("\n")[0], add_help=False 25 | ) 26 | parser.add_argument("pickle_dict_fn", type=str, help="pickled dictionary") 27 | if len(sys.argv) == 1: 28 | parser.print_help() 29 | sys.exit(1) 30 | return parser.parse_args() 31 | 32 | 33 | #-----------------------------------------------------------------------------# 34 | # MAIN FUNCTION # 35 | #-----------------------------------------------------------------------------# 36 | 37 | def main(): 38 | args = check_argv() 39 | 40 | if path.isfile(args.pickle_dict_fn): 41 | with open(args.pickle_dict_fn, "rb") as f: 42 | d = pickle.load(f) 43 | for key in sorted(d): 44 | print(key, ":", d[key]) 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /samediff/get_npz_keys.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Write the keys in a given Numpy archive. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2015, 2018, 2019 9 | """ 10 | 11 | import argparse 12 | import sys 13 | import numpy as np 14 | 15 | #-----------------------------------------------------------------------------# 16 | # UTILITY FUNCTIONS # 17 | #-----------------------------------------------------------------------------# 18 | 19 | def check_argv(): 20 | """Check the command line arguments.""" 21 | parser = argparse.ArgumentParser( 22 | description=__doc__.strip().split("\n")[0], add_help=False 23 | ) 24 | parser.add_argument("npz_fn", type=str, help="the Numpy archive") 25 | parser.add_argument( 26 | "keys_fn", type=str, help="the file to write the keys to" 27 | ) 28 | if len(sys.argv) == 1: 29 | parser.print_help() 30 | sys.exit(1) 31 | return parser.parse_args() 32 | 33 | 34 | #-----------------------------------------------------------------------------# 35 | # MAIN FUNCTION # 36 | #-----------------------------------------------------------------------------# 37 | 38 | def main(): 39 | args = check_argv() 40 | 41 | npz = np.load(args.npz_fn) 42 | 43 | print("Writing keys:", args.keys_fn) 44 | open(args.keys_fn, "w").write("\n".join(npz.keys()) + "\n") 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /samediff/create_labels.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Create a list of the word labels from a list of utterance IDs. 
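Utterance IDs are assumed to take the form `<word>_<speaker>_...`, so the word label is the first underscore-separated field of each ID.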
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2014, 2018, 2019 9 | """ 10 | 11 | import argparse 12 | import codecs 13 | import sys 14 | 15 | 16 | #-----------------------------------------------------------------------------# 17 | # UTILITY FUNCTIONS # 18 | #-----------------------------------------------------------------------------# 19 | 20 | def check_argv(): 21 | """Check the command line arguments.""" 22 | parser = argparse.ArgumentParser( 23 | description=__doc__.strip().split("\n")[0], add_help=False 24 | ) 25 | parser.add_argument("utterance_ids_fn") 26 | parser.add_argument("labels_fn") 27 | if len(sys.argv) == 1: 28 | parser.print_help() 29 | sys.exit(1) 30 | return parser.parse_args() 31 | 32 | 33 | #-----------------------------------------------------------------------------# 34 | # MAIN FUNCTION # 35 | #-----------------------------------------------------------------------------# 36 | 37 | def main(): 38 | 39 | args = check_argv() 40 | 41 | utt_ids = [i.strip() for i in open(args.utterance_ids_fn)] 42 | labels = [] 43 | for utt_id in utt_ids: 44 | word = utt_id.split("_")[0] #"_".join(utt_id.split("_")[:-2]) 45 | labels.append(word) 46 | with codecs.open(args.labels_fn, "w", "utf-8") as f: 47 | for label in labels: 48 | f.write(label + "\n") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /samediff/create_speakers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Create a list of the speaker labels from a list of utterance IDs. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2014, 2018, 2019 9 | """ 10 | 11 | import argparse 12 | import codecs 13 | import sys 14 | 15 | 16 | #-----------------------------------------------------------------------------# 17 | # UTILITY FUNCTIONS # 18 | #-----------------------------------------------------------------------------# 19 | 20 | def check_argv(): 21 | """Check the command line arguments.""" 22 | parser = argparse.ArgumentParser( 23 | description=__doc__.strip().split("\n")[0], add_help=False 24 | ) 25 | parser.add_argument("utterance_ids_fn") 26 | parser.add_argument("labels_fn") 27 | if len(sys.argv) == 1: 28 | parser.print_help() 29 | sys.exit(1) 30 | return parser.parse_args() 31 | 32 | 33 | #-----------------------------------------------------------------------------# 34 | # MAIN FUNCTION # 35 | #-----------------------------------------------------------------------------# 36 | 37 | def main(): 38 | 39 | args = check_argv() 40 | 41 | utt_ids = [i.strip() for i in open(args.utterance_ids_fn)] 42 | labels = [] 43 | for utt_id in utt_ids: 44 | speaker = utt_id.split("_")[1] #"_".join(utt_id.split("_")[:-2]) 45 | labels.append(speaker) 46 | with codecs.open(args.labels_fn, "w", "utf-8") as f: 47 | for label in labels: 48 | f.write(label + "\n") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /qbe/combine_model_output.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Combine `apply_model_dense_seg.py` output into a single Numpy archive. 
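All `search.*.npz` files in the given experiment directory are read, merged into a single dictionary, and written back to one `search.npz` in the same directory.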
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from os import path 12 | from tqdm import tqdm 13 | import argparse 14 | import pickle 15 | import glob 16 | import numpy as np 17 | import sys 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument("exp_dir", type=str, help="experiments directory") 30 | if len(sys.argv) == 1: 31 | parser.print_help() 32 | sys.exit(1) 33 | return parser.parse_args() 34 | 35 | 36 | #-----------------------------------------------------------------------------# 37 | # MAIN FUNCTION # 38 | #-----------------------------------------------------------------------------# 39 | 40 | def main(): 41 | args = check_argv() 42 | 43 | features_dict = {} 44 | for fn in glob.glob(path.join(args.exp_dir, "search.*.npz")): 45 | print("Reading:", fn) 46 | split_features_dict = np.load(fn) 47 | for key in tqdm(split_features_dict): 48 | features_dict[key] = split_features_dict[key] 49 | # print(split_features_dict[key].shape) 50 | print("Total no. utterances:", len(features_dict)) 51 | 52 | fn = path.join(args.exp_dir, "search.npz") 53 | print("Writing:", fn) 54 | np.savez(fn, **features_dict) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /samediff/run_samediff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Calculate distances for same-different evaluation of autoencoder features. 4 | # Herman Kamper, h.kamper@sms.ed.ac.uk, 2014-2015, 2018. 5 | 6 | # General setup 7 | n_cpus=29 8 | 9 | # Input features 10 | features_npz=$1 # features_npz=../data/mfcc_test.npz 11 | if [ -z $features_npz ]; then 12 | echo "usage: ${0} features_npz" 13 | exit 1 14 | fi 15 | if [ ! -f $features_npz ]; then 16 | echo "Error: $features_npz does not exist" 17 | exit 1 18 | fi 19 | 20 | # Files and directories 21 | basename=`basename $features_npz` 22 | basename="${basename%.*}" 23 | samediff_dir=exp/$basename 24 | pairs=$samediff_dir/pairs.list 25 | pairs_split_dir=$samediff_dir/pairs_split 26 | labels=$samediff_dir/labels.list 27 | speakers=$samediff_dir/speakers.list 28 | distances_split_dir=$samediff_dir/distances_split 29 | distances=$samediff_dir/distances.dist 30 | samediff_result=$samediff_dir/samediff_result.txt 31 | 32 | # Make sure that all the jobs are done 33 | complete=`ls $distances_split_dir/distances.*.log | xargs grep "End time" \ 34 | | wc -l` 35 | echo "Number of splits completed: $complete out of $n_cpus" 36 | if [ "$n_cpus" -ne "$complete" ]; then 37 | echo "Error: wait for jobs to complete" 38 | exit 1 39 | fi 40 | 41 | # Concatenate distances 42 | if [ ! -f $distances ]; then 43 | touch $distances 44 | for JOB in $(seq 1 $n_cpus); do 45 | cat $distances_split_dir/distances.$JOB.dist >> $distances 46 | done 47 | fi 48 | 49 | if [ ! -f $samediff_result ]; then 50 | python ../../src/speech_dtw/utils/samediff.py --binary_dists $labels \ 51 | --speakers_fn $speakers $distances > $samediff_result 52 | echo 53 | cat $samediff_result 54 | echo 55 | 56 | if [ $? 
-ne 0 ]; then 57 | echo "Exiting" 58 | rm $samediff_result 59 | exit 1 60 | fi 61 | fi 62 | 63 | # Clean directories 64 | read -p "Clean distances (y/n)? " -n 1 -r 65 | echo 66 | if [[ $REPLY =~ ^[Yy]$ ]]; then 67 | rm -r $pairs $pairs_split_dir $distances $distances_split_dir 68 | fi 69 | -------------------------------------------------------------------------------- /samediff/run_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Run the same command in parallel on an SGE grid. 5 | 6 | As an example, run:: 7 | 8 | ./run_local.py 1 3 log.JOB "echo start;sleep 10;echo finished job JOB" 9 | 10 | The final line of output is the last spawned PID. 11 | 12 | Author: Herman Kamper 13 | Contact: kamperh@gmail.com 14 | Date: 2014, 2018, 2019 15 | """ 16 | 17 | import argparse 18 | import subprocess 19 | import sys 20 | import re 21 | 22 | shell = lambda command: subprocess.Popen( 23 | command, shell=True, stdout=subprocess.PIPE 24 | ).communicate()[0] 25 | 26 | 27 | def check_argv(): 28 | """Check the command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description=__doc__.strip().split("\n")[0], add_help=False 31 | ) 32 | parser.add_argument("JOB_start", type=int, help="JOB id start value") 33 | parser.add_argument( 34 | "JOB_end", type=int, help="JOB id end value (exclusive)" 35 | ) 36 | parser.add_argument( 37 | "log_fn", type=str, 38 | help="log file, substituting JOB for the current id" 39 | ) 40 | parser.add_argument( 41 | "command", type=str, 42 | help="execute this command, substituting JOB for the current" 43 | " id (enclose in quotes if using parameters)" 44 | ) 45 | if len(sys.argv) == 1: 46 | parser.print_help() 47 | sys.exit(1) 48 | return parser.parse_args() 49 | 50 | 51 | def main(): 52 | args = check_argv() 53 | job_start = args.JOB_start 54 | job_end = args.JOB_end 55 | log_fn = args.log_fn 56 | command = args.command 57 | 58 | pid = -1 59 | for i in range(job_start, job_end + 1): 60 | cur_command = re.sub("JOB", str(i), command) 61 | cur_log = re.sub("JOB", str(i), log_fn) 62 | pid = subprocess.Popen( 63 | cur_command, shell=True, stderr=subprocess.STDOUT, 64 | stdout=open(cur_log, "wb") 65 | ).pid 66 | print("Spawning job " + str(i) + " with PID:", pid) 67 | print(pid) 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /samediff/run_calcdists.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Calculate distances for same-different evaluation. 4 | # Herman Kamper, kamperh@gmail.com, 2014-2015, 2018-2019. 5 | 6 | set -e 7 | 8 | # General setup 9 | n_cpus=29 10 | cmd="python run_local.py" 11 | # cmd="./local/run_sge.py --extraargs -P inf_hcrc_cstr_students" 12 | export PYTHONUNBUFFERED="YOUR_SET" # flush after every Python print statement 13 | 14 | 15 | # Input features 16 | features_npz=$1 17 | if [ -z $features_npz ]; then 18 | echo "usage: ${0} features_npz" 19 | exit 1 20 | fi 21 | if [ ! 
-f $features_npz ]; then 22 | echo "Error: $features_npz does not exist" 23 | exit 1 24 | fi 25 | 26 | # Files and directories 27 | basename=`basename $features_npz` 28 | basename="${basename%.*}" 29 | samediff_dir=exp/$basename 30 | utterance_ids=$samediff_dir/utterance_ids.list 31 | pairs=$samediff_dir/pairs.list 32 | pairs_split_dir=$samediff_dir/pairs_split 33 | labels=$samediff_dir/labels.list 34 | speakers=$samediff_dir/speakers.list 35 | distances_split_dir=$samediff_dir/distances_split 36 | distances=$samediff_dir/distances.dist 37 | 38 | # Create samediff dir 39 | [ ! -d $samediff_dir ] && mkdir -p $samediff_dir 40 | 41 | # Create utterance IDs and label files 42 | [ ! -f $utterance_ids ] && python get_npz_keys.py $features_npz $utterance_ids 43 | [ ! -f $labels ] && python create_labels.py $utterance_ids $labels 44 | [ ! -f $speakers ] && python create_speakers.py $utterance_ids $speakers 45 | 46 | # Generate a list of all possible pairs and split for parallel processing 47 | [ ! -f $pairs ] && python ../../src/speech_dtw/utils/create_pair_file.py \ 48 | $utterance_ids $pairs 49 | [ ! -d $pairs_split_dir ] && ../../src/speech_dtw/utils/split_file.py \ 50 | $pairs $n_cpus $pairs_split_dir 51 | 52 | # Calculate DTW distances 53 | if [ ! -d $distances_split_dir ]; then 54 | mkdir -p $distances_split_dir 55 | dist_cmd="python ../../src/speech_dtw/utils/calculate_dtw_costs.py \ 56 | --binary_dists --input_fmt npz $pairs_split_dir/pairs.JOB.list \ 57 | $features_npz $distances_split_dir/distances.JOB.dist" 58 | $cmd 1 $n_cpus $distances_split_dir/distances.JOB.log "$dist_cmd" 59 | fi 60 | 61 | echo "Wait to complete, then run run_samediff.sh" 62 | -------------------------------------------------------------------------------- /blackbox/npz_to_tsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Convert a NumPy archive to a TSV file for visualising embeddings. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from os import path 12 | from tqdm import tqdm 13 | import argparse 14 | import numpy as np 15 | import sys 16 | 17 | 18 | #-----------------------------------------------------------------------------# 19 | # UTILITY FUNCTIONS # 20 | #-----------------------------------------------------------------------------# 21 | 22 | def check_argv(): 23 | """Check the command line arguments.""" 24 | parser = argparse.ArgumentParser( 25 | description=__doc__.strip().split("\n")[0], add_help=False 26 | ) 27 | parser.add_argument("npz_fn", type=str, help="input NumPy archive") 28 | parser.add_argument( 29 | "tsv_fn", type=str, help="output TSV file; if 'auto', then an output " 30 | "filename is generated automatically based on the input filename" 31 | ) 32 | if len(sys.argv) == 1: 33 | parser.print_help() 34 | sys.exit(1) 35 | return parser.parse_args() 36 | 37 | 38 | #-----------------------------------------------------------------------------# 39 | # MAIN FUNCTION # 40 | #-----------------------------------------------------------------------------# 41 | 42 | def main(): 43 | args = check_argv() 44 | 45 | print("Reading:", args.npz_fn) 46 | features = np.load(args.npz_fn) 47 | 48 | if args.tsv_fn == "auto": 49 | npz_fn_split = path.split(args.npz_fn) 50 | args.tsv_fn = ( 51 | path.split(npz_fn_split[-2])[-1] + "." 
+ 52 | path.splitext(npz_fn_split[-1])[0] + ".tsv" 53 | ) 54 | metadata_fn = args.tsv_fn + ".metadata" 55 | print("Writing:", args.tsv_fn) 56 | print("Writing:", metadata_fn) 57 | with open(args.tsv_fn, "w") as f_tsv, open(metadata_fn, "w") as f_metadata: 58 | f_metadata.write("word\tspeaker\n") 59 | for utt_key in tqdm(sorted(features)): 60 | f_tsv.write( 61 | "\t".join(["{:.5f}".format(i) for i in features[utt_key]]) + 62 | "\n" 63 | ) 64 | utt_key_split = utt_key.split("_") 65 | f_metadata.write(utt_key_split[0] + "\t" + utt_key_split[1] + "\n") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /qbe/.ipynb_checkpoints/sandbox-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sandox: Hauso QbE" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Herman Kamper, Stellenbosch University, 2018-2019." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Preliminaries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "%matplotlib inline\n", 31 | "%load_ext autoreload\n", 32 | "%autoreload 2\n", 33 | "\n", 34 | "from os import path\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import numpy as np" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Keywords" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 8, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "dev_keywords_fn = \"../features/mfcc/HA/ha.dev.gt_words.npz\"\n", 53 | "test_fn = \"../features/mfcc/HA/ha.eval.npz\"\n", 54 | "dev_keywords_features = np.load(dev_keywords_fn)\n", 55 | "test_features = np.load(test_fn)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "'HA002_94'" 67 | ] 68 | }, 69 | "execution_count": 9, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "list(test_features)[0]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "def read_forced_alignment(globalphone_fa_fn):\n", 85 | " \"\"\"Read a GlobalPhone forced alignment file.\"\"\"\n", 86 | "\n", 87 | "test_transcript = read_forced_alignment(\"/home/kamperh/endgame/datasets/globalphone_alignments/HA/eval.ctm\")" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.5.2" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /blackbox/readme.md: -------------------------------------------------------------------------------- 1 | Black-Box Analysis of Embedding Models 2 | ====================================== 3 | 4 | Extract features for analysis 5 | 
----------------------------- 6 | While the default evaluation data (typically including a `gt_words` tag) are 7 | extracted with a minimum duration of 0.5 seconds at at least 5 characters, it 8 | is useful to do analysis on a larger range of word segments. This is done in 9 | the script below. 10 | 11 | Extract features and perform intermediate analysis: 12 | 13 | ./extract_analysis_features.py --analyse RU 14 | 15 | 16 | Process features with model 17 | --------------------------- 18 | The extracted features would typically be passed through a model. 19 | 20 | For instance, to obtain downsampled embeddings, run: 21 | 22 | cd ../downsample 23 | ./downsample.py --technique resample --frame_dims 13 \ 24 | ../blackbox/mfcc/GE/ge.dev.filter1_gt.npz \ 25 | exp/GE/mfcc.dev.filter1_gt.downsample_10.npz 10 26 | ../embeddings/eval_samediff.py --mvn \ 27 | exp/GE/mfcc.dev.filter1_gt.downsample_10.npz 28 | cd - 29 | 30 | To obtain embeddings from a particular model, run: 31 | 32 | cd ../embeddings 33 | ./apply_model_to_npz.py \ 34 | models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ckpt \ 35 | ../blackbox/mfcc/GE/ge.dev.filter1_gt.npz 36 | ./eval_samediff.py --mvn \ 37 | models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ge.dev.filter1_gt.npz 38 | cd - 39 | 40 | 41 | t-SNE visualisation 42 | ------------------- 43 | To visualise embeddings, https://projector.tensorflow.org/ can be used. To 44 | generate the input required by this tool, run: 45 | 46 | ./npz_to_tsv.py \ 47 | ../embeddings/models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ge.dev.filter1_gt.npz 48 | 49 | and load the data into the tool. 50 | 51 | 52 | Agglomerative clustering 53 | ------------------------ 54 | Clustering can be applied and visualised by running: 55 | 56 | ./hierarchical_clustering.py --n_samples 1000 \ 57 | ../embeddings/models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ge.dev.filter1_gt.npz 58 | 59 | Here, the colouring in the labels indicate the speaker for that token. 60 | 61 | 62 | Classifier analysis 63 | ------------------- 64 | Perform speaker classification by training a multi-class logistic regression 65 | classifier on 80% of the data and then test on the remaining 20%: 66 | 67 | ./logreg_speaker.py \ 68 | ../downsample/exp/GE/mfcc.dev.gt_words.downsample_10.npz 69 | ./logreg_speaker.py \ 70 | ../embeddings/models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.GE.val.npz 71 | 72 | Perform length (number of phones) classification: 73 | 74 | # To-do: should train and test on different sets here 75 | ./logreg_pronlength.py \ 76 | ../downsample/exp/GE/mfcc.dev.filter1_gt.downsample_10.npz GE 77 | ./logreg_pronlength.py \ 78 | ../embeddings/models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ge.dev.filter1_gt.npz GE 79 | -------------------------------------------------------------------------------- /qbe/readme.md: -------------------------------------------------------------------------------- 1 | Query-by-Example Search on Hausa 2 | ================================ 3 | 4 | Overview 5 | -------- 6 | Queries are extracted from validation data and the evaluation data is treated 7 | as the search collection. 
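Both pipelines below end with a nested cost dictionary, indexed first by query key and then by search-utterance key, saved as a pickle file (this is the structure written by `get_dtw_costs.py`). The snippet below is only an illustrative sketch of how such a dictionary could be inspected once the steps in the following sections have been run; the pickle path is the one from the DTW example further down.

    import pickle

    # Load a QbE cost dictionary (produced by the steps in the sections below)
    with open("exp/HA/dtw/cost_dict.pkl", "rb") as f:
        cost_dict = pickle.load(f)

    # Rank the search utterances for one query by ascending DTW cost
    # (a lower cost indicates a better match)
    query_key = sorted(cost_dict)[0]
    ranked = sorted(cost_dict[query_key].items(), key=lambda kv: kv[1])
    print("Query:", query_key)
    for search_key, cost in ranked[:10]:
        print("{:.4f}  {}".format(cost, search_key))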
8 | 9 | 10 | Prepare and link data 11 | --------------------- 12 | Extract features and link the required speech features: 13 | 14 | ./extract_queries_link_search.py HA 15 | 16 | Extract the search intervals: 17 | 18 | ./data_prep_dense_seg.py --min_frames 20 --max_frames 60 --step 3 \ 19 | --n_splits 2 HA 20 | 21 | 22 | DTW-based QbE 23 | ------------- 24 | Get QbE costs and write these to file: 25 | 26 | ./get_dtw_costs.py --n_cpus 29 HA 27 | 28 | Evaluate QbE performance: 29 | 30 | ./eval_qbe.py HA exp/HA/dtw/cost_dict.pkl 31 | 32 | HA results: 33 | 34 | Avg. duration per comparison: 0.057 sec 35 | --------------------------------------------------------------------------- 36 | EER: 0.2655, avg: 0.2918, median: 0.2783, max: 0.4505, min: 0.1844 37 | AUC: 0.8002, avg: 0.7724, median: 0.7960, max: 0.8766, min: 0.5752 38 | P@10: 0.4139, avg: 0.3468, median: 0.3550, max: 0.5433, min: 0.0933 39 | P@N: 0.3257, avg: 0.2870, median: 0.2937, max: 0.4471, min: 0.0836 40 | --------------------------------------------------------------------------- 41 | 42 | 43 | Embedding-based QbE 44 | ------------------- 45 | Apply a CAE-RNN to the dense intervals for the different splits: 46 | 47 | ./apply_model_dense.py \ 48 | ../embeddings/models/HA.utd/train_cae_rnn/5addd62282/cae.best_val.ckpt \ 49 | HA search.0 50 | ./apply_model_dense.py \ 51 | ../embeddings/models/HA.utd/train_cae_rnn/5addd62282/cae.best_val.ckpt \ 52 | HA search.1 53 | 54 | Combine the splits: 55 | 56 | ./combine_model_output.py exp/HA/5addd62282.min_20.max_60.step_3 57 | 58 | Remove split files: 59 | 60 | rm exp/HA/*/search.?.npz 61 | 62 | Embed the queries: 63 | 64 | ../embeddings/apply_model_to_npz.py \ 65 | ../embeddings/models/HA.utd/train_cae_rnn/5addd62282/cae.best_val.ckpt \ 66 | data/HA/queries.npz \ 67 | --output_npz_fn exp/HA/5addd62282.min_20.max_60.step_3/queries.npz 68 | 69 | Apply normalisation: 70 | 71 | ./dense_seg_mvn.py exp/HA/5addd62282.min_20.max_60.step_3 72 | 73 | Calculate costs: 74 | 75 | # Unnormalised 76 | ./get_dense_seg_costs.py exp/HA/5addd62282.min_20.max_60.step_3 77 | # MVN 78 | ./get_dense_seg_costs.py exp/HA/mvn.5addd62282.min_20.max_60.step_3 79 | 80 | Evaluate QbE performance: 81 | 82 | # Unnormalised 83 | ./eval_qbe.py HA \ 84 | exp/HA/5addd62282.min_20.max_60.step_3/cost_dict.cosine.pkl 85 | # MVN 86 | ./eval_qbe.py HA \ 87 | exp/HA/mvn.5addd62282.min_20.max_60.step_3/cost_dict.cosine.pkl 88 | 89 | HA results with normalisation: 90 | 91 | Avg. duration per comparison: 0.00061147 sec 92 | --------------------------------------------------------------------------- 93 | EER: 0.2354, avg: 0.2720, median: 0.2419, max: 0.4669, min: 0.1669 94 | AUC: 0.8254, avg: 0.7879, median: 0.8284, max: 0.8813, min: 0.5541 95 | P@10: 0.3745, avg: 0.3045, median: 0.3133, max: 0.5233, min: 0.0600 96 | P@N: 0.3053, avg: 0.2515, median: 0.2618, max: 0.4180, min: 0.0528 97 | --------------------------------------------------------------------------- 98 | -------------------------------------------------------------------------------- /qbe/extract_queries_link_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Extract queries and link search datasets for a particular GlobalPhone language. 
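Development tokens whose word labels appear in `../data/<language>/keywords.txt` are extracted as queries, and the corresponding evaluation features are linked as the search collection.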
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from os import path 12 | from tqdm import tqdm 13 | import argparse 14 | import codecs 15 | import numpy as np 16 | import os 17 | import sys 18 | 19 | sys.path.append(path.join("..", "embeddings")) 20 | 21 | from link_mfcc import link_features 22 | 23 | 24 | #-----------------------------------------------------------------------------# 25 | # UTILITY FUNCTIONS # 26 | #-----------------------------------------------------------------------------# 27 | 28 | def check_argv(): 29 | """Check the command line arguments.""" 30 | parser = argparse.ArgumentParser( 31 | description=__doc__.strip().split("\n")[0], add_help=False 32 | ) 33 | parser.add_argument( 34 | "language", type=str, help="GlobalPhone language", 35 | choices=["HA"] 36 | ) 37 | if len(sys.argv) == 1: 38 | parser.print_help() 39 | sys.exit(1) 40 | return parser.parse_args() 41 | 42 | 43 | #-----------------------------------------------------------------------------# 44 | # MAIN FUNCTION # 45 | #-----------------------------------------------------------------------------# 46 | 47 | def main(): 48 | args = check_argv() 49 | 50 | # Create feature/link directory 51 | feat_dir = path.join("data", args.language) 52 | if not path.isdir(feat_dir): 53 | os.makedirs(feat_dir) 54 | 55 | # Read keywords 56 | keywords_fn = path.join("..", "data", args.language, "keywords.txt") 57 | with codecs.open(keywords_fn, "r", "utf-8") as f: 58 | keywords = [line.strip() for line in f] 59 | print("No. keywords:", len(keywords)) 60 | 61 | # Extract queries from development data 62 | queries_feat_fn = path.join(feat_dir, "queries.npz") 63 | if not path.isfile(queries_feat_fn): 64 | dev_feat_fn = path.join( 65 | "..", "features", "mfcc", args.language, args.language.lower() + 66 | ".dev.gt_words.npz" 67 | ) 68 | assert path.isfile(dev_feat_fn), "file not found: " + dev_feat_fn 69 | print("Reading:", dev_feat_fn) 70 | dev_feat_dict = np.load(dev_feat_fn) 71 | print("Extracting queries:") 72 | queries_feat_dict = {} 73 | for utterance_key in tqdm(dev_feat_dict): 74 | label = utterance_key.split("_")[0] 75 | if label in keywords: 76 | queries_feat_dict[utterance_key] = dev_feat_dict[utterance_key] 77 | print("No. queries tokens:", len(queries_feat_dict)) 78 | print("Writing:", queries_feat_fn) 79 | np.savez(queries_feat_fn, **queries_feat_dict) 80 | else: 81 | print("Using existing file:", queries_feat_fn) 82 | 83 | # Link test search utterances 84 | search_feat_fn = path.join( 85 | "..", "..", "..", "features", "mfcc", args.language, 86 | args.language.lower() + ".eval.npz" 87 | ) # relative path 88 | link_fn = path.join(feat_dir, "search.npz") 89 | link_features(search_feat_fn, link_fn, feat_dir) 90 | 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /qbe/dense_seg_mvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Perform mean and variance normalisation. 
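The mean and variance are estimated on the search collection and applied to both the queries and the search segments; the normalised archives are written to a new `mvn.`-prefixed experiment directory.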
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from tqdm import tqdm 14 | import argparse 15 | import numpy as np 16 | import os 17 | import sys 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument("exp_dir", type=str, help="experiments directory") 30 | if len(sys.argv) == 1: 31 | parser.print_help() 32 | sys.exit(1) 33 | return parser.parse_args() 34 | 35 | 36 | #-----------------------------------------------------------------------------# 37 | # MAIN FUNCTION # 38 | #-----------------------------------------------------------------------------# 39 | 40 | def main(): 41 | args = check_argv() 42 | 43 | print(datetime.now()) 44 | 45 | # Read queries 46 | fn = path.join(args.exp_dir, "queries.npz") 47 | if not path.isfile(fn): 48 | import re 49 | fn = path.join( 50 | re.sub("min\_.*step\_\d*\.", "", args.exp_dir), "queries.npz" 51 | ) 52 | print("Reading:", fn) 53 | queries_dict = np.load(fn) 54 | print("No. queries:", len(queries_dict.keys())) 55 | 56 | # Read search collection 57 | fn = path.join(args.exp_dir, "search.npz") 58 | print("Reading:", fn) 59 | search_dict = np.load(fn) 60 | print("No. search utterances:", len(search_dict.keys())) 61 | 62 | # Calculate mean and variance 63 | search_stacked = np.vstack([search_dict[i] for i in search_dict]) 64 | mean = np.mean(search_stacked, axis=0) 65 | std = np.std(search_stacked, axis=0) 66 | std[std == 0] = np.mean(std) # hack 67 | 68 | # Apply normalisation 69 | mvn_queries_dict = {} 70 | print("Normalising queries:") 71 | for query_key in tqdm(queries_dict): 72 | mvn_queries_dict[query_key] = ( 73 | np.array(queries_dict[query_key]) - mean 74 | ) / std 75 | print("No. queries:", len(mvn_queries_dict)) 76 | mvn_search_dict = {} 77 | print("Normalising search utterances:") 78 | for search_key in tqdm(search_dict): 79 | mvn_search_dict[search_key] = ( 80 | np.array(search_dict[search_key]) - mean 81 | ) / std 82 | print("No. search utterances:", len(mvn_search_dict)) 83 | 84 | print(datetime.now()) 85 | 86 | # Create output directory 87 | exp_dir = path.normpath(args.exp_dir) 88 | output_dir = path.join( 89 | path.split(exp_dir)[0], "mvn." + path.split(exp_dir)[1] 90 | ) 91 | if not path.isdir(output_dir): 92 | os.makedirs(output_dir) 93 | 94 | # Write normalized Numpy archives 95 | fn = path.join(output_dir, "queries.npz") 96 | print("Writing:", fn) 97 | np.savez(fn, **mvn_queries_dict) 98 | fn = path.join(output_dir, "search.npz") 99 | print("Writing:", fn) 100 | np.savez(fn, **mvn_search_dict) 101 | 102 | print(datetime.now()) 103 | 104 | 105 | if __name__ == "__main__": 106 | main() 107 | -------------------------------------------------------------------------------- /qbe/get_dtw_costs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Obtain the QbE costs for a given set of queries and search utterances. 
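Each query is swept over every search utterance using DTW, and the resulting costs are written to `exp/<feature_label>/dtw/cost_dict.pkl` as a nested dictionary indexed by query key and then by search key.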
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | import argparse 14 | import pickle 15 | import numpy as np 16 | import os 17 | import sys 18 | import timeit 19 | 20 | sys.path.append(path.join("..", "..", "src", "speech_dtw")) 21 | 22 | from speech_dtw import qbe 23 | 24 | 25 | #-----------------------------------------------------------------------------# 26 | # UTILITY FUNCTIONS # 27 | #-----------------------------------------------------------------------------# 28 | 29 | def check_argv(): 30 | """Check the command line arguments.""" 31 | parser = argparse.ArgumentParser( 32 | description=__doc__.strip().split("\n")[0], add_help=False 33 | ) 34 | parser.add_argument( 35 | "--n_cpus", type=int, 36 | help="number of CPUs to parallelise over (default: %(default)s)", 37 | default=1 38 | ) 39 | parser.add_argument( 40 | "feature_label", type=str, 41 | help="identifier for the set of queries and search utterances" 42 | ) 43 | if len(sys.argv) == 1: 44 | parser.print_help() 45 | sys.exit(1) 46 | return parser.parse_args() 47 | 48 | 49 | #-----------------------------------------------------------------------------# 50 | # MAIN FUNCTION # 51 | #-----------------------------------------------------------------------------# 52 | 53 | def main(): 54 | args = check_argv() 55 | 56 | print(datetime.now()) 57 | 58 | # Read queries into a list 59 | fn = path.join("data", args.feature_label, "queries.npz") 60 | print("Reading:", fn) 61 | queries_dict = np.load(fn) 62 | queries_keys = sorted(queries_dict.keys()) 63 | queries_list = [ 64 | np.asarray(queries_dict[i], np.double) for i in queries_keys 65 | ] 66 | print("No. queries:", len(queries_list)) 67 | 68 | # Read search collection into a list 69 | fn = path.join("data", args.feature_label, "search.npz") 70 | print("Reading:", fn) 71 | search_dict = np.load(fn) 72 | search_keys = sorted(search_dict.keys()) 73 | search_list = [ 74 | np.asarray(search_dict[i], np.double) for i in search_keys 75 | ] 76 | print("No. search items:", len(search_list)) 77 | 78 | print(datetime.now()) 79 | 80 | # Perform QbE 81 | print("Calculating costs: {} cores".format(args.n_cpus)) 82 | start_time = timeit.default_timer() 83 | dtw_costs = qbe.parallel_dtw_sweep_min( 84 | queries_list, search_list, n_cpus=args.n_cpus 85 | ) 86 | end_time = timeit.default_timer() 87 | duration = end_time - start_time 88 | print(datetime.now()) 89 | print( 90 | "Avg. 
duration per comparison: {:.8f} sec".format(duration * 91 | args.n_cpus / (len(queries_list) * len(search_list))) 92 | ) 93 | 94 | # Write costs 95 | cost_dict = {} 96 | for i_query, key_query in enumerate(queries_keys): 97 | if key_query not in cost_dict: 98 | cost_dict[key_query] = {} 99 | for i_search, key_search in enumerate(search_keys): 100 | cost_dict[key_query][key_search] = dtw_costs[i_query][i_search] 101 | output_dir = path.join("exp", args.feature_label, "dtw") 102 | if not path.isdir(output_dir): 103 | os.makedirs(output_dir) 104 | fn = path.join(output_dir, "cost_dict.pkl") 105 | print("Writing:", fn) 106 | with open(fn, "wb") as f: 107 | pickle.dump(cost_dict, f, -1) 108 | 109 | print(datetime.now()) 110 | 111 | 112 | if __name__ == "__main__": 113 | main() 114 | -------------------------------------------------------------------------------- /downsample/downsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Downsample a given file using a particular technique and target dimensionality. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2015, 2018, 2019 9 | """ 10 | 11 | import argparse 12 | import numpy as np 13 | import scipy.interpolate as interpolate 14 | import scipy.signal as signal 15 | import sys 16 | 17 | flatten_order = "C" 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument("input_npz_fn", type=str, help="input speech file") 30 | parser.add_argument( 31 | "output_npz_fn", type=str, help="output embeddings file" 32 | ) 33 | parser.add_argument("n", type=int, help="number of samples") 34 | parser.add_argument( 35 | "--technique", choices=["interpolate", "resample", "rasanen"], 36 | default="resample" 37 | ) 38 | parser.add_argument( 39 | "--frame_dims", type=int, default=None, 40 | help="only keep these number of dimensions" 41 | ) 42 | if len(sys.argv) == 1: 43 | parser.print_help() 44 | sys.exit(1) 45 | return parser.parse_args() 46 | 47 | 48 | #-----------------------------------------------------------------------------# 49 | # MAIN FUNCTION # 50 | #-----------------------------------------------------------------------------# 51 | 52 | def main(): 53 | args = check_argv() 54 | 55 | print("Reading:", args.input_npz_fn) 56 | input_npz = np.load(args.input_npz_fn) 57 | d_frame = input_npz[sorted(input_npz.keys())[0]].shape[1] 58 | 59 | print("Frame dimensionality:", d_frame) 60 | if args.frame_dims is not None and args.frame_dims < d_frame: 61 | d_frame = args.frame_dims 62 | print("Reducing frame dimensionality:", d_frame) 63 | 64 | print("Downsampling:", args.technique) 65 | output_npz = {} 66 | for key in input_npz: 67 | 68 | # Limit input dimensionailty 69 | y = input_npz[key][:, :args.frame_dims].T 70 | 71 | # Downsample 72 | if args.technique == "interpolate": 73 | x = np.arange(y.shape[1]) 74 | f = interpolate.interp1d(x, y, kind="linear") 75 | x_new = np.linspace(0, y.shape[1] - 1, args.n) 76 | y_new = f(x_new).flatten(flatten_order) #.flatten("F") 77 | elif args.technique == "resample": 78 | y_new = signal.resample( 79 | y, args.n, axis=1 80 | ).flatten(flatten_order) #.flatten("F") 81 | elif 
args.technique == "rasanen": 82 | # Taken from Rasenen et al., Interspeech, 2015 83 | n_frames_in_multiple = int(np.floor(y.shape[1] / args.n)) * args.n 84 | y_new = np.mean( 85 | y[:, :n_frames_in_multiple].reshape((d_frame, args.n, -1)), 86 | axis=-1 87 | ).flatten(flatten_order) #.flatten("F") 88 | 89 | # This was done in Rasenen et al., 2015, but didn't help here 90 | # last_term = args.n/3. * np.log10(y.shape[1] * 10e-3) 91 | # Not sure if the above should be in frames or ms 92 | # y_new = np.hstack([y_new, last_term]) 93 | 94 | # Save result 95 | output_npz[key] = y_new 96 | 97 | print( 98 | "Output dimensionality:", 99 | output_npz[sorted(output_npz.keys())[0]].shape[0] 100 | ) 101 | 102 | print("Writing:", args.output_npz_fn) 103 | np.savez_compressed(args.output_npz_fn, **output_npz) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /blackbox/logreg_speaker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Use logistic regression for speaker classification. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.metrics import accuracy_score, classification_report 14 | from tqdm import tqdm 15 | import argparse 16 | import numpy as np 17 | import sys 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument("npz_fn", type=str, help="NumPy archive of embeddings") 30 | if len(sys.argv) == 1: 31 | parser.print_help() 32 | sys.exit(1) 33 | return parser.parse_args() 34 | 35 | 36 | #-----------------------------------------------------------------------------# 37 | # MAIN FUNCTION # 38 | #-----------------------------------------------------------------------------# 39 | 40 | def main(): 41 | args = check_argv() 42 | 43 | print("Reading:", args.npz_fn) 44 | embeddings = np.load(args.npz_fn) 45 | 46 | # # Temp 47 | # import random 48 | # data = {} 49 | # a = list(embeddings) 50 | # random.shuffle(a) 51 | # for key in a[:100]: 52 | # data[key] = embeddings[key] 53 | # embeddings = data 54 | 55 | print("Ordering embeddings:") 56 | n_embeds = 0 57 | X = [] 58 | utt_keys = [] 59 | words = [] 60 | speakers = [] 61 | for utt_key in tqdm(sorted(embeddings)): 62 | utt_keys.append(utt_key) 63 | X.append(embeddings[utt_key]) 64 | utt_key = utt_key.split("_") 65 | word = utt_key[0] 66 | speaker = utt_key[1] 67 | words.append(word) 68 | speakers.append(speaker) 69 | X = np.array(X) 70 | print("No. embeddings:", X.shape[0]) 71 | print("Embedding dimensionality:", X.shape[1]) 72 | 73 | # Convert words to IDs 74 | speaker_set = set(speakers) 75 | speaker_to_id = dict( 76 | zip(sorted(list(speaker_set)), range(len(speaker_set))) 77 | ) 78 | id_to_speaker = dict([[v,k] for k, v in speaker_to_id.items()]) 79 | y = [] 80 | for speaker in speakers: 81 | y.append(speaker_to_id[speaker]) 82 | y = np.array(y, dtype=int) 83 | print("No. 
speakers:", len(speaker_to_id)) 84 | 85 | # Split training and test sets 80/20 86 | indices = np.arange(X.shape[0]) 87 | np.random.seed(1) 88 | np.random.shuffle(indices) 89 | n_train = int(round(X.shape[0]*0.8)) 90 | X_train = X[indices[:n_train]] 91 | X_test = X[indices[n_train:]] 92 | y_train = y[indices[:n_train]] 93 | y_test = y[indices[n_train:]] 94 | print("Training data shape:", X_train.shape) 95 | print("Test data shape:", X_test.shape) 96 | 97 | # Multi-class logistic regression 98 | print(datetime.now()) 99 | print("Fitting multi-class logistic regression model") 100 | logreg = LogisticRegression( 101 | C=1e5, solver="lbfgs", multi_class="multinomial" 102 | # solver="lbfgs", multi_class="ovr", max_iter=200 103 | ) 104 | logreg.fit(X_train, y_train) 105 | print(datetime.now()) 106 | 107 | # Predict classes 108 | y_pred = logreg.predict(X_test) 109 | accuracy = accuracy_score(y_test, y_pred) 110 | 111 | print("Speaker classification accuracy: {:.2f}%".format(accuracy*100)) 112 | print( 113 | classification_report(y_test, y_pred, 114 | target_names=[id_to_speaker[i] for i in range(max(y) + 1)]) 115 | ) 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /embeddings/readme.md: -------------------------------------------------------------------------------- 1 | Acoustic Word Embedding Models and Evaluation 2 | ============================================= 3 | 4 | Overview 5 | -------- 6 | The examples below are intended for illustration purposes -- there are many 7 | different language-combinations and other settings which can be adjusted for 8 | the different models. But the most important command-line arguments are 9 | illustrated in the examples here. 10 | 11 | 12 | Data preparation 13 | ---------------- 14 | Create links to the MFCC NumPy archives: 15 | 16 | ./link_mfcc.py SP 17 | 18 | You need to run `link_mfcc.py` for all languages; run it without any arguments 19 | to see all 16 language codes. Alternatively, links can be greated for all 20 | languages by passing the "all" argument. 21 | 22 | 23 | Autoencoder RNN 24 | --------------- 25 | Train an AE-RNN on Spanish UTD segments: 26 | 27 | ./train_cae_rnn.py --extrinsic_usefinal --ae_n_val_interval 9 \ 28 | --ae_n_epochs 10 --cae_n_epochs 0 --train_tag utd --val_lang SP SP 29 | 30 | Train an AE-RNN on seven languages using ground truth segments and validate on 31 | German: 32 | 33 | ./train_cae_rnn.py --ae_n_epochs 25 --cae_n_epochs 0 \ 34 | --n_max_types 1000 --train_tag gt --val_lang GE RU+CZ+FR+PL+TH+PO 35 | 36 | 37 | Correspondence autoencoder RNN 38 | ------------------------------ 39 | Train a CAE-RNN on Spanish UTD segments: 40 | 41 | ./train_cae_rnn.py --pretrain_usefinal --extrinsic_usefinal \ 42 | --ae_n_val_interval 14 --ae_n_epochs 15 --cae_n_epochs 3 \ 43 | --cae_batch_size 600 --train_tag utd --val_lang SP SP 44 | 45 | Evaluate the model: 46 | 47 | ./apply_model.py \ 48 | models/SP.utd/train_cae_rnn/17b498a959/cae.best_val.ckpt SP val 49 | ./eval_samediff.py --mvn \ 50 | models/SP.utd/train_cae_rnn/17b498a959/cae.best_val.SP.val.npz 51 | 52 | Analyse embeddings: 53 | 54 | ./analyse_embeds.py --normalize --word_type \ 55 | guatemala,presidente,autoridades,candidatos,vicepresidente,social \ 56 | models/SP.utd/train_cae_rnn/17b498a959/cae.best_val.SP.val.npz 57 | 58 | All the models trained below can be applied, evaluated and analysed using the 59 | scripts above. 
60 | 61 | Train a CNN-RNN on Spanish ground truth segments: 62 | 63 | ./train_cae_rnn.py --pretrain_usefinal --n_max_pairs 100000 \ 64 | --ae_n_val_interval 14 --ae_n_epochs 15 --cae_n_epochs 25 \ 65 | --train_tag gt --val_lang SP SP 66 | 67 | Train a CAE-RNN jointly on multiple languages, limiting the maximum overall 68 | number of pairs, the maximum number of types per language and requiring a 69 | minimum number of tokens per type: 70 | 71 | ./train_cae_rnn.py --pretrain_usefinal --ae_n_val_interval 14 \ 72 | --ae_n_epochs 15 --cae_n_epochs 10 --n_max_pairs 300000 \ 73 | --n_min_tokens_per_type 2 --n_max_types 1000 --train_tag gt \ 74 | --val_lang GE RU+CZ+FR+PL+TH+PO 75 | 76 | 77 | Siamese RNN 78 | ----------- 79 | Train a Siamese RNN on ground truth segments: 80 | 81 | ./train_siamese_rnn.py --n_epochs 25 --train_tag gt --val_lang SP SP 82 | 83 | Train a Siamese RNN ensuring that each batch contains paired data, i.e., no 84 | batch will have a singleton token: 85 | 86 | ./train_siamese_rnn_pairbatch.py --n_epochs 15 --train_tag gt \ 87 | --margin 0.2 --val_lang GE GE 88 | 89 | 90 | Siamese CNN 91 | ----------- 92 | Train a Siamese CNN on ground truth segments: 93 | 94 | ./train_siamese_cnn.py --n_epochs 150 --train_tag gt --n_val_interval 5 SP 95 | 96 | 97 | Classifier CNN 98 | -------------- 99 | Train a word classifier CNN on ground truth segments: 100 | 101 | ./train_cnn.py --n_epochs 100 --train_tag gt --n_val_interval 5 SP 102 | 103 | 104 | Classifier RNN 105 | -------------- 106 | Train a word classifier RNN on ground truth segments: 107 | 108 | ./train_rnn.py --n_epochs 25 --train_tag gt --val_lang SP SP 109 | 110 | Train a word classifier RNN jointly on multiple languages: 111 | 112 | ./train_rnn.py --n_epochs 15 --train_tag gt --n_max_types 10000 \ 113 | --n_max_tokens_per_type 20 --val_lang GE RU+CZ+FR+PL+TH+PO 114 | 115 | -------------------------------------------------------------------------------- /blackbox/hierarchical_clustering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Apply agglomerative clustering to embeddings and plot a labelled dendrogram. 
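Leaf labels give the word type of each embedding and are coloured according to the speaker of that token.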
5 | 6 | See 7 | https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/ 8 | 9 | Author: Herman Kamper 10 | Contact: kamperh@gmail.com 11 | Date: 2019 12 | """ 13 | 14 | from os import path 15 | from scipy.cluster.hierarchy import dendrogram, linkage 16 | from tqdm import tqdm 17 | import argparse 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | import random 21 | import sys 22 | 23 | 24 | #-----------------------------------------------------------------------------# 25 | # UTILITY FUNCTIONS # 26 | #-----------------------------------------------------------------------------# 27 | 28 | def check_argv(): 29 | """Check the command line arguments.""" 30 | parser = argparse.ArgumentParser( 31 | description=__doc__.strip().split("\n")[0], add_help=False 32 | ) 33 | parser.add_argument("npz_fn", type=str, help="NumPy archive of embeddings") 34 | parser.add_argument( 35 | "--n_samples", type=int, 36 | help="if given, the embeddings are subsampled" 37 | ) 38 | if len(sys.argv) == 1: 39 | parser.print_help() 40 | sys.exit(1) 41 | return parser.parse_args() 42 | 43 | 44 | #-----------------------------------------------------------------------------# 45 | # MAIN FUNCTION # 46 | #-----------------------------------------------------------------------------# 47 | 48 | def main(): 49 | args = check_argv() 50 | 51 | print("Reading:", args.npz_fn) 52 | embeddings = np.load(args.npz_fn) 53 | 54 | if args.n_samples is not None: 55 | utt_keys = list(embeddings) 56 | random.seed(1) 57 | random.shuffle(utt_keys) 58 | new_embeddings = {} 59 | for utt_key in utt_keys[:args.n_samples]: 60 | new_embeddings[utt_key] = embeddings[utt_key] 61 | embeddings = new_embeddings 62 | 63 | print("Ordering embeddings:") 64 | n_embeds = 0 65 | X = [] 66 | utt_keys = [] 67 | labels = [] 68 | speakers = [] 69 | for utt_key in tqdm(sorted(embeddings)): 70 | utt_keys.append(utt_key) 71 | X.append(embeddings[utt_key]) 72 | utt_key = utt_key.split("_") 73 | label = utt_key[0] 74 | speaker = utt_key[1] 75 | labels.append(label) 76 | speakers.append(speaker) 77 | X = np.array(X) 78 | print("No. 
embeddings:", X.shape[0]) 79 | print("Embedding dimensionality:", X.shape[1]) 80 | 81 | # Normalise 82 | normed = (X - X.mean(axis=0)) / X.std(axis=0) 83 | X = normed 84 | 85 | # Get a speaker colour map 86 | # cmap = plt.cm.jet 87 | cmap = plt.cm.viridis 88 | cmaplist = [cmap(i) for i in range(cmap.N)] 89 | speakers_set = set(speakers) 90 | n_speakers = len(speakers_set) 91 | speaker_to_color = {} 92 | for i_speaker, speaker in enumerate(sorted(list(speakers_set))): 93 | speaker_to_color[speaker] = cmaplist[ 94 | int(i_speaker/n_speakers * (len(cmaplist) - 1)) 95 | ] 96 | # speakers_to_id = dict( 97 | # zip(sorted(list(speakers_set)), range(len(speakers_set))) 98 | # ) 99 | # speakers_to_color = {} 100 | # for speaker in speakers_to_id: 101 | # speakers_to_color[speaker] = 102 | 103 | # Cluster 104 | print("Clustering") 105 | Z = linkage(X, method="ward", metric="euclidean") 106 | 107 | # Plot dendrogram 108 | print("Plotting") 109 | plt.figure() 110 | R = dendrogram( 111 | Z, 112 | leaf_rotation=90, 113 | leaf_font_size=8, 114 | labels=labels 115 | ) 116 | leaves = R["leaves"] 117 | 118 | ax = plt.gca() 119 | x_labels = ax.get_xmajorticklabels() 120 | for i, x in enumerate(x_labels): 121 | x.set_color(speaker_to_color[speakers[leaves[i]]]) 122 | # c = 123 | # print(x.get_text(), labels[leaves[i]], speakers[leaves[i]]) 124 | # x.set_color(colorDict[x.get_text()]) 125 | 126 | plt.show() 127 | 128 | 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | Multilingual Acoustic Word Embeddings on GlobalPhone 2 | ==================================================== 3 | 4 | Overview 5 | -------- 6 | Multilingual acoustic word embedding approaches are implemented and evaluated 7 | on the GlobalPhone corpus. The experiments are described in: 8 | 9 | - H. Kamper, Y. Matusevych, and S.J. Goldwater, "Multilingual acoustic word 10 | embedding models for processing zero-resource languages," in *Proc. ICASSP*, 11 | 2020. [[arXiv](https://arxiv.org/abs/2002.02109)] 12 | - H. Kamper, Y. Matusevych, and S. J. Goldwater, "Improved acoustic word 13 | embeddings for zero-resource languages using multilingual transfer," 14 | *arXiv preprint arXiv:2006.02295*, 2020. 15 | [[arXiv](https://arxiv.org/abs/2006.02295)] 16 | 17 | Please cite these papers if you use the code. 18 | 19 | 20 | Disclaimer 21 | ---------- 22 | The code provided here is not pretty. But I believe that research should be 23 | reproducible. I provide no guarantees with the code, but please let me know if 24 | you have any problems, find bugs or have general comments. 25 | 26 | 27 | Download datasets 28 | ----------------- 29 | The [GlobalPhone](https://csl.anthropomatik.kit.edu/english/globalphone.php) 30 | corpus and forced alignments of the data needs to be obtained. GlobalPhone 31 | needs to be paid for. If you have proof of payment, we can give you access to 32 | the forced alignments. Save the data and forced alignments in a separate 33 | directory and update the `paths.py` file to point to the data directories. 
34 | 35 | 36 | Install dependencies 37 | -------------------- 38 | You will require the following: 39 | 40 | - [Python 3](https://www.python.org/downloads/) 41 | - [TensorFlow 1.13.1](https://www.tensorflow.org/) 42 | - [LibROSA](http://librosa.github.io/librosa/) 43 | - [Cython](https://cython.org/) 44 | - [tqdm](https://tqdm.github.io/) 45 | - [speech_dtw](https://github.com/kamperh/speech_dtw/) 46 | - [shorten](http://etree.org/shnutils/shorten/dist/src/shorten-3.6.1.tar.gz) 47 | 48 | To install `speech_dtw` (required for same-different evaluation) and `shorten` 49 | (required for processing audio), run `./install_local.sh`. 50 | 51 | You can install all the other dependencies in a conda environment by running: 52 | 53 | conda env create -f environment.yml 54 | conda activate tf1.13 55 | 56 | 57 | Extract speech features 58 | ----------------------- 59 | Update the paths in `paths.py` to point to the data directories. Extract MFCC 60 | features in the `features/` directory as follows: 61 | 62 | cd features 63 | ./extract_features.py SP 64 | 65 | You need to run `extract_features.py` for all languages; run it without any 66 | arguments to see all 16 language codes. 67 | 68 | UTD pairs can also be analysed here, by running e.g.: 69 | 70 | ./analyse_utd_pairs.py SP 71 | 72 | 73 | Evaluate frame-level features using the same-different task 74 | ----------------------------------------------------------- 75 | This is optional. To perform frame-level same-different evaluation based on 76 | dynamic time warping (DTW), follow [samediff/readme.md](samediff/readme.md). 77 | 78 | 79 | Obtain downsampled acoustic word embeddings 80 | ------------------------------------------- 81 | Extract and evaluate downsampled acoustic word embeddings by running the steps 82 | in [downsample/readme.md](downsample/readme.md). 83 | 84 | 85 | Train neural acoustic word embeddings 86 | ------------------------------------- 87 | Train and evaluate neural network acoustic word embedding models by running the 88 | steps in [embeddings/readme.md](embeddings/readme.md). 89 | 90 | 91 | Analyse embedding models 92 | ------------------------ 93 | Analyse different properties/aspects of the acoustic word embedding models by 94 | running the steps in [blackbox/readme.md](blackbox/readme.md). 95 | 96 | 97 | Query-by-example search 98 | ----------------------- 99 | Perform query-by-example search experiments by running the steps in 100 | [qbe/readme.md](qbe/readme.md). 101 | 102 | 103 | Unit tests 104 | ---------- 105 | In the root project directory, run `make test` to run unit tests. 106 | 107 | 108 | References 109 | ---------- 110 | - https://github.com/eginhard/cae-utd-utils 111 | 112 | 113 | Contributors 114 | ------------ 115 | - [Herman Kamper](http://www.kamperh.com/) 116 | - [Yevgen Matusevych](https://homepages.inf.ed.ac.uk/ymatusev/) 117 | - [Sharon Goldwater](https://homepages.inf.ed.ac.uk/sgwater/) 118 | 119 | 120 | License 121 | ------- 122 | The code is distributed under the Creative Commons Attribution-ShareAlike 123 | license ([CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)). 124 | -------------------------------------------------------------------------------- /embeddings/eval_samediff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Perform same-different evaluation of fixed-dimensional representations. 
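Embedding keys are assumed to have the format
label_speaker_utterance_interval. Average precision and precision-recall
breakeven are reported, together with their same-word different-speaker (SWDP)
variants.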
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2015, 2016, 2018, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from scipy.spatial.distance import pdist 14 | import argparse 15 | import numpy as np 16 | import sys 17 | 18 | sys.path.append(path.join("..", "..", "src", "speech_dtw", "utils")) 19 | 20 | import samediff 21 | 22 | 23 | #-----------------------------------------------------------------------------# 24 | # UTILITY FUNCTIONS # 25 | #-----------------------------------------------------------------------------# 26 | 27 | def check_argv(): 28 | """Check the command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description=__doc__.strip().split("\n")[0], add_help=False 31 | ) 32 | parser.add_argument("npz_fn", type=str, help="NumPy archive") 33 | parser.add_argument( 34 | "--metric", choices=["cosine", "euclidean", "hamming", "chebyshev", 35 | "kl"], default="cosine", help="distance metric (default: %(default)s)" 36 | ) 37 | parser.add_argument( 38 | "--mean_ap", dest="mean_ap", action="store_true", 39 | help="also compute mean average precision (this is significantly " 40 | "more resource intensive)" 41 | ) 42 | parser.add_argument( 43 | "--mvn", action="store_true", 44 | help="mean and variance normalise (default: False)" 45 | ) 46 | if len(sys.argv) == 1: 47 | parser.print_help() 48 | sys.exit(1) 49 | return parser.parse_args() 50 | 51 | 52 | #-----------------------------------------------------------------------------# 53 | # MAIN FUNCTION # 54 | #-----------------------------------------------------------------------------# 55 | 56 | def main(): 57 | args = check_argv() 58 | 59 | print(datetime.now()) 60 | 61 | print("Reading:", args.npz_fn) 62 | npz = np.load(args.npz_fn) 63 | 64 | print(datetime.now()) 65 | 66 | print("Ordering embeddings") 67 | n_embeds = 0 68 | X = [] 69 | ids = [] 70 | for label in sorted(npz): 71 | ids.append(label) 72 | X.append(npz[label]) 73 | n_embeds += 1 74 | X = np.array(X) 75 | print("No. embeddings:", n_embeds) 76 | print("Embedding dimensionality:", X.shape[1]) 77 | 78 | if args.mvn: 79 | normed = (X - X.mean(axis=0)) / X.std(axis=0) 80 | X = normed 81 | 82 | print(datetime.now()) 83 | 84 | print("Calculating distances") 85 | metric = args.metric 86 | if metric == "kl": 87 | import scipy.stats 88 | metric = scipy.stats.entropy 89 | distances = pdist(X, metric=metric) 90 | 91 | print(datetime.now()) 92 | 93 | print("Getting labels and speakers") 94 | labels = [] 95 | speakers = [] 96 | for utt_id in ids: 97 | utt_id = utt_id.split("_") 98 | word = utt_id[0] 99 | speaker = utt_id[1] 100 | labels.append(word) 101 | speakers.append(speaker) 102 | 103 | if args.mean_ap: 104 | print(datetime.now()) 105 | print("Calculating mean average precision") 106 | mean_ap, mean_prb, ap_dict = samediff.mean_average_precision( 107 | distances, labels 108 | ) 109 | print("Mean average precision:", mean_ap) 110 | print("Mean precision-recall breakeven:", mean_prb) 111 | 112 | print(datetime.now()) 113 | 114 | print("Calculating average precision") 115 | # matches = samediff.generate_matches_array(labels) # Temp 116 | word_matches = samediff.generate_matches_array(labels) 117 | speaker_matches = samediff.generate_matches_array(speakers) 118 | print("No. same-word pairs:", sum(word_matches)) 119 | print("No. 
same-speaker pairs:", sum(speaker_matches)) 120 | 121 | sw_ap, sw_prb, swdp_ap, swdp_prb = samediff.average_precision_swdp( 122 | distances[np.logical_and(word_matches, speaker_matches)], 123 | distances[np.logical_and(word_matches, speaker_matches == False)], 124 | distances[word_matches == False] 125 | ) 126 | print("-"*79) 127 | print("Average precision: {:.8f}".format(sw_ap)) 128 | print("Precision-recall breakeven: {:.8f}".format(sw_prb)) 129 | print("SWDP average precision: {:.8f}".format(swdp_ap)) 130 | print("SWDP precision-recall breakeven: {:.8f}".format(swdp_prb)) 131 | print("-"*79) 132 | 133 | print(datetime.now()) 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /qbe/data_prep_dense_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Prepare the data for dense segmental QbE search. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | import argparse 14 | import pickle 15 | import numpy as np 16 | import os 17 | import sys 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument( 30 | "language", type=str, help="GlobalPhone language", 31 | choices=["HA"] 32 | ) 33 | parser.add_argument( 34 | "--min_frames", type=int, 35 | help="minimum number of frames (default: %(default)s)", default=20 36 | ) 37 | parser.add_argument( 38 | "--max_frames", type=int, 39 | help="maximum number of frames (default: %(default)s)", default=60 40 | ) 41 | parser.add_argument( 42 | "--step", type=int, 43 | help="frame step (default: %(default)s)", default=3 44 | ) 45 | parser.add_argument( 46 | "--n_splits", type=int, 47 | help="number of search collection splits (default: %(default)s)", 48 | default=2 49 | ) 50 | if len(sys.argv) == 1: 51 | parser.print_help() 52 | sys.exit(1) 53 | return parser.parse_args() 54 | 55 | 56 | #-----------------------------------------------------------------------------# 57 | # MAIN FUNCTION # 58 | #-----------------------------------------------------------------------------# 59 | 60 | def main(): 61 | args = check_argv() 62 | 63 | print(datetime.now()) 64 | 65 | output_dir = path.join("data", args.language) 66 | if not path.isdir(output_dir): 67 | os.makedirs(output_dir) 68 | segtag = "min_{}.max_{}.step_{}".format( 69 | args.min_frames, args.max_frames, args.step 70 | ) 71 | 72 | # Subset search collection 73 | search_dict_fn = path.join("data", args.language, "search.npz") 74 | print("Reading:", search_dict_fn) 75 | search_dict = np.load(search_dict_fn) 76 | search_keys = sorted(search_dict.keys()) 77 | print("No. search utterances:", len(search_keys)) 78 | 79 | # Dense search segments list 80 | seglist_fn = path.join( 81 | output_dir, "search.seglist." 
+ segtag + ".pkl" 82 | ) 83 | if not path.isfile(seglist_fn): 84 | print("Getting segmentation lists") 85 | seglist_dict = {} 86 | n_intervals = 0 87 | for utt_key in search_keys: 88 | seglist = [] 89 | length = search_dict[utt_key].shape[0] 90 | i_start = 0 91 | while i_start < length: 92 | i_end = i_start + args.min_frames 93 | while i_end <= length and i_end - i_start <= args.max_frames: 94 | seglist.append((i_start, i_end)) 95 | i_end += args.step 96 | n_intervals += 1 97 | i_start += args.step 98 | seglist_dict[utt_key] = seglist 99 | print("No. segmentation intervals:", n_intervals) 100 | print("Writing:", seglist_fn) 101 | with open(seglist_fn, "wb") as f: 102 | pickle.dump(seglist_dict, f, -1) 103 | else: 104 | print("Using existing file:", seglist_fn) 105 | 106 | # Split the search collection 107 | split_dict_fn = path.join( 108 | "data", args.language, "search." + str(args.n_splits - 1) + ".npz" 109 | ) 110 | if not path.isfile(split_dict_fn): 111 | n_items = int(np.ceil(np.float(len(search_keys)) / args.n_splits)) 112 | n_total = 0 113 | for i_split in range(args.n_splits): 114 | split_search_keys = search_keys[i_split*n_items:(i_split + 1)*n_items] 115 | split_dict = {} 116 | for utt_key in split_search_keys: 117 | split_dict[utt_key] = search_dict[utt_key] 118 | split_dict_fn = path.join( 119 | "data", args.language, "search." + str(i_split) + ".npz" 120 | ) 121 | print("Writing:", split_dict_fn) 122 | np.savez(split_dict_fn, **split_dict) 123 | n_total += len(split_dict) 124 | print( 125 | "Wrote {} out of {} utterances".format(len(search_dict.keys()), 126 | n_total) 127 | ) 128 | else: 129 | print("Using existing splits:", split_dict_fn) 130 | 131 | print(datetime.now()) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /blackbox/logreg_pronlength.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Use logistic regression for classifying the number of phones in a word. 
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.metrics import accuracy_score, classification_report 15 | from tqdm import tqdm 16 | import argparse 17 | import numpy as np 18 | import sys 19 | 20 | from analyse_pairs import read_pronunciations 21 | 22 | 23 | #-----------------------------------------------------------------------------# 24 | # UTILITY FUNCTIONS # 25 | #-----------------------------------------------------------------------------# 26 | 27 | def check_argv(): 28 | """Check the command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description=__doc__.strip().split("\n")[0], add_help=False 31 | ) 32 | parser.add_argument("npz_fn", type=str, help="NumPy archive of embeddings") 33 | parser.add_argument( 34 | "language", type=str, help="the pronunciations for this GlobalPhone " 35 | "language is used", choices=["BG", "CH", "CR", "CZ", "FR", "GE", "HA", 36 | "KO", "PL", "PO", "RU", "SP", "SW", "TH", "TU", "VN"] 37 | ) 38 | if len(sys.argv) == 1: 39 | parser.print_help() 40 | sys.exit(1) 41 | return parser.parse_args() 42 | 43 | 44 | #-----------------------------------------------------------------------------# 45 | # MAIN FUNCTION # 46 | #-----------------------------------------------------------------------------# 47 | 48 | def main(): 49 | args = check_argv() 50 | 51 | print("Reading:", args.npz_fn) 52 | embeddings = np.load(args.npz_fn) 53 | 54 | assert False, "need to test on different data, since here we have speaker" 55 | " overlap (read in additional test file)" 56 | 57 | # # Temp 58 | # import random 59 | # data = {} 60 | # a = list(embeddings) 61 | # random.shuffle(a) 62 | # for key in a[:100]: 63 | # data[key] = embeddings[key] 64 | # embeddings = data 65 | 66 | print("Ordering embeddings:") 67 | n_embeds = 0 68 | X = [] 69 | utt_keys = [] 70 | words = [] 71 | speakers = [] 72 | for utt_key in tqdm(sorted(embeddings)): 73 | utt_keys.append(utt_key) 74 | X.append(embeddings[utt_key]) 75 | utt_key = utt_key.split("_") 76 | word = utt_key[0] 77 | speaker = utt_key[1] 78 | words.append(word) 79 | speakers.append(speaker) 80 | X = np.array(X) 81 | print("No. 
embeddings:", X.shape[0]) 82 | print("Embedding dimensionality:", X.shape[1]) 83 | 84 | # Pronunciations 85 | pron_fn = path.join("lists", args.language, "dev.prons") 86 | print("Reading:", pron_fn) 87 | pronunciations = read_pronunciations(pron_fn) 88 | pron_labels = [] 89 | pron_lengths = [] 90 | for utt_key in utt_keys: 91 | pron_labels.append(pronunciations[utt_key]) 92 | pron_lengths.append(len(pronunciations[utt_key])) 93 | print("Minimum length:", min(pron_lengths)) 94 | print("Maximum length:", max(pron_lengths)) 95 | 96 | # Convert words to IDs 97 | length_set = set(pron_lengths) 98 | length_to_id = dict( 99 | zip(sorted(list(length_set)), range(len(length_set))) 100 | ) 101 | id_to_length = dict([[v,k] for k, v in length_to_id.items()]) 102 | y = [] 103 | for length in pron_lengths: 104 | y.append(length_to_id[length]) 105 | y = np.array(y, dtype=int) 106 | 107 | # Split training and test sets 80/20 108 | indices = np.arange(X.shape[0]) 109 | np.random.seed(2) 110 | np.random.shuffle(indices) 111 | n_train = int(round(X.shape[0]*0.8)) 112 | X_train = X[indices[:n_train]] 113 | X_test = X[indices[n_train:]] 114 | y_train = y[indices[:n_train]] 115 | y_test = y[indices[n_train:]] 116 | print("Training data shape:", X_train.shape) 117 | print("Test data shape:", X_test.shape) 118 | print(id_to_length, max(y_test), min(y_test)) 119 | 120 | # Multi-class logistic regression 121 | print(datetime.now()) 122 | print("Fitting multi-class logistic regression model") 123 | logreg = LogisticRegression( 124 | C=1e5, solver="lbfgs", multi_class="multinomial" 125 | # solver="lbfgs", multi_class="ovr", max_iter=200 126 | ) 127 | logreg.fit(X_train, y_train) 128 | print(datetime.now()) 129 | 130 | # Predict classes 131 | y_pred = logreg.predict(X_test) 132 | accuracy = accuracy_score(y_test, y_pred) 133 | 134 | print("Length classification accuracy: {:.2f}%".format(accuracy*100)) 135 | print( 136 | classification_report(y_test, y_pred, 137 | target_names=[str(id_to_length[i]) + " phone" for i in range(max(y) + 138 | 1)]) 139 | ) 140 | 141 | 142 | if __name__ == "__main__": 143 | main() 144 | -------------------------------------------------------------------------------- /qbe/get_dense_seg_costs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Calculate costs for dense search. 
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from scipy.spatial.distance import cdist 14 | from tqdm import tqdm 15 | import argparse 16 | import pickle 17 | import numpy as np 18 | import os 19 | import sys 20 | import timeit 21 | 22 | 23 | #-----------------------------------------------------------------------------# 24 | # UTILITY FUNCTIONS # 25 | #-----------------------------------------------------------------------------# 26 | 27 | def check_argv(): 28 | """Check the command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description=__doc__.strip().split("\n")[0], add_help=False 31 | ) 32 | parser.add_argument("eval_dir", type=str, help="evaluation directory") 33 | parser.add_argument( 34 | "--metric", 35 | choices=["cosine", "euclidean", "hamming", "chebyshev", 36 | "symsumxentropy"], 37 | default="cosine", help="distance metric (default: %(default)s)" 38 | ) 39 | if len(sys.argv) == 1: 40 | parser.print_help() 41 | sys.exit(1) 42 | return parser.parse_args() 43 | 44 | 45 | def sweep_min(query_vec, search_array, metric): 46 | """ 47 | Return the minimum cost between `query_vec` and rows of `search_array`. 48 | """ 49 | if metric == "symsumxentropy": 50 | return np.min( 51 | cdist_sumxentropy(np.array([query_vec]), search_array, True) 52 | ) 53 | else: 54 | return np.min(cdist(np.array([query_vec]), search_array, metric)) 55 | 56 | 57 | 58 | def cdist_sumxentropy(queries_array, search_array, symmetric=False): 59 | distances = np.zeros((queries_array.shape[0], search_array.shape[0])) 60 | for i_query, query_vec in enumerate(queries_array): 61 | for i_search, search_vec in enumerate(search_array): 62 | if symmetric: 63 | distances[i_query, i_search] = sumxentroy( 64 | search_vec, query_vec 65 | ) + sumxentroy(query_vec, search_vec) 66 | else: 67 | distances[i_query, i_search] = sumxentroy( 68 | search_vec, query_vec 69 | ) 70 | return distances 71 | 72 | 73 | #-----------------------------------------------------------------------------# 74 | # MAIN FUNCTION # 75 | #-----------------------------------------------------------------------------# 76 | 77 | def main(): 78 | args = check_argv() 79 | 80 | print(datetime.now()) 81 | 82 | # Read queries 83 | fn = path.join(args.eval_dir, "queries.npz") 84 | if not path.isfile(fn): 85 | import re 86 | fn = path.join( 87 | re.sub("min\_.*step\_\d*\.", "", args.eval_dir), "queries.npz" 88 | ) 89 | print("Reading:", fn) 90 | queries_dict = np.load(fn) 91 | queries_keys = sorted(list(queries_dict)) 92 | queries_list = [queries_dict[i] for i in queries_keys] 93 | print("No. queries:", len(queries_list)) 94 | print("Query array shape:", queries_dict[list(queries_dict)[0]].shape) 95 | 96 | # Read search collection 97 | fn = path.join(args.eval_dir, "search.npz") 98 | print("Reading:", fn) 99 | search_dict = np.load(fn) 100 | search_keys = sorted(list(search_dict)) 101 | search_list = [search_dict[i] for i in search_keys] 102 | print("No. 
search utterances:", len(search_list)) 103 | print("Search array shape:", search_dict[list(search_dict)[0]].shape) 104 | 105 | # print(datetime.now()) 106 | 107 | print("Calculating costs:") 108 | start_time = timeit.default_timer() 109 | costs = [] 110 | for query_vec in tqdm(queries_list): 111 | for search_array in search_list: 112 | costs.append(sweep_min( 113 | query_vec, search_array, args.metric 114 | )) 115 | end_time = timeit.default_timer() 116 | n_search = len(search_list) 117 | costs = [ 118 | costs[i*n_search:(i + 1)*n_search] for i in 119 | range(int(np.floor(len(costs)/n_search))) 120 | ] 121 | duration = end_time - start_time 122 | print( 123 | "Avg. duration per comparison: {:.8f} sec".format(duration / 124 | (len(queries_list) * len(search_list))) 125 | ) 126 | 127 | # Write costs 128 | fn = path.join(args.eval_dir, "cost_dict." + args.metric + ".pkl") 129 | print("Writing:", fn) 130 | cost_dict = {} 131 | for i_query, key_query in enumerate(queries_keys): 132 | if key_query not in cost_dict: 133 | cost_dict[key_query] = {} 134 | for i_search, key_search in enumerate(search_keys): 135 | cost_dict[key_query][key_search] = costs[i_query][i_search] 136 | # print(datetime.now()) 137 | with open(fn, "wb") as f: 138 | pickle.dump(cost_dict, f, -1) 139 | 140 | print(datetime.now()) 141 | 142 | 143 | if __name__ == "__main__": 144 | main() 145 | -------------------------------------------------------------------------------- /data/train_spk.list: -------------------------------------------------------------------------------- 1 | BG 018 020 021 023 025 026 027 032 035 039 041 042 043 045 046 047 048 049 050 052 053 054 056 060 062 064 065 066 067 069 070 071 072 073 075 077 078 079 080 082 083 085 087 088 089 091 092 093 094 096 097 098 099 101 102 103 104 105 107 111 112 113 114 2 | CH 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 033 034 035 036 037 038 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 3 | CR 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 022 023 024 025 026 027 028 029 030 031 032 049 050 052 055 056 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 092 093 094 4 | CZ 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 5 | FR 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 090 099 100 6 | GE 005 006 007 009 011 012 013 014 015 016 017 019 022 023 024 025 027 028 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 074 075 076 077 7 | HA 001 003 004 005 006 007 008 009 010 
011 012 013 015 016 017 019 020 021 022 023 024 026 027 029 032 033 035 036 037 039 040 041 042 043 044 045 048 049 051 054 056 057 059 060 061 063 064 065 066 067 068 069 071 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 089 090 091 092 093 094 095 096 097 098 099 100 101 102 103 8 | KO 001 002 003 004 005 007 008 009 010 011 013 014 015 016 017 018 020 021 022 023 024 026 027 028 030 031 033 034 035 036 037 038 039 041 043 044 046 047 048 049 050 052 053 054 055 056 057 058 059 060 062 063 065 066 067 068 070 071 072 073 074 075 076 077 078 079 081 083 085 087 089 090 092 093 094 095 096 097 099 100 9 | PL 002 003 006 007 008 010 013 014 015 016 017 018 019 020 021 022 024 025 026 028 029 032 034 035 036 037 038 039 042 045 047 048 049 051 052 053 054 055 056 057 058 059 060 061 062 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 091 092 093 094 095 096 099 100 10 | PO 001 002 003 004 005 006 007 008 009 010 011 013 014 015 016 017 018 019 021 022 023 024 025 026 029 030 031 033 034 036 037 042 043 044 045 047 048 049 051 053 054 055 056 058 059 060 068 069 070 071 101 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 125 126 127 128 129 130 131 136 140 141 144 145 146 147 148 149 150 212 11 | RU 001 003 004 006 007 008 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 028 029 030 031 032 034 035 037 038 039 040 041 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 064 066 067 068 070 071 072 073 074 075 076 077 079 080 081 082 083 084 085 086 087 088 089 090 091 093 094 095 096 098 099 100 101 105 114 115 116 117 119 120 121 123 12 | SP 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 13 | SW 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 050 051 052 053 054 055 056 057 058 059 070 071 072 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 14 | TH 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 024 026 027 029 030 031 032 033 034 035 036 038 039 040 041 042 043 044 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 062 063 064 065 066 067 068 069 070 071 072 074 075 076 077 078 079 080 081 082 083 084 086 087 088 089 090 15 | TU 004 007 009 010 011 012 017 018 020 021 022 023 024 026 027 028 029 033 034 035 036 038 040 042 043 044 045 047 048 049 050 051 052 053 054 055 057 058 059 060 061 062 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 16 | VN 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 019 020 021 022 023 024 025 026 027 028 029 030 031 033 034 035 036 037 038 039 040 041 042 043 044 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 070 071 073 074 075 076 077 080 081 082 083 084 085 086 087 088 089 090 091 093 095 097 099 100 101 104 105 108 109 111 112 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 17 | -------------------------------------------------------------------------------- 
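Each line in the speaker lists above gives a language code followed by the
speaker IDs assigned to that split. A minimal sketch of how such a list could
be parsed (the `read_speaker_list` helper below is illustrative only and not
part of the repository):

    def read_speaker_list(list_fn):
        """Return a dict mapping each language code to its list of speaker IDs."""
        speakers = {}
        with open(list_fn) as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue
                speakers[parts[0]] = parts[1:]
        return speakers

    train_speakers = read_speaker_list("data/train_spk.list")
    print("No. Spanish training speakers:", len(train_speakers["SP"]))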
/embeddings/link_mfcc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Create links to the MFCC files. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from os import path 12 | import numpy as np 13 | import os 14 | 15 | import argparse 16 | import sys 17 | 18 | relative_features_dir = path.join("..", "..", "..", "features") 19 | sixteen_languages = [ 20 | "BG", "CH", "CR", "CZ", "FR", "GE", "HA", "KO", "PL", "PO", "RU", "SP", 21 | "SW", "TH", "TU", "VN" 22 | ] 23 | 24 | #-----------------------------------------------------------------------------# 25 | # UTILITY FUNCTIONS # 26 | #-----------------------------------------------------------------------------# 27 | 28 | def check_argv(): 29 | """Check the command line arguments.""" 30 | parser = argparse.ArgumentParser( 31 | description=__doc__.strip().split("\n")[0], add_help=False 32 | ) 33 | parser.add_argument( 34 | "language", type=str, help="GlobalPhone language", 35 | choices=sixteen_languages + ["all"] 36 | ) 37 | if len(sys.argv) == 1: 38 | parser.print_help() 39 | sys.exit(1) 40 | return parser.parse_args() 41 | 42 | 43 | def link_features(npz_fn, link_fn, link_dir): 44 | assert ( 45 | path.isfile(path.join(link_dir, npz_fn)) 46 | ), "missing file: {}".format(path.join(link_dir, npz_fn)) 47 | if not path.isfile(link_fn): 48 | print("Linking:", npz_fn, "to", link_fn) 49 | os.symlink(npz_fn, link_fn) 50 | else: 51 | print("Using existing link:", link_fn) 52 | 53 | 54 | #-----------------------------------------------------------------------------# 55 | # MAIN FUNCTION # 56 | #-----------------------------------------------------------------------------# 57 | 58 | def main(): 59 | args = check_argv() 60 | 61 | if args.language == "all": 62 | languages = sixteen_languages 63 | else: 64 | languages = [args.language] 65 | 66 | for language in languages: 67 | 68 | print("Linking features for", language) 69 | 70 | # Create link directory 71 | link_dir = path.join("data", language) 72 | if not path.isdir(link_dir): 73 | os.makedirs(link_dir) 74 | 75 | # Training: All features 76 | npz_fn = path.join( 77 | relative_features_dir, "mfcc", language, language.lower() + 78 | ".train.npz" 79 | ) 80 | link_fn = path.join(link_dir, "train.all.npz") 81 | link_features(npz_fn, link_fn, link_dir) 82 | 83 | # Training: Ground truth words 84 | npz_fn = path.join( 85 | relative_features_dir, "mfcc", language, language.lower() + 86 | ".train.gt_words.npz" 87 | ) 88 | link_fn = path.join(link_dir, "train.gt.npz") 89 | link_features(npz_fn, link_fn, link_dir) 90 | 91 | # Training: UTD words 92 | npz_fn = path.join( 93 | relative_features_dir, "mfcc", language, language.lower() + 94 | ".train.utd_terms.npz" 95 | ) 96 | if path.isfile(path.join(link_dir, npz_fn)): 97 | # Not all languages have UTD output 98 | link_fn = path.join(link_dir, "train.utd.npz") 99 | link_features(npz_fn, link_fn, link_dir) 100 | 101 | # Training: UTD words with fixed labels 102 | npz_fn = path.join( 103 | relative_features_dir, "mfcc", language, language.lower() + 104 | ".train.utd_terms.fixed_labels.npz" 105 | ) 106 | if path.isfile(path.join(link_dir, npz_fn)): 107 | # Not all languages have UTD output 108 | link_fn = path.join(link_dir, "train.utd.fixed_labels.npz") 109 | link_features(npz_fn, link_fn, link_dir) 110 | 111 | # Training: UTD words with fixed segment intervals 112 | npz_fn = path.join( 113 | relative_features_dir, "mfcc", language, 
language.lower() + 114 | ".train.utd_terms.fixed_segs.npz" 115 | ) 116 | if path.isfile(path.join(link_dir, npz_fn)): 117 | # Not all languages have UTD output 118 | link_fn = path.join(link_dir, "train.utd.fixed_segs.npz") 119 | link_features(npz_fn, link_fn, link_dir) 120 | 121 | # Training: UTD words with fixed labels and segment intervals 122 | npz_fn = path.join( 123 | relative_features_dir, "mfcc", language, language.lower() + 124 | ".train.utd_terms.fixed_labels_segs.npz" 125 | ) 126 | if path.isfile(path.join(link_dir, npz_fn)): 127 | # Not all languages have UTD output 128 | link_fn = path.join(link_dir, "train.utd.fixed_labels_segs.npz") 129 | link_features(npz_fn, link_fn, link_dir) 130 | 131 | # Validation: Ground truth words 132 | npz_fn = path.join( 133 | relative_features_dir, "mfcc", language, language.lower() + 134 | ".dev.gt_words.npz" 135 | ) 136 | link_fn = path.join(link_dir, "val.npz") 137 | link_features(npz_fn, link_fn, link_dir) 138 | 139 | # Testing: Ground truth words 140 | npz_fn = path.join( 141 | relative_features_dir, "mfcc", language, language.lower() + 142 | ".eval.gt_words.npz" 143 | ) 144 | link_fn = path.join(link_dir, "test.npz") 145 | link_features(npz_fn, link_fn, link_dir) 146 | 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /notebooks/sandbox_splitnet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sandbox" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Herman Kamper, Stellenbosch University, 2019." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Preliminaries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "The autoreload extension is already loaded. 
To reload it, use:\n", 34 | " %reload_ext autoreload\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "%load_ext autoreload\n", 41 | "%autoreload 2\n", 42 | "\n", 43 | "from os import path\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "import numpy as np\n", 46 | "import os\n", 47 | "import sys\n", 48 | "import tensorflow as tf\n", 49 | "\n", 50 | "sys.path.append(path.join(\"..\", \"src\"))\n", 51 | "import tflego\n", 52 | "\n", 53 | "from tflego import NP_DTYPE, TF_DTYPE, NP_ITYPE, TF_ITYPE" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "import warnings\n", 63 | "warnings.filterwarnings(\"ignore\")\n", 64 | "os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\"\n", 65 | "tf.logging.set_verbosity(tf.logging.ERROR)\n", 66 | "if type(tf.contrib) != type(tf):\n", 67 | " tf.contrib._warning = None" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Split network based on condition" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "tf.reset_default_graph()\n", 84 | "\n", 85 | "# Random seed\n", 86 | "np.random.seed(1)\n", 87 | "tf.set_random_seed(1)\n", 88 | "\n", 89 | "# Parameters\n", 90 | "n_data = 7\n", 91 | "d_in = 5\n", 92 | "n_languages = 3\n", 93 | "n_classes = 2\n", 94 | "test_data = np.asarray(np.random.randn(n_data, d_in), dtype=NP_DTYPE)\n", 95 | "test_language = np.asarray([0, 0, 0, 1, 1, 2, 2], dtype=NP_ITYPE) # want to split accordingly\n", 96 | "test_class = np.asarray([0, 1, 0, 1, 0, 0, 1], dtype=NP_ITYPE) # output class\n", 97 | "\n", 98 | "# Model\n", 99 | "x = tf.placeholder(TF_DTYPE, [None, d_in])\n", 100 | "language = tf.placeholder(TF_ITYPE, [None])\n", 101 | "y = tf.placeholder(TF_ITYPE, [None])\n", 102 | "ff = tflego.build_feedforward(\n", 103 | " x, [10, 9]\n", 104 | " )\n", 105 | "split_networks = []\n", 106 | "for i_lang in range(n_languages):\n", 107 | " with tf.variable_scope(\"split_{}\".format(i_lang)):\n", 108 | " split_network = tflego.build_feedforward(\n", 109 | " ff, [6, n_classes]\n", 110 | " )\n", 111 | " if i_lang == 0:\n", 112 | " split_network *= 0\n", 113 | " elif i_lang == 1:\n", 114 | " split_network *= np.inf\n", 115 | " elif i_lang == 2:\n", 116 | " pass\n", 117 | " split_networks.append(split_network)\n", 118 | " \n", 119 | "output = tf.where(tf.equal(language, 0), split_networks[0], \n", 120 | " tf.where(tf.equal(language, 1), split_networks[1], split_networks[2])\n", 121 | " )\n", 122 | "\n", 123 | "# output = tf.where(tf.equal(language, 0), (language + 1)*55, language*0)\n", 124 | "# output = tf.where(tf.equal(language, 0), 55, \n", 125 | "# tf.where(tf.equal(language, 1), 66,\n", 126 | "# tf.where(tf.equal(language, 2), 9, -1\n", 127 | "# )))\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "init = tf.global_variables_initializer()\n", 137 | "with tf.Session() as session:\n", 138 | " session.run(init)\n", 139 | " np_output = output.eval({x: test_data, language: test_language})" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 6, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "array([[-0. , -0. ],\n", 151 | " [-0. , -0. ],\n", 152 | " [-0. , -0. 
],\n", 153 | " [ inf, -inf],\n", 154 | " [ inf, -inf],\n", 155 | " [ 1.7773362 , -0.6447714 ],\n", 156 | " [ 1.8913155 , -0.20415437]], dtype=float32)" 157 | ] 158 | }, 159 | "execution_count": 6, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "np_output" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.5.2" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 2 197 | } 198 | -------------------------------------------------------------------------------- /embeddings/apply_model_to_npz.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Encode the given NumPy archive using the specified model. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2018, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | import argparse 14 | import pickle 15 | import numpy as np 16 | import os 17 | import sys 18 | import tensorflow as tf 19 | 20 | sys.path.append(path.join("..", "src")) 21 | 22 | from apply_model import build_model 23 | from link_mfcc import sixteen_languages 24 | from tflego import NP_DTYPE, TF_DTYPE, NP_ITYPE, TF_ITYPE 25 | import batching 26 | import data_io 27 | 28 | 29 | #-----------------------------------------------------------------------------# 30 | # APPLY MODEL FUNCTIONS # 31 | #-----------------------------------------------------------------------------# 32 | 33 | 34 | def apply_model(model_fn, npz_fn): 35 | 36 | # Load the model options 37 | model_dir = path.split(model_fn)[0] 38 | options_dict_fn = path.join(model_dir, "options_dict.pkl") 39 | print("Reading:", options_dict_fn) 40 | with open(options_dict_fn, "rb") as f: 41 | options_dict = pickle.load(f) 42 | 43 | # Load data 44 | x_data, labels, lengths, keys, speakers = data_io.load_data_from_npz( 45 | npz_fn 46 | ) 47 | 48 | if "cnn" in options_dict["script"]: 49 | 50 | # Pad and flatten data 51 | x_data, _ = data_io.pad_sequences( 52 | x_data, options_dict["max_length"], True 53 | ) 54 | x_data = np.transpose(x_data, (0, 2, 1)) 55 | x_data = x_data.reshape((-1, options_dict["d_in"])) 56 | 57 | # Build model 58 | x = tf.placeholder(TF_DTYPE, [None, options_dict["d_in"]]) 59 | model = build_model(x, None, options_dict) 60 | 61 | # Embed data 62 | batch_iterator = batching.LabelledIterator( 63 | x_data, None, x_data.shape[0], False 64 | ) 65 | saver = tf.train.Saver() 66 | with tf.Session() as session: 67 | saver.restore(session, model_fn) 68 | for batch_x in batch_iterator: 69 | np_z = session.run( 70 | [model["encoding"]], feed_dict={x: batch_x})[0] 71 | break # single batch 72 | 73 | else: # rnn 74 | 75 | # Truncate and limit dimensionality 76 | data_io.trunc_and_limit_dim( 77 | x_data, lengths, options_dict["n_input"], 78 | options_dict["max_length"] 79 | ) 80 | 81 | # Build model 82 | x = tf.placeholder(TF_DTYPE, [None, None, options_dict["n_input"]]) 83 | x_lengths = tf.placeholder(TF_ITYPE, [None]) 84 | 
model = build_model(x, x_lengths, options_dict) 85 | 86 | # Embed data 87 | batch_iterator = batching.SimpleIterator(x_data, len(x_data), False) 88 | saver = tf.train.Saver() 89 | with tf.Session() as session: 90 | saver.restore(session, model_fn) 91 | for batch_x_padded, batch_x_lengths in batch_iterator: 92 | np_x = batch_x_padded 93 | np_x_lengths = batch_x_lengths 94 | np_z = session.run( 95 | [model["encoding"]], feed_dict={x: np_x, x_lengths: 96 | np_x_lengths} 97 | )[0] 98 | break # single batch 99 | 100 | embed_dict = {} 101 | for i, utt_key in enumerate([keys[i] for i in batch_iterator.indices]): 102 | embed_dict[utt_key] = np_z[i] 103 | 104 | return embed_dict 105 | 106 | 107 | #-----------------------------------------------------------------------------# 108 | # UTILITY FUNCTIONS # 109 | #-----------------------------------------------------------------------------# 110 | 111 | def check_argv(): 112 | """Check the command line arguments.""" 113 | parser = argparse.ArgumentParser( 114 | description=__doc__.strip().split("\n")[0], add_help=False 115 | ) 116 | parser.add_argument("model_fn", type=str, help="model checkpoint filename") 117 | parser.add_argument("npz_fn", type=str, help="the NumPy archive to encode") 118 | parser.add_argument( 119 | "--output_npz_fn", type=str, 120 | help="if provided, the output is written to this NumPy archive " 121 | "instead of the model directory" 122 | ) 123 | if len(sys.argv) == 1: 124 | parser.print_help() 125 | sys.exit(1) 126 | return parser.parse_args() 127 | 128 | 129 | #-----------------------------------------------------------------------------# 130 | # MAIN FUNCTION # 131 | #-----------------------------------------------------------------------------# 132 | 133 | def main(): 134 | args = check_argv() 135 | 136 | # Do not output TensorFlow info and warning messages 137 | import warnings 138 | warnings.filterwarnings("ignore") 139 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" 140 | tf.logging.set_verbosity(tf.logging.ERROR) 141 | if type(tf.contrib) != type(tf): 142 | tf.contrib._warning = None 143 | 144 | # Embed data 145 | embed_dict = apply_model(args.model_fn, args.npz_fn) 146 | 147 | # Save embeddings 148 | model_dir, model_fn = path.split(args.model_fn) 149 | if args.output_npz_fn is None: 150 | npz_fn = path.join( 151 | model_dir, path.splitext(model_fn)[0] + "." + 152 | path.split(args.npz_fn)[-1] 153 | ) 154 | else: 155 | npz_fn = args.output_npz_fn 156 | print("Writing:", npz_fn) 157 | np.savez_compressed(npz_fn, **embed_dict) 158 | print(datetime.now()) 159 | 160 | 161 | if __name__ == "__main__": 162 | main() 163 | -------------------------------------------------------------------------------- /features/analyse_utd_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Analyse UTD pairs for the indicated language. 
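UTD terms and pairs are read from lists/<language>/train.utd_terms.list and
lists/<language>/train.utd_pairs.list, each UTD term is matched against the
forced alignments, and the percentage of pairs whose terms overlap with the
same ground truth word type is reported.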
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2020 9 | """ 10 | 11 | from os import path 12 | from tqdm import tqdm 13 | import argparse 14 | import codecs 15 | import glob 16 | import numpy as np 17 | import os 18 | import shutil 19 | import sys 20 | 21 | sys.path.append("..") 22 | 23 | from extract_features import get_overlap 24 | from paths import gp_data_dir, gp_alignments_dir 25 | 26 | 27 | #-----------------------------------------------------------------------------# 28 | # UTILITY FUNCTIONS # 29 | #-----------------------------------------------------------------------------# 30 | 31 | def check_argv(): 32 | """Check the command line arguments.""" 33 | parser = argparse.ArgumentParser( 34 | description=__doc__.strip().split("\n")[0], add_help=False 35 | ) 36 | parser.add_argument( 37 | "language", type=str, help="GlobalPhone language", 38 | choices=["BG", "CH", "CR", "CZ", "FR", "GE", "HA", "KO", "PL", "PO", 39 | "RU", "SP", "SW", "TH", "TU", "VN"] 40 | ) 41 | if len(sys.argv) == 1: 42 | parser.print_help() 43 | sys.exit(1) 44 | return parser.parse_args() 45 | 46 | 47 | #-----------------------------------------------------------------------------# 48 | # MAIN FUNCTION # 49 | #-----------------------------------------------------------------------------# 50 | 51 | def main(): 52 | args = check_argv() 53 | subset = "train" 54 | 55 | # Read UTD terms 56 | utd_list_fn = path.join("lists", args.language, "train.utd_terms.list") 57 | print("Reading:", utd_list_fn) 58 | # overlap_dict[speaker_utt][(start, end)] is list a tuples of 59 | # (label, (start, end), overlap, cluster_label) 60 | overlap_dict = {} 61 | with codecs.open(utd_list_fn, "r", "utf-8") as utd_list_f: 62 | for line in utd_list_f: 63 | term, speaker, utt, start_end = line.strip().split("_") 64 | start, end = start_end.split("-") 65 | start = int(start) 66 | end = int(end) 67 | if not speaker + "_" + utt in overlap_dict: 68 | overlap_dict[speaker + "_" + utt] = {} 69 | overlap_dict[speaker + "_" + utt][(start, end, term)] = [] 70 | 71 | # Read forced alignments 72 | fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm") 73 | print("Reading:", fa_fn) 74 | fa_dict = {} 75 | with codecs.open(fa_fn, "r", "utf-8") as fa_f: 76 | for line in fa_f: 77 | utt_key, _, start, duration, label = line.strip().split() 78 | start = float(start) 79 | duration = float(duration) 80 | end = start + duration 81 | start_frame = int(round(start*100)) 82 | end_frame = int(round(end*100)) 83 | if (label != "" and label != "sil" and label != "?" 
and 84 | label != "spn"): 85 | if not utt_key in fa_dict: 86 | fa_dict[utt_key] = {} 87 | fa_dict[utt_key][start_frame, end_frame] = label 88 | 89 | # Find ground truth terms with maximal overlap 90 | print("Getting ground truth terms with overlap:") 91 | overlap_label_dict = {} 92 | for utt_key in tqdm(fa_dict): 93 | # print(utt_key) 94 | if utt_key not in overlap_dict: 95 | continue 96 | for (fa_start, fa_end) in fa_dict[utt_key]: 97 | for (utd_start, utd_end, utd_term) in overlap_dict[utt_key]: 98 | overlap = get_overlap( 99 | utd_start, utd_end, fa_start, fa_end 100 | ) 101 | if overlap == 0: 102 | continue 103 | overlap_dict[utt_key][(utd_start, utd_end, utd_term)].append(( 104 | fa_dict[utt_key][(fa_start, fa_end)], 105 | (fa_start, fa_end), overlap 106 | )) 107 | term_key = "{}_{}_{:06d}-{:06d}".format( 108 | utd_term, utt_key, utd_start, utd_end 109 | ) 110 | if not term_key in overlap_label_dict: 111 | overlap_label_dict[term_key] = set() 112 | overlap_label_dict[term_key].add( 113 | fa_dict[utt_key][(fa_start, fa_end)] 114 | ) 115 | 116 | # Read UTD pairs 117 | pairs_fn = path.join("lists", args.language, "train.utd_pairs.list") 118 | pairs = [] 119 | n_pairs = 0 120 | n_correct = 0 121 | n_missing = 0 122 | with codecs.open(pairs_fn, "r", "utf-8") as pairs_f: 123 | for line in pairs_f: 124 | term1, term2 = line.strip().split(" ") 125 | pairs.append((term1, term2)) 126 | if (term1 not in overlap_label_dict or term2 not in 127 | overlap_label_dict): 128 | n_missing += 1 129 | continue 130 | if (len(overlap_label_dict[term1].intersection( 131 | overlap_label_dict[term2])) > 0): 132 | n_correct += 1 133 | n_pairs += 1 134 | print("Correct pairs: {:.2f}%".format(n_correct/n_pairs*100.0)) 135 | print("No. missing pairs: {} out of {}".format(n_missing, n_pairs)) 136 | 137 | 138 | 139 | # # Construct list of UTD labels and list of list of overlapping GT terms 140 | # labels = [] 141 | # overlap_lists = [] 142 | # for utt_key in tqdm(overlap_dict): 143 | # for (utd_start, utd_end, utd_term) in overlap_dict[utt_key]: 144 | # overlap_list = overlap_dict[utt_key][ 145 | # (utd_start, utd_end, utd_term) 146 | # ] 147 | # if len(overlap_list) == 0: 148 | # continue 149 | # labels.append(utd_term) 150 | # overlap_lists.append([i[0] for i in overlap_list]) 151 | 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /src/plotting.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some of these functions are based on 3 | http://deeplearning.net/tutorial/code/utils.py. 
4 | 5 | Author: Herman Kamper 6 | Contact: kamperh@gmail.com 7 | Date: 2015, 2016 8 | """ 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | def scale_unit_interval(mat, eps=1e-8): 15 | """Scales all values in `mat` to be between 0 and 1.""" 16 | mat = mat.copy() 17 | mat -= mat.min() 18 | mat *= 1.0 / (mat.max() + eps) 19 | return mat 20 | 21 | 22 | def array_to_pixels(mat): 23 | """Convert the given array to pixel values after scaling.""" 24 | mat = scale_unit_interval(mat) 25 | out_array = np.zeros(mat.shape, dtype="uint8") 26 | for i in range(mat.shape[0]): 27 | for j in range(mat.shape[1]): 28 | out_array[i, j] = mat[i, j] * 255 29 | return out_array 30 | 31 | 32 | def tile_images(X, image_shape, tile_shape, tile_spacing=(1, 1), 33 | scale_rows_unit_interval=True): 34 | """ 35 | Transform the 2-D matrix `X`, which has one flattened data instance or 36 | filter per row, into a matrix of pixel values with the data instances or 37 | filters layed out as tiles. 38 | 39 | Parameters 40 | ---------- 41 | X : 2-D matrix or 4-D tensor 42 | The data to transform. If the tensor is given, the data from the 43 | last two dimensions are tiled. 44 | image_shape : (height, width) 45 | Each row is reshaped to this dimensionality. 46 | tile_shape : (n_rows, n_columns) 47 | Number of rows and columns to have in the output. 48 | scale_rows_unit_interval : bool 49 | Should each row be scaled to interval of [0, 1] before plotting 50 | 51 | Return 52 | ------ 53 | out_array : matrix of type int 54 | Can be passed directly to `PIL.Image.fromarray`. 55 | """ 56 | 57 | assert len(image_shape) == 2 58 | assert len(tile_shape) == 2 59 | assert len(tile_spacing) == 2 60 | assert len(X.shape) == 2 or len(X.shape) == 4 61 | 62 | if len(X.shape) == 4: 63 | n_filters_out, n_channels_in, image_h, image_w = X.shape 64 | image_shape = image_h, image_w 65 | X = X.copy() 66 | X = X.reshape(n_filters_out*n_channels_in, image_h*image_w) 67 | 68 | # Dimensions 69 | image_h, image_w = image_shape 70 | spacing_h, spacing_w = tile_spacing 71 | n_tiles_h, n_tiles_w = tile_shape 72 | 73 | # Output dimensionality 74 | out_shape = [0, 0] 75 | out_shape[0] = (image_h + spacing_h) * n_tiles_h - spacing_h 76 | out_shape[1] = (image_w + spacing_w) * n_tiles_w - spacing_w 77 | 78 | # Output matrix 79 | out_array = np.zeros(out_shape, dtype="uint8") 80 | 81 | # Lay out tiles 82 | for i_tile in xrange(n_tiles_h): 83 | for j_tile in xrange(n_tiles_w): 84 | cur_image = X[i_tile * n_tiles_w + j_tile].reshape(image_shape) 85 | if scale_rows_unit_interval: 86 | cur_image = scale_unit_interval(cur_image) 87 | i = i_tile * (image_h + spacing_h) 88 | j = j_tile * (image_w + spacing_w) 89 | out_array[i:i + image_h, j:j + image_w] = cur_image * 255 90 | 91 | return out_array 92 | 93 | 94 | def make_patch_spines_invisible(ax): 95 | ax.set_frame_on(True) 96 | ax.patch.set_visible(False) 97 | for sp in ax.spines.values(): 98 | sp.set_visible(False) 99 | 100 | 101 | def plot_raw_embeds(embed_dict, types=None, mvn=False, **kwargs): 102 | """Plot all the embeddings of type `types`; if None, plot everything.""" 103 | 104 | # Get embeddings 105 | embeddings = [] 106 | labels = [] 107 | for key in embed_dict: 108 | if "_" in key: 109 | label = key.split("_")[0] 110 | else: 111 | label = key 112 | if types is None: 113 | labels.append(label) 114 | embeddings.append(embed_dict[key]) 115 | elif label in types: 116 | labels.append(label) 117 | embeddings.append(embed_dict[key]) 118 | n_embeds = len(embeddings) 119 | embeddings = 
np.array(embeddings) 120 | 121 | # Mean and variance normalise 122 | if mvn: 123 | embeddings = ( 124 | embeddings - embeddings.mean(axis=0) 125 | )/embeddings.std(axis=0) 126 | 127 | # Now sort by label 128 | sort_order = np.argsort(np.array(labels)) 129 | sorted_labels = np.array(labels)[sort_order] 130 | 131 | # Get cluster tick positions 132 | type_ticks = [0] 133 | for i in range(len(sorted_labels) - 1): 134 | if sorted_labels[i] != sorted_labels[i + 1]: 135 | type_ticks.append(i + 1) 136 | type_ticks.append(n_embeds) 137 | 138 | # Get label positions and labels 139 | type_label_ticks = [] 140 | type_labels = [] 141 | for i in sorted(list(set(labels))): 142 | where = np.where(sorted_labels == i)[0] 143 | if len(where) == 0: 144 | continue 145 | pos = int(np.mean(where)) 146 | type_label_ticks.append(pos) 147 | type_labels.append(i) 148 | 149 | # Variables used for plotting 150 | labels_offset = 1.04 151 | par2_linewidth = 0.5 152 | 153 | fig, host = plt.subplots(**kwargs) 154 | par2 = host.twinx() 155 | par2.spines["right"].set_position(("axes", labels_offset)) 156 | make_patch_spines_invisible(par2) 157 | par2.spines["right"].set_visible(True) 158 | par2.set_ylim([0, n_embeds]) 159 | par2.invert_yaxis() 160 | par2.set_yticks(type_ticks) 161 | par2.set_yticklabels([]) 162 | par2.tick_params(axis="y", width=par2_linewidth, length=10) 163 | par2.spines["right"].set_linewidth(par2_linewidth) 164 | par2.set_yticks(type_label_ticks, minor=True) 165 | par2.set_yticklabels(type_labels, minor=True) 166 | par2.set_ylabel("Word types") 167 | for line in par2.yaxis.get_minorticklines(): 168 | line.set_visible(False) 169 | 170 | cax = host.imshow( 171 | embeddings[sort_order], interpolation="nearest", 172 | aspect="auto" 173 | ) 174 | host.set_yticks([]) 175 | host.set_ylabel("Word embedding vectors") 176 | host.set_xlabel("Embedding dimensions") 177 | -------------------------------------------------------------------------------- /features/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions. 3 | 4 | Author: Herman Kamper 5 | Contact: kamperh@gmail.com 6 | Date: 2019 7 | """ 8 | 9 | from tqdm import tqdm 10 | import codecs 11 | import numpy as np 12 | import subprocess 13 | 14 | 15 | shell = lambda command: subprocess.Popen( 16 | command, shell=True, stdout=subprocess.PIPE 17 | ).communicate()[0] 18 | 19 | 20 | def filter_words(fa_fn, output_fn, min_frames=50, min_chars=5): 21 | """ 22 | Find words of at least `min_frames` frames and `min_chars` characters. 23 | 24 | Ground truth words are extracted from the forced alignment file `fa_fn` and 25 | written to the word list file `output_fn`. 26 | """ 27 | print("Reading:", fa_fn) 28 | print("Writing:", output_fn) 29 | n_tokens = 0 30 | with codecs.open(fa_fn, "r", "utf-8") as fa_f: 31 | with codecs.open(output_fn, "w", "utf-8") as output_f: 32 | for line in fa_f: 33 | utt_key, _, start, duration, label = line.strip().split() 34 | start = float(start) 35 | duration = float(duration) 36 | end = start + duration 37 | start_frame = int(round(start*100)) 38 | end_frame = int(round(end*100)) 39 | if (end_frame - start_frame >= min_frames and len(label) >= 40 | min_chars and label != "" and label != "sil" 41 | and label != "?" and label != "spn"): 42 | output_f.write( 43 | "{}_{}_{:06d}-{:06d}\n".format(label, utt_key, 44 | start_frame, end_frame + 1) 45 | ) 46 | n_tokens += 1 47 | print("No. 
tokens:", n_tokens) 48 | 49 | 50 | def segments_from_npz(input_npz_fn, segments_fn, output_npz_fn): 51 | """ 52 | Cut segments from a NumPy archive and save in a new archive. 53 | 54 | As keys, the archives use the format "label_spkr_utterance_start-end". 55 | """ 56 | 57 | # Read the .npz file 58 | print("Reading npz:", input_npz_fn) 59 | input_npz = np.load(input_npz_fn) 60 | 61 | # Create input npz segments dict 62 | utterance_segs = {} # utterance_segs["s08_02b_029657-029952"] 63 | # is (29657, 29952) 64 | for key in input_npz.keys(): 65 | s = key.split("_") 66 | if len(s) == 3: 67 | # Format: s08_02b_029657-029952 68 | utterance_segs[key] = tuple([int(i) for i in s[-1].split("-")]) 69 | elif len(s) == 2: 70 | # Format: s08_02b 71 | utterance_segs[key] = (0, input_npz[key].shape[0]) 72 | 73 | # Create target segments dict 74 | print("Reading segments:", segments_fn) 75 | target_segs = {} # target_segs["years_s01_01a_004951-005017"] 76 | # is ("s01_01a", 4951, 5017) 77 | for line in open(segments_fn): 78 | line_split = line.split("_") 79 | utterance = line_split[-3] + "_" + line_split[-2] 80 | start, end = line_split[-1].split("-") 81 | start = int(start) 82 | end = int(end) 83 | target_segs[line.strip()] = (utterance, start, end) 84 | 85 | print("Extracting segments:") 86 | output_npz = {} 87 | n_target_segs = 0 88 | for target_seg_key in tqdm(sorted(target_segs)): 89 | utterance, target_start, target_end = target_segs[target_seg_key] 90 | for utterance_key in [ 91 | i for i in utterance_segs.keys() if (i + 92 | "_").startswith(utterance + "_")]: 93 | # If like below: "GE008_128" also matches "GE008_12" 94 | # i for i in utterance_segs.keys() if i.startswith(utterance)]: 95 | utterance_start, utterance_end = utterance_segs[utterance_key] 96 | if (target_start >= utterance_start and target_start < 97 | utterance_end): 98 | start = target_start - utterance_start 99 | end = target_end - utterance_start 100 | output_npz[target_seg_key] = input_npz[ 101 | utterance_key 102 | ][start:end] 103 | n_target_segs += 1 104 | break 105 | 106 | print( 107 | "Extracted " + str(n_target_segs) + " out of " + str(len(target_segs)) 108 | + " segments" 109 | ) 110 | print("Writing:", output_npz_fn) 111 | np.savez(output_npz_fn, **output_npz) 112 | 113 | 114 | def terms_from_pairs(pairs_fn, output_list_fn): 115 | 116 | print("Reading:", pairs_fn) 117 | terms = set() 118 | with open(pairs_fn) as f: 119 | for line in f: 120 | line = line.replace("###", " ") 121 | (cluster, utt1, start1, end1, cluster2, utt2, start2, end2) = ( 122 | line.strip().split(" ") 123 | ) 124 | start1 = int(start1) 125 | end1 = int(end1) 126 | start2 = int(start2) 127 | end2 = int(end2) 128 | terms.add((cluster, utt1, start1, end1)) 129 | terms.add((cluster, utt2, start2, end2)) 130 | 131 | print("Writing:", output_list_fn) 132 | with open(output_list_fn, "w") as f: 133 | for cluster, utt, start, end in terms: 134 | f.write( 135 | cluster + "_" + utt + "_" + "%06d" % start + "-" + "%06d" % end 136 | + "\n" 137 | ) 138 | 139 | 140 | def format_enno_pairs(enno_pairs_fn, output_pairs_fn): 141 | print("Reading:", enno_pairs_fn) 142 | print("Writing:", output_pairs_fn) 143 | with codecs.open(enno_pairs_fn, "r", "utf-8") as enno_f: 144 | with codecs.open(output_pairs_fn, "w", "utf-8") as output_f: 145 | for line in enno_f: 146 | line = line.replace("###", " ") 147 | (cluster1, utt1, start1, end1, cluster2, utt2, start2, end2) = ( 148 | line.strip().split(" ") 149 | ) 150 | start1 = int(start1) 151 | end1 = int(end1) 152 | start2 = 
int(start2) 153 | end2 = int(end2) 154 | output_f.write( 155 | "{}_{}_{:06d}-{:06d} " 156 | "{}_{}_{:06d}-{:06d}\n".format(cluster1, utt1, start1, 157 | end1, cluster2, utt2, start2, end2) 158 | ) 159 | -------------------------------------------------------------------------------- /features/features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for extracting filterbank and MFCC features. 3 | 4 | Author: Herman Kamper 5 | Contact: kamperh@gmail.com 6 | Date: 2019 7 | """ 8 | 9 | from os import path 10 | from tqdm import tqdm 11 | import glob 12 | import numpy as np 13 | import scipy.io.wavfile as wav 14 | 15 | 16 | def extract_fbank_dir(dir): 17 | """ 18 | Extract filterbanks for all audio files in `dir` and return a dictionary. 19 | 20 | Each dictionary key will be the filename of the associated audio file 21 | without the extension. Mel-scale log filterbanks are extracted. 22 | """ 23 | import librosa 24 | feat_dict = {} 25 | for wav_fn in tqdm(sorted(glob.glob(path.join(dir, "*.wav")))): 26 | signal, sample_rate = librosa.core.load(wav_fn, sr=None) 27 | signal = preemphasis(signal, coeff=0.97) 28 | fbank = np.log(librosa.feature.melspectrogram( 29 | signal, sr=sample_rate, n_mels=40, 30 | n_fft=int(np.floor(0.025*sample_rate)), 31 | hop_length=int(np.floor(0.01*sample_rate)), fmin=64, fmax=8000, 32 | )) 33 | # from python_speech_features import logfbank 34 | # samplerate, signal = wav.read(wav_fn) 35 | # fbanks = logfbank( 36 | # signal, samplerate=samplerate, winlen=0.025, winstep=0.01, 37 | # nfilt=45, nfft=2048, lowfreq=0, highfreq=None, preemph=0, 38 | # winfunc=np.hamming 39 | # ) 40 | key = path.splitext(path.split(wav_fn)[-1])[0] 41 | feat_dict[key] = fbank.T 42 | return feat_dict 43 | 44 | 45 | def extract_mfcc_dir(dir): 46 | """ 47 | Extract MFCCs for all audio files in `dir` and return a dictionary. 48 | 49 | Each dictionary key will be the filename of the associated audio file 50 | without the extension. Deltas and double deltas are also extracted. 
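    Each value in the returned dictionary is an array of shape (n_frames, 39):
    13 MFCCs followed by their deltas and double-deltas, stacked column-wise.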
51 | """ 52 | import librosa 53 | feat_dict = {} 54 | for wav_fn in tqdm(sorted(glob.glob(path.join(dir, "*.wav")))): 55 | signal, sample_rate = librosa.core.load(wav_fn, sr=None) 56 | if len(signal) == 0: 57 | continue 58 | signal = preemphasis(signal, coeff=0.97) 59 | mfcc = librosa.feature.mfcc( 60 | signal, sr=sample_rate, n_mfcc=13, n_mels=24, #dct_type=3, 61 | n_fft=int(np.floor(0.025*sample_rate)), 62 | hop_length=int(np.floor(0.01*sample_rate)), fmin=64, fmax=8000, 63 | #htk=True 64 | ) 65 | # mfcc = librosa.feature.mfcc( 66 | # signal, sr=sample_rate, n_mfcc=13, 67 | # n_fft=int(np.floor(0.025*sample_rate)), 68 | # hop_length=int(np.floor(0.01*sample_rate)) 69 | # ) 70 | if mfcc.shape[1] < 9: # need at least 9 frames for deltas 71 | continue 72 | mfcc_delta = librosa.feature.delta(mfcc) 73 | mfcc_delta_delta = librosa.feature.delta(mfcc, order=2) 74 | key = path.splitext(path.split(wav_fn)[-1])[0] 75 | feat_dict[key] = np.hstack([mfcc.T, mfcc_delta.T, mfcc_delta_delta.T]) 76 | 77 | # # Temp 78 | # if "SP005_49" in wav_fn: 79 | # print(key) 80 | # print(feat_dict[key].shape) 81 | # assert False 82 | 83 | # from python_speech_features import delta 84 | # from python_speech_features import mfcc 85 | # sample_rate, signal = wav.read(wav_fn) 86 | # mfccs = mfcc( 87 | # signal, samplerate=sample_rate, winlen=0.025, winstep=0.01, 88 | # numcep=13, nfilt=24, nfft=None, lowfreq=0, highfreq=None, 89 | # preemph=0.97, ceplifter=22, appendEnergy=True, winfunc=np.hamming 90 | # ) 91 | # d_mfccs = delta(mfccs, 2) 92 | # dd_mfccs = delta(d_mfccs, 2) 93 | # key = path.splitext(path.split(wav_fn)[-1])[0] 94 | # feat_dict[key] = np.hstack([mfccs, d_mfccs, dd_mfccs]) 95 | 96 | # import matplotlib.pyplot as plt 97 | # plt.imshow(feat_dict[key][2000:2200,:]) 98 | # plt.show() 99 | # assert False 100 | return feat_dict 101 | 102 | 103 | def extract_vad(feat_dict, vad_dict): 104 | """ 105 | Remove silence based on voice activity detection (VAD). 106 | 107 | The `vad_dict` should have the same keys as `feat_dict` with the active 108 | speech regions given as lists of tuples of (start, end) frame, with the end 109 | excluded. 110 | """ 111 | output_dict = {} 112 | for utt_key in tqdm(sorted(feat_dict)): 113 | if utt_key not in vad_dict: 114 | print("Warning: Missing VAD for utterance", utt_key) 115 | continue 116 | for (start, end) in vad_dict[utt_key]: 117 | segment_key = utt_key + "_{:06d}-{:06d}".format(start, end) 118 | output_dict[segment_key] = feat_dict[utt_key][start:end, :] 119 | return output_dict 120 | 121 | 122 | def speaker_mvn(feat_dict): 123 | """ 124 | Perform per-speaker mean and variance normalisation. 125 | 126 | It is assumed that each of the keys in `feat_dict` starts with a speaker 127 | identifier followed by an underscore. 
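    For a speaker s with per-dimension mean mu_s and standard deviation
    sigma_s (pooled over all of that speaker's utterances), each frame x is
    mapped to (x - mu_s) / sigma_s.

    Illustrative sketch (the keys and shapes below are made up):

    >>> feats = {"HA001_utt1": np.random.randn(100, 39),
    ...          "HA001_utt2": np.random.randn(80, 39)}
    >>> normed = speaker_mvn(feats)  # statistics pooled over both utterances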
128 | """ 129 | 130 | speakers = set([key.split("_")[0] for key in feat_dict]) 131 | 132 | # Separate features per speaker 133 | speaker_features = {} 134 | for utt_key in sorted(feat_dict): 135 | speaker = utt_key.split("_")[0] 136 | if speaker not in speaker_features: 137 | speaker_features[speaker] = [] 138 | speaker_features[speaker].append(feat_dict[utt_key]) 139 | 140 | # Determine means and variances per speaker 141 | speaker_mean = {} 142 | speaker_std = {} 143 | for speaker in speakers: 144 | features = np.vstack(speaker_features[speaker]) 145 | speaker_mean[speaker] = np.mean(features, axis=0) 146 | speaker_std[speaker] = np.std(features, axis=0) 147 | 148 | # Normalise per speaker 149 | output_dict = {} 150 | for utt_key in tqdm(sorted(feat_dict)): 151 | speaker = utt_key.split("_")[0] 152 | output_dict[utt_key] = ( 153 | (feat_dict[utt_key] - speaker_mean[speaker]) / 154 | speaker_std[speaker] 155 | ) 156 | 157 | return output_dict 158 | 159 | 160 | def preemphasis(signal, coeff=0.97): 161 | """Perform preemphasis on the input `signal`.""" 162 | return np.append(signal[0], signal[1:] - coeff*signal[:-1]) 163 | -------------------------------------------------------------------------------- /blackbox/dp_align.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions and classes for aligning two lists using dynamic programming. 3 | 4 | The algorithm is based on on a slight variation of the method given at: 5 | http://www.avatar.se/molbioinfo2001/dynprog/adv_dynamic.html. By default NIST 6 | insertion, deletion and substitution penalties are used. 7 | 8 | Author: Herman Kamper 9 | Contact: kamperh@gmail.com 10 | Date: 2011, 2014, 2015, 2019 11 | """ 12 | 13 | import numpy as np 14 | 15 | 16 | #-----------------------------------------------------------------------------# 17 | # DYNAMIC PROGRAMMING CLASSES # 18 | #-----------------------------------------------------------------------------# 19 | 20 | class DPEntry: 21 | """Alignment type ("d", "i", "s", or "m") and an integer score.""" 22 | def __init__(self, align="m", score=0): 23 | self.align = align 24 | self.score = score 25 | 26 | 27 | class DPError(object): 28 | """ 29 | Attributes 30 | ---------- 31 | n_del : int 32 | n_ins : int 33 | n_sub : int 34 | n_match : int 35 | n_total : int 36 | """ 37 | 38 | def __init__(self, n_del=0, n_ins=0, n_sub=0, n_match=0, n_total=0): 39 | self.n_del = n_del 40 | self.n_ins = n_ins 41 | self.n_sub = n_sub 42 | self.n_match = n_match 43 | self.n_total = n_total 44 | 45 | def __add__(self, other): 46 | """Add this DPError to another.""" 47 | if type(other) == DPError: 48 | self.n_del += other.n_del 49 | self.n_ins += other.n_ins 50 | self.n_sub += other.n_sub 51 | self.n_match += other.n_match 52 | self.n_total += other.n_total 53 | return self 54 | 55 | __radd__ = __add__ 56 | __iadd__ = __add__ 57 | 58 | def __str__(self): 59 | """Returns a string representation of the alignment error.""" 60 | return ( 61 | "H = " + str(self.n_match) + ", D = " + str(self.n_del) + ", S = " 62 | + str(self.n_sub) + ", I = " + str(self.n_ins)+ ", N = " + 63 | str(self.n_total) 64 | ) 65 | 66 | def get_levenshtein(self): 67 | """Returns the Levenshtein distance of the alignment.""" 68 | return self.n_del + self.n_sub + self.n_ins 69 | 70 | def get_accuracy(self): 71 | """ 72 | Calculates the accuracy given the stored errors using the formula: 73 | Accuracy = (Matches - Insertions) / Total 74 | """ 75 | return float(self.n_match - self.n_ins) / self.n_total 76 
| 77 | def get_wer(self): 78 | """ 79 | Calculates the word error rate (WER) using: 80 | WER = (Substitutions + Deletions + Insertions) / Total 81 | """ 82 | return float(self.n_sub + self.n_del + self.n_ins) / self.n_total 83 | 84 | 85 | #-----------------------------------------------------------------------------# 86 | # DYNAMIC PROGRAMMING ALIGNMENT FUNCTION # 87 | #-----------------------------------------------------------------------------# 88 | 89 | def dp_align(ref_list, test_list, ins_penalty=3, del_penalty=3, sub_penalty=4): 90 | """ 91 | Performs dynamic programming alignment of `ref_list` to `test_list`. 92 | 93 | Parameters 94 | ---------- 95 | ref_list : list 96 | test_list : list 97 | """ 98 | 99 | # Initialise the alignment matrix 100 | dp_matrix = np.empty( 101 | [len(test_list) + 1, len(ref_list) + 1], dtype = object 102 | ) 103 | for i in range(len(test_list) + 1): 104 | for j in range(len(ref_list) + 1): 105 | dp_matrix[i][j] = DPEntry() 106 | 107 | # Initialise the origin 108 | dp_matrix[0][0].score = 0 109 | dp_matrix[0][0].align = "m" 110 | 111 | # The first row is all delections: 112 | for j in range(1, len(ref_list) + 1): 113 | dp_matrix[0][j].score = j*del_penalty 114 | dp_matrix[0][j].align = "d" 115 | 116 | # Fill dp_matrix 117 | for i in range(1, len(test_list) + 1): 118 | 119 | # First column is all insertions 120 | dp_matrix[i][0].score = i*ins_penalty 121 | dp_matrix[i][0].align = "i" 122 | 123 | for j in range(1, len(ref_list) + 1): 124 | del_score = dp_matrix[i, j - 1].score + del_penalty 125 | ins_score = dp_matrix[i - 1, j].score + ins_penalty 126 | 127 | if test_list[i - 1] == ref_list[j - 1]: 128 | 129 | # Considering a match 130 | match_score = dp_matrix[i - 1, j - 1].score 131 | 132 | # Test for a match 133 | if match_score <= del_score and match_score <= ins_score: 134 | dp_matrix[i, j].score = match_score 135 | dp_matrix[i, j].align = "m" 136 | # Test for a deletion 137 | elif del_score <= ins_score: 138 | dp_matrix[i, j].score = del_score 139 | dp_matrix[i, j].align = "d" 140 | # Test for an insertion (only option left) 141 | else: 142 | dp_matrix[i, j].score = ins_score 143 | dp_matrix[i, j].align = "i" 144 | 145 | else: 146 | 147 | # Considering a substitution 148 | sub_score = dp_matrix[i - 1, j - 1].score + sub_penalty 149 | 150 | # Test for a substitution 151 | if sub_score < del_score and sub_score <= ins_score: 152 | dp_matrix[i, j].score = sub_score 153 | dp_matrix[i, j].align = "s" 154 | # Test for a deletion 155 | elif del_score <= ins_score: 156 | dp_matrix[i, j].score = del_score 157 | dp_matrix[i, j].align = "d" 158 | # Test for an insertion (only option left) 159 | else: 160 | dp_matrix[i, j].score = ins_score 161 | dp_matrix[i, j].align = "i" 162 | 163 | # Perform alignment by tracking through the dp_matrix 164 | dp_errors = DPError() 165 | dp_errors.n_total = len(ref_list) 166 | i = len(test_list) 167 | j = len(ref_list) 168 | while i > 0 or j > 0: 169 | if dp_matrix[i, j].align == "m": 170 | #print test_list[i - 1], ref_list[j - 1] 171 | i -= 1 172 | j -= 1 173 | dp_errors.n_match += 1 174 | elif dp_matrix[i, j].align == "s": 175 | #print test_list[i - 1], ref_list[j - 1] 176 | i -= 1 177 | j -= 1 178 | dp_errors.n_sub += 1 179 | elif dp_matrix[i, j].align == "d": 180 | #print "-", ref_list[j - 1] 181 | j -= 1 182 | dp_errors.n_del += 1 183 | elif dp_matrix[i, j].align == "i": 184 | #print test_list[i - 1], "-" 185 | i -= 1 186 | dp_errors.n_ins += 1 187 | 188 | # Return the alignment results 189 | return dp_errors 190 | 191 | 
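# Illustrative usage sketch (not part of the original module): the function
# name and word lists below are made-up examples, complementing the
# character-level example in main() further down.

def example_word_alignment():
    """Align a made-up hypothesis against a reference with NIST penalties."""
    # The hypothesis has one extra word ("down"), giving a single insertion.
    errors = dp_align(["the", "cat", "sat"], ["the", "cat", "sat", "down"])
    print(errors)                 # H = 3, D = 0, S = 0, I = 1, N = 3
    print(errors.get_wer())       # (S + D + I) / N = 1/3
    print(errors.get_accuracy())  # (H - I) / N = 2/3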
192 | #-----------------------------------------------------------------------------# 193 | # MAIN FUNCTION # 194 | #-----------------------------------------------------------------------------# 195 | 196 | def main(): 197 | a = dp_align( 198 | "recycling", "recycle", ins_penalty=1, del_penalty=1, sub_penalty=1 199 | ) 200 | print( 201 | "Levenshtein distance between recycling and recycle:", 202 | a.get_levenshtein() 203 | ) 204 | 205 | 206 | if __name__ == "__main__": 207 | main() 208 | -------------------------------------------------------------------------------- /embeddings/data_io.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data input and output functions. 3 | 4 | Author: Herman Kamper 5 | Contact: kamperh@gmail.com 6 | Date: 2018, 2019 7 | """ 8 | 9 | from collections import Counter 10 | from os import path 11 | from tqdm import tqdm 12 | import numpy as np 13 | import random 14 | import sys 15 | 16 | sys.path.append(path.join("..", "src")) 17 | 18 | from tflego import NP_DTYPE 19 | 20 | 21 | def load_data_from_npz(npz_fn, min_length=None): 22 | print("Reading:", npz_fn) 23 | npz = np.load(npz_fn) 24 | x = [] 25 | labels = [] 26 | speakers = [] 27 | lengths = [] 28 | keys = [] 29 | n_items = 0 30 | for utt_key in tqdm(sorted(npz)): 31 | cur_x = npz[utt_key] 32 | if min_length is not None and len(cur_x) <= min_length: 33 | continue 34 | keys.append(utt_key) 35 | x.append(cur_x) 36 | utt_key_split = utt_key.split("_") 37 | word = utt_key_split[0] 38 | speaker = utt_key_split[1] 39 | labels.append(word) 40 | speakers.append(speaker) 41 | lengths.append(len(cur_x)) 42 | n_items += 1 43 | # print("No. items:", n_items) 44 | print("E.g. item shape:", x[0].shape) 45 | return (x, labels, lengths, keys, speakers) 46 | 47 | 48 | def filter_data(data, labels, lengths, keys, speakers, 49 | n_min_tokens_per_type=None, n_max_types=None, n_max_tokens=None, 50 | n_max_tokens_per_type=None): 51 | """ 52 | Filter the output from `load_data_from_npz` based on specifications. 53 | 54 | Each filter is applied independelty, so they could influence each other. 55 | E.g. `n_max_tokens` could further reduce the number of types if it is used 56 | in conjunction with `n_max_types`. 57 | 58 | Return 59 | ------ 60 | data, labels, lengths keys, speakers : list, list, list, list 61 | The filtered lists. 62 | """ 63 | 64 | random.seed(1) 65 | 66 | if n_max_types is not None: 67 | 68 | print("Maximum no. 
of types:", n_max_types) 69 | 70 | # Find valid types 71 | types = [i[0] for i in Counter(labels).most_common(n_max_types)] 72 | 73 | # Filter 74 | filtered_data = [] 75 | filtered_labels = [] 76 | filtered_lengths = [] 77 | filtered_keys = [] 78 | filtered_speakers = [] 79 | for i in range(len(data)): 80 | if labels[i] in types: 81 | filtered_data.append(data[i]) 82 | filtered_labels.append(labels[i]) 83 | filtered_lengths.append(lengths[i]) 84 | filtered_keys.append(keys[i]) 85 | filtered_speakers.append(speakers[i]) 86 | 87 | data = filtered_data 88 | labels = filtered_labels 89 | lengths = filtered_lengths 90 | keys = filtered_keys 91 | speakers = filtered_speakers 92 | 93 | if n_max_tokens_per_type is not None: 94 | 95 | print("Maximum tokens per type:", n_max_tokens_per_type) 96 | 97 | # Filter 98 | filtered_data = [] 99 | filtered_labels = [] 100 | filtered_lengths = [] 101 | filtered_keys = [] 102 | filtered_speakers = [] 103 | indices = list(range(len(data))) 104 | random.shuffle(indices) 105 | tokens_per_type = Counter() 106 | for i in indices: 107 | if tokens_per_type[labels[i]] < n_max_tokens_per_type: 108 | filtered_data.append(data[i]) 109 | filtered_labels.append(labels[i]) 110 | filtered_lengths.append(lengths[i]) 111 | filtered_keys.append(keys[i]) 112 | filtered_speakers.append(speakers[i]) 113 | tokens_per_type[labels[i]] += 1 114 | 115 | data = filtered_data 116 | labels = filtered_labels 117 | lengths = filtered_lengths 118 | keys = filtered_keys 119 | speakers = filtered_speakers 120 | 121 | if n_max_tokens is not None: 122 | 123 | print("Maximum no. of tokens:", n_max_tokens) 124 | 125 | # Filter 126 | filtered_data = [] 127 | filtered_labels = [] 128 | filtered_lengths = [] 129 | filtered_keys = [] 130 | filtered_speakers = [] 131 | indices = list(range(len(data))) 132 | random.shuffle(indices) 133 | # for i in range(len(data)): 134 | for i in indices[:n_max_tokens]: 135 | filtered_data.append(data[i]) 136 | filtered_labels.append(labels[i]) 137 | filtered_lengths.append(lengths[i]) 138 | filtered_keys.append(keys[i]) 139 | filtered_speakers.append(speakers[i]) 140 | 141 | data = filtered_data 142 | labels = filtered_labels 143 | lengths = filtered_lengths 144 | keys = filtered_keys 145 | speakers = filtered_speakers 146 | 147 | if n_min_tokens_per_type is not None: 148 | 149 | print("Minimum tokens per type:", n_min_tokens_per_type) 150 | 151 | # Find valid types 152 | types = [] 153 | counts = Counter(labels) 154 | for key in counts: 155 | if counts[key] >= n_min_tokens_per_type: 156 | types.append(key) 157 | 158 | # Filter 159 | filtered_data = [] 160 | filtered_labels = [] 161 | filtered_lengths = [] 162 | filtered_keys = [] 163 | filtered_speakers = [] 164 | for i in range(len(data)): 165 | if labels[i] in types: 166 | filtered_data.append(data[i]) 167 | filtered_labels.append(labels[i]) 168 | filtered_lengths.append(lengths[i]) 169 | filtered_keys.append(keys[i]) 170 | filtered_speakers.append(speakers[i]) 171 | 172 | data = filtered_data 173 | labels = filtered_labels 174 | lengths = filtered_lengths 175 | keys = filtered_keys 176 | speakers = filtered_speakers 177 | 178 | print("No. types:", len(Counter(labels))) 179 | print("No. 
tokens:", len(labels)) 180 | return (data, labels, lengths, keys, speakers) 181 | 182 | 183 | def trunc_and_limit_dim(x, lengths, d_frame, max_length): 184 | for i, seq in enumerate(x): 185 | x[i] = x[i][:max_length, :d_frame] 186 | if max_length is not None: 187 | lengths[i] = min(lengths[i], max_length) 188 | 189 | 190 | def pad_sequences(x, n_padded, center_padded=True, return_mask=False): 191 | """Return the padded sequences and their original lengths.""" 192 | padded_x = np.zeros((len(x), n_padded, x[0].shape[1]), dtype=NP_DTYPE) 193 | if return_mask: 194 | mask_x = np.zeros((len(x), n_padded), dtype=NP_DTYPE) 195 | lengths = [] 196 | for i_data, cur_x in enumerate(x): 197 | length = cur_x.shape[0] 198 | if center_padded: 199 | padding = int(np.round((n_padded - length) / 2.)) 200 | if length <= n_padded: 201 | padded_x[i_data, padding:padding + length, :] = cur_x 202 | if return_mask: 203 | mask_x[i_data, padding:padding + length] = 1 204 | else: 205 | # Cut out snippet from sequence exceeding n_padded 206 | padded_x[i_data, :, :] = cur_x[-padding:-padding + n_padded] 207 | if return_mask: 208 | mask_x[i_data, :] = 1 209 | lengths.append(min(length, n_padded)) 210 | else: 211 | length = min(length, n_padded) 212 | padded_x[i_data, :length, :] = cur_x[:length, :] 213 | if return_mask: 214 | mask_x[i_data, :length] = 1 215 | lengths.append(length) 216 | if return_mask: 217 | return padded_x, lengths, mask_x 218 | else: 219 | return padded_x, lengths 220 | -------------------------------------------------------------------------------- /embeddings/apply_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Encode the set using the specified model. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2018, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | import argparse 14 | import pickle 15 | import numpy as np 16 | import os 17 | import sys 18 | import tensorflow as tf 19 | 20 | sys.path.append(path.join("..", "src")) 21 | 22 | from link_mfcc import sixteen_languages 23 | from tflego import NP_DTYPE, TF_DTYPE, NP_ITYPE, TF_ITYPE 24 | import batching 25 | import data_io 26 | 27 | 28 | #-----------------------------------------------------------------------------# 29 | # APPLY MODEL FUNCTIONS # 30 | #-----------------------------------------------------------------------------# 31 | 32 | def build_model(x, x_lengths, options_dict): 33 | model_dict = {} 34 | if options_dict["script"] == "train_cae_rnn": 35 | import train_cae_rnn 36 | cae = train_cae_rnn.build_cae_from_options_dict( 37 | x, x_lengths, x_lengths, options_dict 38 | ) 39 | model_dict["output"] = cae["y"] 40 | model_dict["encoding"] = cae["z"] 41 | model_dict["mask"] = cae["mask"] 42 | elif options_dict["script"] == "train_vae": 43 | import train_vae 44 | vae = train_vae.build_vae_from_options_dict(x, x_lengths, options_dict) 45 | model_dict["output"] = vae["decoder_output"] 46 | model_dict["encoding"] = vae["latent_layer"]["z_mean"] 47 | model_dict["mask"] = vae["mask"] 48 | elif options_dict["script"] == "train_siamese_rnn": 49 | import train_siamese_rnn 50 | siamese = train_siamese_rnn.build_siamese_from_options_dict( 51 | x, x_lengths, options_dict 52 | ) 53 | model_dict["encoding"] = siamese["output"] 54 | elif options_dict["script"] == "train_siamese_cnn": 55 | import train_siamese_cnn 56 | siamese = train_siamese_cnn.build_siamese_cnn_from_options_dict( 57 | x, options_dict 58 | ) 59 | 
model_dict["encoding"] = siamese["output"] 60 | elif options_dict["script"] == "train_rnn": 61 | import train_rnn 62 | rnn = train_rnn.build_rnn_from_options_dict( 63 | x, x_lengths, options_dict 64 | ) 65 | model_dict["encoding"] = rnn["encoding"] 66 | elif options_dict["script"] == "train_rnn_split": 67 | import train_rnn_split 68 | rnn = train_rnn_split.build_rnn_from_options_dict( 69 | x, x_lengths, options_dict 70 | ) 71 | model_dict["encoding"] = rnn["encoding"] 72 | else: 73 | assert False, "model type not supported" 74 | return model_dict 75 | 76 | 77 | def apply_model(model_fn, subset, language): 78 | 79 | # assert language is None # to-do 80 | 81 | # Load the model options 82 | model_dir = path.split(model_fn)[0] 83 | options_dict_fn = path.join(model_dir, "options_dict.pkl") 84 | print("Reading:", options_dict_fn) 85 | with open(options_dict_fn, "rb") as f: 86 | options_dict = pickle.load(f) 87 | 88 | # Load data 89 | npz_fn = path.join("data", language, subset + ".npz") 90 | x_data, labels, lengths, keys, speakers = data_io.load_data_from_npz( 91 | npz_fn 92 | ) 93 | 94 | if "cnn" in options_dict["script"]: 95 | 96 | # Pad and flatten data 97 | x_data, _ = data_io.pad_sequences( 98 | x_data, options_dict["max_length"], True 99 | ) 100 | x_data = np.transpose(x_data, (0, 2, 1)) 101 | x_data = x_data.reshape((-1, options_dict["d_in"])) 102 | 103 | # Build model 104 | x = tf.placeholder(TF_DTYPE, [None, options_dict["d_in"]]) 105 | model = build_model(x, None, options_dict) 106 | 107 | # Embed data 108 | batch_iterator = batching.LabelledIterator( 109 | x_data, None, x_data.shape[0], False 110 | ) 111 | saver = tf.train.Saver() 112 | with tf.Session() as session: 113 | saver.restore(session, model_fn) 114 | for batch_x in batch_iterator: 115 | np_z = session.run( 116 | [model["encoding"]], feed_dict={x: batch_x})[0] 117 | break # single batch 118 | 119 | else: # rnn 120 | 121 | # Truncate and limit dimensionality 122 | data_io.trunc_and_limit_dim( 123 | x_data, lengths, options_dict["n_input"], 124 | options_dict["max_length"] 125 | ) 126 | 127 | # Build model 128 | x = tf.placeholder(TF_DTYPE, [None, None, options_dict["n_input"]]) 129 | x_lengths = tf.placeholder(TF_ITYPE, [None]) 130 | model = build_model(x, x_lengths, options_dict) 131 | 132 | # Embed data 133 | batch_iterator = batching.SimpleIterator(x_data, len(x_data), False) 134 | saver = tf.train.Saver() 135 | with tf.Session() as session: 136 | saver.restore(session, model_fn) 137 | for batch_x_padded, batch_x_lengths in batch_iterator: 138 | np_x = batch_x_padded 139 | np_x_lengths = batch_x_lengths 140 | np_z = session.run( 141 | [model["encoding"]], feed_dict={x: np_x, x_lengths: 142 | np_x_lengths} 143 | )[0] 144 | break # single batch 145 | 146 | embed_dict = {} 147 | for i, utt_key in enumerate([keys[i] for i in batch_iterator.indices]): 148 | embed_dict[utt_key] = np_z[i] 149 | 150 | return embed_dict 151 | 152 | 153 | #-----------------------------------------------------------------------------# 154 | # UTILITY FUNCTIONS # 155 | #-----------------------------------------------------------------------------# 156 | 157 | def check_argv(): 158 | """Check the command line arguments.""" 159 | parser = argparse.ArgumentParser( 160 | description=__doc__.strip().split("\n")[0], add_help=False 161 | ) 162 | parser.add_argument("model_fn", type=str, help="model checkpoint filename") 163 | parser.add_argument( 164 | "language", type=str, help="language to apply model to", 165 | choices=sixteen_languages 166 | ) 167 | 
parser.add_argument( 168 | "subset", type=str, help="subset to apply model to", 169 | choices=["val", "test"] 170 | ) 171 | if len(sys.argv) == 1: 172 | parser.print_help() 173 | sys.exit(1) 174 | return parser.parse_args() 175 | 176 | 177 | #-----------------------------------------------------------------------------# 178 | # MAIN FUNCTION # 179 | #-----------------------------------------------------------------------------# 180 | 181 | def main(): 182 | args = check_argv() 183 | 184 | # Do not output TensorFlow info and warning messages 185 | import warnings 186 | warnings.filterwarnings("ignore") 187 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" 188 | tf.logging.set_verbosity(tf.logging.ERROR) 189 | if type(tf.contrib) != type(tf): 190 | tf.contrib._warning = None 191 | 192 | # Embed data 193 | embed_dict = apply_model(args.model_fn, args.subset, args.language) 194 | 195 | # Save embeddings 196 | model_dir, model_fn = path.split(args.model_fn) 197 | if args.language is None: 198 | npz_fn = args.subset + ".npz" 199 | else: 200 | npz_fn = args.language + "." + args.subset + ".npz" 201 | npz_fn = path.join(model_dir, path.splitext(model_fn)[0] + "." + npz_fn) 202 | print("Writing:", npz_fn) 203 | np.savez_compressed(npz_fn, **embed_dict) 204 | print(datetime.now()) 205 | 206 | 207 | if __name__ == "__main__": 208 | main() 209 | -------------------------------------------------------------------------------- /qbe/sandbox.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sandox: QbE keyword lists" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Herman Kamper, Stellenbosch University, 2018-2019." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Preliminaries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 11, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "The autoreload extension is already loaded. 
To reload it, use:\n", 34 | " %reload_ext autoreload\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "%load_ext autoreload\n", 41 | "%autoreload 2\n", 42 | "\n", 43 | "from collections import Counter\n", 44 | "from os import path\n", 45 | "import codecs\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "import numpy as np\n", 48 | "import random" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Keywords" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "dev_keywords_fn = \"../features/mfcc/HA/ha.dev.gt_words.npz\"\n", 65 | "test_fn = \"../features/mfcc/HA/ha.eval.npz\"\n", 66 | "dev_keywords_features = np.load(dev_keywords_fn)\n", 67 | "test_features = np.load(test_fn)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def read_forced_alignment(globalphone_fa_fn):\n", 77 | " \"\"\"Return a dictionary of transcriptions obtained from a GlobalPhone forced alignment file.\"\"\"\n", 78 | " transcription_dict = {}\n", 79 | " with codecs.open(globalphone_fa_fn, \"r\", \"utf-8\") as f:\n", 80 | " for line in f:\n", 81 | " line = line.strip().split(\" \")\n", 82 | " utterance_key = line[0]\n", 83 | " label = line[4].lower()\n", 84 | " if utterance_key not in transcription_dict:\n", 85 | " transcription_dict[utterance_key] = []\n", 86 | " transcription_dict[utterance_key].append(label)\n", 87 | " return transcription_dict \n", 88 | "\n", 89 | "test_transcription = read_forced_alignment(\"/home/kamperh/endgame/datasets/globalphone_alignments/HA/eval.ctm\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "test_counter = Counter()\n", 99 | "for utterance_key in test_transcription:\n", 100 | " for word in test_transcription[utterance_key]:\n", 101 | " test_counter[word] += 1" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 43, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "No. words more than 9: 111\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "n = 9\n", 119 | "more_than_n = set()\n", 120 | "for word, count in test_counter.most_common():\n", 121 | " if count >= n:\n", 122 | " more_than_n.add(word)\n", 123 | "print(\"No. words more than {}: {}\".format(n, len(more_than_n)))" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 44, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "dev_counter = Counter()\n", 133 | "dev_words = set()\n", 134 | "for segment_key in dev_keywords_features:\n", 135 | " word = segment_key.split(\"_\")[0].lower()\n", 136 | " dev_counter[word] += 1\n", 137 | " dev_words.add(word)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 45, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "No. 
words overlap: 33\n", 150 | "aikin: 6 times in dev\n", 151 | "amfani: 8 times in dev\n", 152 | "amurka: 6 times in dev\n", 153 | "arziki: 2 times in dev\n", 154 | "babban: 2 times in dev\n", 155 | "bayan: 6 times in dev\n", 156 | "bayyana: 5 times in dev\n", 157 | "birnin: 7 times in dev\n", 158 | "cikin: 3 times in dev\n", 159 | "daban: 1 times in dev\n", 160 | "daular: 6 times in dev\n", 161 | "domin: 2 times in dev\n", 162 | "duniya: 8 times in dev\n", 163 | "hankali: 7 times in dev\n", 164 | "hanyar: 5 times in dev\n", 165 | "harkokin: 12 times in dev\n", 166 | "kasance: 6 times in dev\n", 167 | "kasar: 14 times in dev\n", 168 | "kasashe: 4 times in dev\n", 169 | "kasashen: 13 times in dev\n", 170 | "lokacin: 12 times in dev\n", 171 | "majalisar: 8 times in dev\n", 172 | "mutane: 18 times in dev\n", 173 | "samun: 5 times in dev\n", 174 | "sarki: 7 times in dev\n", 175 | "sosai: 25 times in dev\n", 176 | "tattalin: 4 times in dev\n", 177 | "tsakanin: 11 times in dev\n", 178 | "wajen: 2 times in dev\n", 179 | "wanda: 1 times in dev\n", 180 | "wannan: 5 times in dev\n", 181 | "zaman: 1 times in dev\n", 182 | "zamanin: 9 times in dev\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "overlap = more_than_n.intersection(dev_words)\n", 188 | "print(\"No. words overlap:\", len(overlap))\n", 189 | "for word in sorted(overlap):\n", 190 | " print(\"{}: {} times in dev\".format(word, dev_counter[word]))" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 48, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "n_keywords = 30\n", 200 | "keywords = list(overlap)\n", 201 | "random.seed(1)\n", 202 | "random.shuffle(keywords)\n", 203 | "keywords = keywords[:n_keywords]" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 49, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "Keywords: ['amfani', 'amurka', 'arziki', 'babban', 'bayan', 'bayyana', 'birnin', 'daban', 'daular', 'domin', 'duniya', 'hankali', 'hanyar', 'harkokin', 'kasar', 'kasashe', 'kasashen', 'lokacin', 'majalisar', 'mutane', 'samun', 'sarki', 'sosai', 'tattalin', 'tsakanin', 'wajen', 'wanda', 'wannan', 'zaman', 'zamanin']\n", 216 | "No. keywords: 30\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "print(\"Keywords:\", sorted(keywords))\n", 222 | "print(\"No. 
keywords:\", len(keywords))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 50, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "with codecs.open(\"keywords.txt\", \"w\", \"utf-8\") as f:\n", 232 | " for keyword in sorted(keywords):\n", 233 | " f.write(keyword + \"\\n\")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.5.2" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /embeddings/analyse_embeds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Analyse a given file with embedding tokens. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2015, 2018, 2019 9 | """ 10 | 11 | from os import path 12 | from sklearn import decomposition, ensemble, manifold 13 | import argparse 14 | import matplotlib.pyplot as plt 15 | import numpy as np 16 | import random 17 | import sys 18 | 19 | basedir = path.dirname(path.abspath(__file__)) 20 | sys.path.append(path.join(basedir, "..", "src")) 21 | 22 | import plotting 23 | 24 | 25 | #-----------------------------------------------------------------------------# 26 | # PLOTTING # 27 | #-----------------------------------------------------------------------------# 28 | 29 | def plot_raw_embeds(npz, types=None): 30 | """Plot all the embeddings of type `types`, if None plot everything.""" 31 | 32 | # Get embeddings 33 | embeddings = [] 34 | labels = [] 35 | for key in npz: 36 | if "_" in key: 37 | label = key.split("_")[0] 38 | else: 39 | label = key 40 | if types is None: 41 | labels.append(label) 42 | embeddings.append(npz[key]) 43 | elif label in types: 44 | labels.append(label) 45 | embeddings.append(npz[key]) 46 | n_embeds = len(embeddings) 47 | 48 | # Now sort by label 49 | sort_order = np.argsort(np.array(labels)) 50 | sorted_labels = np.array(labels)[sort_order] 51 | 52 | # Get cluster tick positions 53 | type_ticks = [0] 54 | for i in range(len(sorted_labels) - 1): 55 | if sorted_labels[i] != sorted_labels[i + 1]: 56 | type_ticks.append(i + 1) 57 | type_ticks.append(n_embeds) 58 | 59 | # Get label positions and labels 60 | type_label_ticks = [] 61 | type_labels = [] 62 | for i in sorted(list(set(labels))): 63 | where = np.where(sorted_labels == i)[0] 64 | if len(where) == 0: 65 | continue 66 | pos = int(np.mean(where)) 67 | type_label_ticks.append(pos) 68 | type_labels.append(i) 69 | 70 | # print("Plotting all embeddings") 71 | 72 | # Variables used for plotting 73 | labels_offset = 1.04 74 | par2_linewidth = 0.5 75 | 76 | fig, host = plt.subplots() 77 | par2 = host.twinx() 78 | par2.spines["right"].set_position(("axes", labels_offset)) 79 | plotting.make_patch_spines_invisible(par2) 80 | par2.spines["right"].set_visible(True) 81 | par2.set_ylim([0, n_embeds]) 82 | par2.invert_yaxis() 83 | par2.set_yticks(type_ticks) 84 | par2.set_yticklabels([]) 85 | 
par2.tick_params(axis="y", width=par2_linewidth, length=10) 86 | par2.spines["right"].set_linewidth(par2_linewidth) 87 | par2.set_yticks(type_label_ticks, minor=True) 88 | par2.set_yticklabels(type_labels, minor=True) 89 | par2.set_ylabel("Word types") 90 | for line in par2.yaxis.get_minorticklines(): 91 | line.set_visible(False) 92 | cax = host.imshow( 93 | np.array(embeddings)[sort_order], interpolation="nearest", 94 | aspect="auto" 95 | ) 96 | host.set_yticks([]) 97 | # host.set_xticklabels([]) 98 | host.set_ylabel("Word embedding vector") 99 | host.set_xlabel("Embedding dimensions") 100 | # fig.colorbar(cax, orientation="horizontal") 101 | 102 | 103 | # From http://scikit-learn.org/stable/_downloads/plot_lle_digits.py. 104 | def plot_embeds_2d(embeds_dict, types=None): 105 | print("Computing PCA projection") 106 | embeddings, labels = get_embeds_and_labels(embeds_dict, types) 107 | X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform( 108 | embeddings 109 | ) 110 | plot_labelled_2d_data(X_pca, labels, "PCA") 111 | 112 | print("Computing t-SNE embedding") 113 | embeddings, labels = get_embeds_and_labels(embeds_dict, types) 114 | tsne = manifold.TSNE( 115 | n_components=2, perplexity=20, init="random", random_state=1 116 | ) 117 | X_tsne = tsne.fit_transform(embeddings) 118 | plot_labelled_2d_data(X_tsne, labels, "t-SNE") 119 | 120 | # print("Computing Spectral embedding") 121 | # embedder = manifold.SpectralEmbedding(n_components=2, random_state=0, 122 | # eigen_solver="arpack") 123 | # X_se = embedder.fit_transform(embeddings) 124 | # plot_labelled_2d_data(X_se, labels) 125 | 126 | # print("Computing Totally Random Trees embedding") 127 | # hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0, 128 | # max_depth=5) 129 | # X_transformed = hasher.fit_transform(embeddings) 130 | # pca = decomposition.TruncatedSVD(n_components=2) 131 | # X_reduced = pca.fit_transform(X_transformed) 132 | # plot_labelled_2d_data(X_reduced, labels) 133 | 134 | # print("Computing MDS embedding") 135 | # clf = manifold.MDS(n_components=2, n_init=1, max_iter=100) 136 | # X_mds = clf.fit_transform(embeddings) 137 | # plot_labelled_2d_data(X_mds, labels) 138 | 139 | print("Computing Isomap embedding") 140 | n_neighbors = 10 141 | X_iso = manifold.Isomap( 142 | n_neighbors, n_components=2).fit_transform(embeddings 143 | ) 144 | plot_labelled_2d_data( 145 | X_iso, labels, "Isomap (" + str(n_neighbors) + " neighbours)" 146 | ) 147 | 148 | 149 | def plot_labelled_2d_data(X, labels, title=None): 150 | plt.figure() 151 | classes = set(labels) 152 | for label in sorted(classes): 153 | indices = np.where(np.array(labels) == label)[0] 154 | plt.scatter(X[indices, 0], X[indices, 1], label=label) 155 | if title is not None: 156 | plt.title(title) 157 | plt.legend(loc="best", ncol=2) 158 | 159 | 160 | def plot_data_labelled(X, labels, title=None): 161 | ordered_labels = sorted(set(labels)) 162 | n_labels = len(set(labels)) 163 | 164 | x_min, x_max = np.min(X, 0), np.max(X, 0) 165 | X = (X - x_min) / (x_max - x_min) 166 | 167 | plt.figure() 168 | ax = plt.subplot(111) 169 | for i in range(X.shape[0]): 170 | plt.text( 171 | X[i, 0], X[i, 1], str(labels[i]), 172 | color=plt.cm.Set1(1.0*ordered_labels.index(labels[i]) / n_labels), 173 | fontdict={"weight": "bold", "size": 9} 174 | ) 175 | 176 | if title is not None: 177 | plt.title(title) 178 | 179 | plt.xticks([]), plt.yticks([]) 180 | 181 | 182 | #-----------------------------------------------------------------------------# 183 | # UTILITY 
FUNCTIONS # 184 | #-----------------------------------------------------------------------------# 185 | 186 | def check_argv(): 187 | """Check the command line arguments.""" 188 | parser = argparse.ArgumentParser( 189 | description=__doc__.strip().split("\n")[0], add_help=False 190 | ) 191 | parser.add_argument("npz_fn", type=str, help="") 192 | parser.add_argument( 193 | "--word_type", type=str, 194 | help="show a plot for these word types, given as " 195 | "comma-seperated values" 196 | ) 197 | parser.add_argument( 198 | "--plot_rnd", type=int, 199 | help="plot this number of randomly selected embeddings" 200 | ) 201 | parser.add_argument( 202 | "--plot_all", action="store_true", help="plot all embeddings" 203 | ) 204 | parser.add_argument( 205 | "--normalize", dest="normalize", action="store_true", 206 | help="normalize embeddings to unit sphere before calculating " 207 | "distances (default is not to do this)" 208 | ) 209 | parser.set_defaults(normalize=False) 210 | if len(sys.argv) == 1: 211 | parser.print_help() 212 | sys.exit(1) 213 | return parser.parse_args() 214 | 215 | 216 | def get_embeds_and_labels(embeds_dict, types=None): 217 | embeddings = [] 218 | labels = [] 219 | for utt in embeds_dict: 220 | if "_" in utt: 221 | label = buckeye_utt_to_label(utt) 222 | else: 223 | label = utt 224 | if types is None: 225 | labels.append(label) 226 | embeddings.append(embeds_dict[utt]) 227 | elif label in types: 228 | labels.append(label) 229 | embeddings.append(embeds_dict[utt]) 230 | embeddings = np.array(embeddings) 231 | return embeddings, labels 232 | 233 | 234 | def buckeye_utt_to_label(utt): 235 | return utt.split("_")[0] 236 | 237 | 238 | #-----------------------------------------------------------------------------# 239 | # MAIN FUNCTION # 240 | #-----------------------------------------------------------------------------# 241 | 242 | def main(): 243 | args = check_argv() 244 | 245 | print("Reading:", args.npz_fn) 246 | npz = np.load(args.npz_fn) 247 | 248 | if args.normalize: 249 | print("Normalizing embeddings") 250 | norm_npz = {} 251 | for key in npz: 252 | embed = npz[key] 253 | norm_npz[key] = embed/np.linalg.norm(embed) 254 | npz = norm_npz 255 | 256 | print( 257 | "Minimum embedding value:", np.min([np.min(npz[key]) for key in npz]) 258 | ) 259 | print( 260 | "Maximum embedding value:", np.max([np.max(npz[key]) for key in npz]) 261 | ) 262 | 263 | if args.word_type: 264 | if not "," in args.word_type: 265 | # A single word type 266 | print("Plotting embeddings for type:", args.word_type) 267 | embeddings = [] 268 | for key in npz: 269 | if args.word_type in key: 270 | embed = npz[key] 271 | embeddings.append(embed) 272 | print("No. 
embeddings matching type:", len(embeddings)) 273 | plt.imshow(embeddings, interpolation="nearest", aspect="auto") 274 | else: 275 | # Multiple word types 276 | # plot_embeds_tsne(npz, args.word_type.split(",")) 277 | plot_raw_embeds(npz, args.word_type.split(",")) 278 | plot_embeds_2d(npz, args.word_type.split(",")) 279 | 280 | # print("Example embedding:", npz[npz.keys()[0]]) 281 | 282 | if args.plot_all: 283 | plot_raw_embeds(npz) 284 | # plot_embeds_2d(npz) 285 | 286 | if args.plot_rnd is not None: 287 | print("Analysing", args.plot_rnd, "randomly sampled embeddings") 288 | random.seed(42) 289 | sample_keys = random.sample(npz.keys(), args.plot_rnd) 290 | npz_sampled = {} 291 | for key in sample_keys: 292 | npz_sampled[key] = npz[key] 293 | plot_raw_embeds(npz_sampled) 294 | plot_embeds_2d(npz_sampled) 295 | 296 | if args.word_type or args.plot_all or args.plot_rnd: 297 | plt.show() 298 | 299 | 300 | if __name__ == "__main__": 301 | main() 302 | -------------------------------------------------------------------------------- /qbe/apply_model_dense.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Apply a model to dense segmentationi intervals. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2018, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from tqdm import tqdm 14 | import argparse 15 | import pickle 16 | import numpy as np 17 | import os 18 | import sys 19 | import tensorflow as tf 20 | 21 | sys.path.append(path.join("..", "src")) 22 | sys.path.append(path.join("..", "embeddings")) 23 | 24 | from apply_model import build_model 25 | from tflego import NP_DTYPE, TF_DTYPE, NP_ITYPE, TF_ITYPE 26 | import batching 27 | import data_io 28 | 29 | 30 | #-----------------------------------------------------------------------------# 31 | # APPLY MODEL FUNCTIONS # 32 | #-----------------------------------------------------------------------------# 33 | 34 | """ 35 | def build_model(x, x_lengths, options_dict): 36 | model_dict = {} 37 | if options_dict["script"] == "train_cae_rnn": 38 | import train_cae_rnn 39 | cae = train_cae_rnn.build_cae_from_options_dict( 40 | x, x_lengths, x_lengths, options_dict 41 | ) 42 | model_dict["output"] = cae["y"] 43 | model_dict["encoding"] = cae["z"] 44 | model_dict["mask"] = cae["mask"] 45 | elif options_dict["script"] == "train_vae": 46 | import train_vae 47 | vae = train_vae.build_vae_from_options_dict(x, x_lengths, options_dict) 48 | model_dict["output"] = vae["decoder_output"] 49 | model_dict["encoding"] = vae["latent_layer"]["z_mean"] 50 | model_dict["mask"] = vae["mask"] 51 | elif options_dict["script"] == "train_siamese_rnn": 52 | import train_siamese_rnn 53 | siamese = train_siamese_rnn.build_siamese_from_options_dict( 54 | x, x_lengths, options_dict 55 | ) 56 | model_dict["encoding"] = siamese["output"] 57 | elif options_dict["script"] == "train_siamese_cnn": 58 | import train_siamese_cnn 59 | siamese = train_siamese_cnn.build_siamese_cnn_from_options_dict( 60 | x, options_dict 61 | ) 62 | model_dict["encoding"] = siamese["output"] 63 | elif options_dict["script"] == "train_rnn": 64 | import train_rnn 65 | rnn = train_rnn.build_rnn_from_options_dict( 66 | x, x_lengths, options_dict 67 | ) 68 | model_dict["encoding"] = rnn["encoding"] 69 | else: 70 | assert False, "model type not supported" 71 | return model_dict 72 | """ 73 | 74 | 75 | def apply_model(model_fn, language, subset, segtag): 76 | 77 | # Load the model 
options 78 | model_dir = path.split(model_fn)[0] 79 | options_dict_fn = path.join(model_dir, "options_dict.pkl") 80 | print("Reading:", options_dict_fn) 81 | with open(options_dict_fn, "rb") as f: 82 | options_dict = pickle.load(f) 83 | 84 | # Load data and intervals 85 | npz_fn = path.join("data", language, subset + ".npz") 86 | x_data, labels, lengths, keys, speakers = data_io.load_data_from_npz( 87 | npz_fn 88 | ) 89 | seglist_fn = path.join( 90 | "data", language, "search.seglist." + segtag + ".pkl" 91 | ) 92 | print("Reading:", seglist_fn) 93 | with open(seglist_fn, "rb") as f: 94 | seglist_dict = pickle.load(f) 95 | seglists = [seglist_dict[i] for i in keys] 96 | print("No. utterances:", len(x_data)) 97 | n_intervals = sum([len(i) for i in seglists]) 98 | print("No. intervals:", n_intervals) 99 | 100 | # assert False 101 | # print("Reading:", input_npz_fn) 102 | # features_dict = np.load(input_npz_fn) 103 | # seglist_fn = path.join( 104 | # "data", language, "search.seglist." + segtag + ".pkl" 105 | # ) 106 | # print("Reading:", seglist_fn) 107 | # with open(seglist_fn, "rb") as f: 108 | # seglist_dict = pickle.load(f) 109 | # utterances = sorted(features_dict.keys()) 110 | # input_sequences = [features_dict[i] for i in utterances] 111 | # seglists = [seglist_dict[i] for i in utterances] 112 | # print("No. utterances:", len(input_sequences)) 113 | # n_intervals = sum([len(i) for i in seglists]) 114 | # print("No. intervals:", n_intervals) 115 | 116 | # if "cnn" in options_dict["script"]: 117 | # assert False, "to-do" 118 | # else: # rnn 119 | 120 | # print("No. utterances:", len(input_sequences)) 121 | # n_intervals = sum([len(i) for i in seglists]) 122 | # print("No. intervals:", n_intervals) 123 | 124 | 125 | # # Load data 126 | # npz_fn = path.join("data", language, subset + ".npz") 127 | # x_data, labels, lengths, keys, speakers = data_io.load_data_from_npz( 128 | # npz_fn 129 | # ) 130 | 131 | 132 | if "cnn" in options_dict["script"]: 133 | 134 | assert False, "to-do" 135 | 136 | # Pad and flatten data 137 | x_data, _ = data_io.pad_sequences( 138 | x_data, options_dict["max_length"], True 139 | ) 140 | x_data = np.transpose(x_data, (0, 2, 1)) 141 | x_data = x_data.reshape((-1, options_dict["d_in"])) 142 | 143 | # Build model 144 | x = tf.placeholder(TF_DTYPE, [None, options_dict["d_in"]]) 145 | model = build_model(x, None, options_dict) 146 | 147 | # Embed data 148 | batch_iterator = batching.LabelledIterator( 149 | x_data, None, x_data.shape[0], False 150 | ) 151 | saver = tf.train.Saver() 152 | with tf.Session() as session: 153 | saver.restore(session, model_fn) 154 | for batch_x in batch_iterator: 155 | np_z = session.run( 156 | [model["encoding"]], feed_dict={x: batch_x})[0] 157 | break # single batch 158 | 159 | else: # rnn 160 | 161 | # Truncate and limit dimensionality 162 | data_io.trunc_and_limit_dim( 163 | x_data, lengths, options_dict["n_input"], None 164 | ) 165 | 166 | class DenseBatchFeedIterator(object): 167 | 168 | def __init__(self, input_sequences, seglists): 169 | self.input_sequences = input_sequences 170 | self.n_input = self.input_sequences[0].shape[-1] 171 | self.seglists = seglists 172 | 173 | def __iter__(self): 174 | for i_utt in range(len(self.input_sequences)): 175 | 176 | # Get intervals 177 | seglist = self.seglists[i_utt] 178 | input_sequence = self.input_sequences[i_utt] 179 | 180 | # Get segments for intervals 181 | segments = [] 182 | for i, j in seglist: 183 | segments.append(input_sequence[i:j, :]) 184 | 185 | batch_x_lengths = [i.shape[0] for 
i in segments] 186 | 187 | # Pad to maximum length in batch 188 | batch_x_padded = np.zeros( 189 | (len(batch_x_lengths), np.max(batch_x_lengths), 190 | self.n_input), dtype=NP_DTYPE 191 | ) 192 | for i, length in enumerate(batch_x_lengths): 193 | seq = segments[i] 194 | batch_x_padded[i, :length, :] = seq 195 | 196 | yield (batch_x_padded, batch_x_lengths) 197 | 198 | batch_iterator = DenseBatchFeedIterator(x_data, seglists) 199 | 200 | # Build model 201 | x = tf.placeholder(TF_DTYPE, [None, None, options_dict["n_input"]]) 202 | x_lengths = tf.placeholder(TF_ITYPE, [None]) 203 | model = build_model(x, x_lengths, options_dict) 204 | 205 | # Embed data 206 | # batch_iterator = batching.SimpleIterator(x_data, len(x_data), False) 207 | saver = tf.train.Saver() 208 | n_outputs = 0 209 | embed_dict = {} 210 | with tf.Session() as session: 211 | saver.restore(session, model_fn) 212 | # print(datetime.now()) 213 | print( 214 | "Applying model to segments ({} iterations):".format( 215 | len(x_data)) 216 | ) 217 | for i_batch, (batch_x_padded, batch_x_lengths) in \ 218 | tqdm(enumerate(batch_iterator)): 219 | cur_output = session.run( 220 | [model["encoding"]], feed_dict={x: batch_x_padded, 221 | x_lengths: batch_x_lengths} 222 | )[0] 223 | utt_key = keys[i_batch] 224 | seglist = seglists[i_batch] 225 | embeddings = [] 226 | for i in range(cur_output.shape[0]): 227 | embeddings.append(cur_output[i, :]) 228 | n_outputs += 1 229 | embed_dict[utt_key] = np.array(embeddings) 230 | # print(datetime.now()) 231 | 232 | # for batch_x_padded, batch_x_lengths in batch_iterator: 233 | # np_x = batch_x_padded 234 | # np_x_lengths = batch_x_lengths 235 | # np_z = session.run( 236 | # [model["encoding"]], feed_dict={x: np_x, x_lengths: 237 | # np_x_lengths} 238 | # )[0] 239 | # break # single batch 240 | 241 | print("Processed {} out of {} inputs".format(n_outputs, n_intervals)) 242 | 243 | return embed_dict 244 | 245 | 246 | #-----------------------------------------------------------------------------# 247 | # UTILITY FUNCTIONS # 248 | #-----------------------------------------------------------------------------# 249 | 250 | def check_argv(): 251 | """Check the command line arguments.""" 252 | parser = argparse.ArgumentParser( 253 | description=__doc__.strip().split("\n")[0], add_help=False 254 | ) 255 | parser.add_argument("model_fn", type=str, help="model checkpoint filename") 256 | parser.add_argument( 257 | "language", type=str, help="GlobalPhone language", 258 | choices=["HA"] 259 | ) 260 | parser.add_argument( 261 | "subset", type=str, help="subset to apply model to", 262 | choices=["search.0", "search.1", "search.test"] 263 | ) 264 | parser.add_argument( 265 | "--segtag", type=str, 266 | help="a tag to identify the dense segments lists " 267 | "(default: %(default)s)", default="min_20.max_60.step_3" 268 | ) 269 | if len(sys.argv) == 1: 270 | parser.print_help() 271 | sys.exit(1) 272 | return parser.parse_args() 273 | 274 | 275 | #-----------------------------------------------------------------------------# 276 | # MAIN FUNCTION # 277 | #-----------------------------------------------------------------------------# 278 | 279 | def main(): 280 | args = check_argv() 281 | 282 | # Do not output TensorFlow info and warning messages 283 | import warnings 284 | warnings.filterwarnings("ignore") 285 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" 286 | tf.logging.set_verbosity(tf.logging.ERROR) 287 | if type(tf.contrib) != type(tf): 288 | tf.contrib._warning = None 289 | 290 | # Embed data 291 | embed_dict = 
apply_model( 292 | args.model_fn, args.language, args.subset, args.segtag 293 | ) 294 | 295 | # Save embeddings 296 | model_dir, model_fn = path.split(args.model_fn) 297 | model_key = path.split(path.normpath(model_dir))[1] 298 | output_dir = path.join("exp", args.language, model_key + "." + args.segtag) 299 | if not path.isdir(output_dir): 300 | os.makedirs(output_dir) 301 | npz_fn = path.join(output_dir, args.subset + ".npz") 302 | print("Writing:", npz_fn) 303 | np.savez_compressed(npz_fn, **embed_dict) 304 | print(datetime.now()) 305 | 306 | 307 | if __name__ == "__main__": 308 | main() 309 | -------------------------------------------------------------------------------- /blackbox/extract_analysis_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Extract MFCC features for a GlobalPhone language for further analysis. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from collections import Counter 12 | from os import path 13 | import argparse 14 | import codecs 15 | import numpy as np 16 | import os 17 | import random 18 | import sys 19 | 20 | sys.path.append("..") 21 | sys.path.append(path.join("..", "features")) 22 | 23 | from paths import gp_alignments_dir 24 | import utils 25 | 26 | 27 | #-----------------------------------------------------------------------------# 28 | # UTILITY FUNCTIONS # 29 | #-----------------------------------------------------------------------------# 30 | 31 | def check_argv(): 32 | """Check the command line arguments.""" 33 | parser = argparse.ArgumentParser( 34 | description=__doc__.strip().split("\n")[0], add_help=False 35 | ) 36 | parser.add_argument( 37 | "language", type=str, help="GlobalPhone language", 38 | choices=["BG", "CH", "CR", "CZ", "FR", "GE", "HA", "KO", "PL", "PO", 39 | "RU", "SP", "SW", "TH", "TU", "VN"] 40 | ) 41 | parser.add_argument( 42 | "--analyse", action="store_true", 43 | help="intermediate list analysis", default=False 44 | ) 45 | if len(sys.argv) == 1: 46 | parser.print_help() 47 | sys.exit(1) 48 | return parser.parse_args() 49 | 50 | 51 | def read_fa(fa_fn): 52 | """ 53 | Return a dict of list of (start_time, end_time, label) with utterance keys. 54 | """ 55 | fa_dict = {} 56 | with codecs.open(fa_fn) as f: 57 | for line in f: 58 | utt_key, _, start, duration, label = line.strip().split() 59 | start = float(start) 60 | duration = float(duration) 61 | end = start + duration 62 | if not utt_key in fa_dict: 63 | fa_dict[utt_key] = [] 64 | fa_dict[utt_key].append((start, end, label)) 65 | return fa_dict 66 | 67 | 68 | def pronunciations_from_fa(word_fa_fn, phone_fa_fn): 69 | """ 70 | Return a dict of word tokens with pronunciations using forced alignments. 71 | 72 | The dictionary keys are the word token keys and the values are lists of 73 | phone labels. 
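    Keys follow the segment-key convention used elsewhere in the repository,
    "<word>_<utt_key>_<start_frame>-<end_frame>", with the frame indices
    zero-padded to six digits.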
74 | """ 75 | 76 | # Read forced alignments 77 | # phone_fa[utt_key] is list of (start_time, end_time, phone) 78 | print("Reading:", phone_fa_fn) 79 | phone_fa = read_fa(phone_fa_fn) 80 | print("Reading:", word_fa_fn) 81 | word_fa = read_fa(word_fa_fn) 82 | 83 | # For each word 84 | pronunciations_dict = {} 85 | for utt_key in sorted(word_fa): 86 | for word_start, word_end, word in word_fa[utt_key]: 87 | 88 | if word == "": 89 | continue 90 | 91 | # Find phone sequence 92 | phone_sequence = [] 93 | for (phone_start, phone_end, phone) in phone_fa[utt_key]: 94 | if (phone_start >= word_start and phone_start < word_end and 95 | phone != "sil"): 96 | # Phone is in word 97 | phone = phone.split("_")[0] 98 | phone_sequence.append(phone) 99 | assert len(phone_sequence) != 0, "pronunciation not found" 100 | word_start_frame = int(round(word_start*100)) 101 | word_end_frame = int(round(word_end*100)) 102 | segment_key = "{}_{}_{:06d}-{:06d}".format( 103 | word, utt_key, word_start_frame, word_end_frame + 1 104 | ) 105 | pronunciations_dict[segment_key] = phone_sequence 106 | 107 | return pronunciations_dict 108 | 109 | 110 | def filter_segment_keys(segment_keys, n_min_tokens_per_type=0, 111 | n_max_tokens_per_type=np.inf, n_max_tokens=np.inf): 112 | 113 | random.seed(1) 114 | random.shuffle(segment_keys) 115 | labels = [i.split("_")[0] for i in segment_keys] 116 | 117 | # Find valid types 118 | valid_types = [] 119 | counts = Counter(labels) 120 | for key in counts: 121 | if counts[key] >= n_min_tokens_per_type: 122 | valid_types.append(key) 123 | 124 | # Filter 125 | filtered_keys = [] 126 | tokens_per_type = Counter() 127 | for i in range(len(labels)): 128 | label = labels[i] 129 | if (label in valid_types and tokens_per_type[label] <= 130 | n_max_tokens_per_type): 131 | filtered_keys.append(segment_keys[i]) 132 | tokens_per_type[label] += 1 133 | 134 | if n_max_tokens != np.inf: 135 | random.shuffle(filtered_keys) 136 | filtered_keys = filtered_keys[:n_max_tokens] 137 | 138 | return filtered_keys 139 | 140 | 141 | #-----------------------------------------------------------------------------# 142 | # MAIN FUNCTION # 143 | #-----------------------------------------------------------------------------# 144 | 145 | def main(): 146 | args = check_argv() 147 | feat_type = "mfcc" 148 | 149 | list_dir = path.join("lists", args.language) 150 | if not path.isdir(list_dir): 151 | os.makedirs(list_dir) 152 | feat_dir = path.join(feat_type, args.language) 153 | if not path.isdir(feat_dir): 154 | os.makedirs(feat_dir) 155 | 156 | # All ground truth word segments with pronunciations 157 | for subset in ["dev"]: #, "eval", "train"]: 158 | 159 | list_fn = path.join(list_dir, subset + ".all_gt_words.list") 160 | pronunciations_fn = path.join(list_dir, subset + ".prons") 161 | 162 | # Read forced alignments and obtain pronunciations 163 | word_fa_fn = path.join( 164 | gp_alignments_dir, args.language, subset + ".ctm" 165 | ) 166 | phone_fa_fn = path.join( 167 | # gp_alignments_dir, args.language, subset + ".phone.ctm" 168 | gp_alignments_dir, args.language, subset + ".phone.ipa.ctm" 169 | ) 170 | if not path.isfile(phone_fa_fn): 171 | print("Warning: IPA pronunciations not found") 172 | phone_fa_fn = path.join( 173 | gp_alignments_dir, args.language, subset + ".phone.ctm" 174 | ) 175 | pronunciations_dict = pronunciations_from_fa( 176 | word_fa_fn, phone_fa_fn 177 | ) 178 | 179 | # Write pronunciation list 180 | if not path.isfile(pronunciations_fn): 181 | print("Writing:", pronunciations_fn) 182 | with 
codecs.open(pronunciations_fn, "w", "utf-8") as f: 183 | for segment_key in sorted(pronunciations_dict): 184 | f.write( 185 | segment_key + " " + 186 | ",".join(pronunciations_dict[segment_key]) + "\n" 187 | ) 188 | else: 189 | print("Using existing file:", pronunciations_fn) 190 | 191 | # Write word list 192 | if not path.isfile(list_fn): 193 | print("Writing:", list_fn) 194 | with codecs.open(list_fn, "w", "utf-8") as f: 195 | for segment_key in sorted(pronunciations_dict): 196 | f.write(segment_key + "\n") 197 | else: 198 | print("Using existing file:", list_fn) 199 | 200 | # Write individual phone list 201 | phone_list_fn = path.join(list_dir, subset + ".phone.list") 202 | if not path.isfile(phone_list_fn): 203 | utils.filter_words( 204 | phone_fa_fn, phone_list_fn, min_frames=5, min_chars=0 205 | ) 206 | else: 207 | print("Using existing file:", phone_list_fn) 208 | 209 | # Filter phones 210 | print("Reading:", phone_list_fn) 211 | phone_segment_keys = [] 212 | with codecs.open(phone_list_fn, "r", "utf-8") as f: 213 | for line in f: 214 | phone_segment_keys.append(line.strip()) 215 | phone_filtered_keys = filter_segment_keys( 216 | phone_segment_keys, n_max_tokens=5000 217 | ) 218 | phone_filtered_list_fn = path.join( 219 | list_dir, subset + ".filter1_phone.list" 220 | ) 221 | print("Writing:", phone_filtered_list_fn) 222 | if not path.isfile(phone_filtered_list_fn): 223 | with codecs.open(phone_filtered_list_fn, "w", "utf-8") as f: 224 | for segment_key in sorted(phone_filtered_keys): 225 | f.write(segment_key + "\n") 226 | else: 227 | print("Using existing file:", phone_filtered_list_fn) 228 | 229 | # Extract phone segments from the MFCC NumPy archives 230 | input_npz_fn = path.join( 231 | "..", "features", feat_type, args.language, args.language.lower() + 232 | "." + subset + ".npz" 233 | ) 234 | output_npz_fn = path.join( 235 | feat_dir, args.language.lower() + "." + subset + 236 | ".filter1_phone.npz" 237 | ) 238 | if not path.isfile(output_npz_fn): 239 | utils.segments_from_npz( 240 | input_npz_fn, phone_filtered_list_fn, output_npz_fn 241 | ) 242 | else: 243 | print("Using existing file:", output_npz_fn) 244 | 245 | if args.analyse: 246 | import matplotlib.pyplot as plt 247 | import numpy as np 248 | 249 | # Most common words 250 | labels = [i.split("_")[0] for i in pronunciations_dict] 251 | counter = Counter(labels) 252 | print("No. word types:", len(counter)) 253 | print("No. word tokens:", len(labels)) 254 | print("Most common words:", counter.most_common(10)) 255 | 256 | # Histogram of word count 257 | counts = counter.values() 258 | plt.figure() 259 | plt.hist(counts, 50) 260 | plt.yscale("log") 261 | plt.ylabel("No. of types with this many tokens") 262 | plt.xlabel("No. of tokens") 263 | 264 | # # Temp 265 | # # Most common words 266 | # labels = [i.split("_")[0] for i in filtered_keys] 267 | # counter = Counter(labels) 268 | # print("No. word types:", len(counter)) 269 | # print("No. word tokens:", len(labels)) 270 | # print("Most common words:", counter.most_common(10)) 271 | 272 | # # Histogram of word count 273 | # counts = counter.values() 274 | # plt.figure() 275 | # plt.hist(counts, 50) 276 | # plt.yscale("log") 277 | # plt.ylabel("No. of types with this many tokens") 278 | # plt.xlabel("No. 
of tokens") 279 | 280 | plt.show() 281 | 282 | # Filter 1 283 | print("Applying filter 1") 284 | n_min_tokens_per_type = 10 285 | n_max_tokens_per_type = 25 286 | filtered_keys = filter_segment_keys( 287 | list(pronunciations_dict), n_min_tokens_per_type, 288 | n_max_tokens_per_type 289 | ) 290 | print("No. tokens:", len(filtered_keys)) 291 | print( 292 | "No. types:", len(set([i.split("_")[0] for i in filtered_keys])) 293 | ) 294 | filtered_list_fn = path.join(list_dir, subset + ".filter1_gt.list") 295 | print("Writing:", filtered_list_fn) 296 | if not path.isfile(filtered_list_fn): 297 | with codecs.open(filtered_list_fn, "w", "utf-8") as f: 298 | for segment_key in sorted(filtered_keys): 299 | f.write(segment_key + "\n") 300 | else: 301 | print("Using existing file:", filtered_list_fn) 302 | 303 | # Extract word segments from the MFCC NumPy archives 304 | input_npz_fn = path.join( 305 | "..", "features", feat_type, args.language, args.language.lower() + 306 | "." + subset + ".npz" 307 | ) 308 | output_npz_fn = path.join( 309 | feat_dir, args.language.lower() + "." + subset + ".filter1_gt.npz" 310 | ) 311 | if not path.isfile(output_npz_fn): 312 | utils.segments_from_npz( 313 | input_npz_fn, filtered_list_fn, output_npz_fn 314 | ) 315 | else: 316 | print("Using existing file:", output_npz_fn) 317 | 318 | # dev.filtered_gt_words.list 319 | 320 | if __name__ == "__main__": 321 | main() 322 | -------------------------------------------------------------------------------- /blackbox/analyse_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Analyse all pair-wise distances and compare to a number of other properties. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from os import path 12 | from scipy.spatial.distance import pdist 13 | from tqdm import tqdm 14 | import argparse 15 | import codecs 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | import random 19 | import sys 20 | 21 | sys.path.append(path.join("..", "..", "src", "speech_dtw", "utils")) 22 | 23 | from dp_align import DPEntry, DPError 24 | import dp_align 25 | import samediff 26 | 27 | 28 | #-----------------------------------------------------------------------------# 29 | # UTILITY FUNCTIONS # 30 | #-----------------------------------------------------------------------------# 31 | 32 | def check_argv(): 33 | """Check the command line arguments.""" 34 | parser = argparse.ArgumentParser( 35 | description=__doc__.strip().split("\n")[0], add_help=False 36 | ) 37 | parser.add_argument("npz_fn", type=str, help="NumPy archive of embeddings") 38 | parser.add_argument( 39 | "--pronunciation", type=str, 40 | help="if provided, the pronunciations for this GlobalPhone " 41 | "language is used", choices=["BG", "CH", "CR", "CZ", "FR", "GE", "HA", 42 | "KO", "PL", "PO", "RU", "SP", "SW", "TH", "TU", "VN"], default=None 43 | ) 44 | if len(sys.argv) == 1: 45 | parser.print_help() 46 | sys.exit(1) 47 | return parser.parse_args() 48 | 49 | 50 | def editdistance_array(labels): 51 | """ 52 | Return an array of int in the same order as the distances from 53 | `scipy.spatial.distance.pdist` indicating the edit distance between all 54 | pairs of labels. 
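For example, for labels [l1, l2, l3, l4] the returned order is (l1, l2), (l1, l3), (l1, l4), (l2, l3), (l2, l4), (l3, l4), i.e. the same condensed ordering used by scipy's pdist.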
55 | """ 56 | N = len(labels) 57 | edits = np.zeros(int(N*(N - 1)/2), dtype=int) 58 | 59 | # Calculate the edit distance for every pair of labels 60 | cur_edits_i = 0 61 | for n in tqdm(range(N - 1)): 62 | cur_label = labels[n] 63 | # distances = [] 64 | for i_offset, test_label in enumerate(labels[n + 1:]): 65 | a = dp_align.dp_align(cur_label, test_label) 66 | edits[cur_edits_i + i_offset] = a.get_levenshtein() 67 | # print( 68 | # "Distance between {} and {}: {}".format(cur_label, test_label, 69 | # a.get_levenshtein()) 70 | # ) 71 | # edits[cur_edits_i:cur_edits_i + (N - n) - 1] = distances 72 | # edits[cur_edits_i:cur_edits_i + (N - n) - 1] = np.asarray( 73 | # labels[n + 1:] 74 | # ) == cur_label 75 | cur_edits_i += N - n - 1 76 | 77 | return edits 78 | 79 | 80 | def read_pronunciations(fn): 81 | pronunciations = {} 82 | with codecs.open(fn, "r", "utf-8") as f: 83 | for line in f: 84 | utt_key, pronunciation = line.strip().split() 85 | pronunciations[utt_key] = pronunciation.split(",") 86 | return pronunciations 87 | 88 | 89 | #-----------------------------------------------------------------------------# 90 | # SPECIALISED ALIGNMENT FUNCTION # 91 | #-----------------------------------------------------------------------------# 92 | 93 | def dp_align_edit_positions(ref_list, test_list, ins_penalty=3, del_penalty=3, 94 | sub_penalty=4): 95 | """ 96 | Determines whether a edit operation occurs in the beginning, middle or end. 97 | 98 | Parameters 99 | ---------- 100 | ref_list : list 101 | test_list : list 102 | 103 | Return 104 | ------ 105 | dp_errors, edit_start, edit_middle, edit_end : DPError, (bool, bool, bool) 106 | """ 107 | 108 | # Initialise the alignment matrix 109 | dp_matrix = np.empty( 110 | [len(test_list) + 1, len(ref_list) + 1], dtype = object 111 | ) 112 | for i in range(len(test_list) + 1): 113 | for j in range(len(ref_list) + 1): 114 | dp_matrix[i][j] = DPEntry() 115 | 116 | # Initialise the origin 117 | dp_matrix[0][0].score = 0 118 | dp_matrix[0][0].align = "m" 119 | 120 | # The first row is all delections: 121 | for j in range(1, len(ref_list) + 1): 122 | dp_matrix[0][j].score = j*del_penalty 123 | dp_matrix[0][j].align = "d" 124 | 125 | # Fill dp_matrix 126 | for i in range(1, len(test_list) + 1): 127 | 128 | # First column is all insertions 129 | dp_matrix[i][0].score = i*ins_penalty 130 | dp_matrix[i][0].align = "i" 131 | 132 | for j in range(1, len(ref_list) + 1): 133 | del_score = dp_matrix[i, j - 1].score + del_penalty 134 | ins_score = dp_matrix[i - 1, j].score + ins_penalty 135 | 136 | if test_list[i - 1] == ref_list[j - 1]: 137 | 138 | # Considering a match 139 | match_score = dp_matrix[i - 1, j - 1].score 140 | 141 | # Test for a match 142 | if match_score <= del_score and match_score <= ins_score: 143 | dp_matrix[i, j].score = match_score 144 | dp_matrix[i, j].align = "m" 145 | # Test for a deletion 146 | elif del_score <= ins_score: 147 | dp_matrix[i, j].score = del_score 148 | dp_matrix[i, j].align = "d" 149 | # Test for an insertion (only option left) 150 | else: 151 | dp_matrix[i, j].score = ins_score 152 | dp_matrix[i, j].align = "i" 153 | 154 | else: 155 | 156 | # Considering a substitution 157 | sub_score = dp_matrix[i - 1, j - 1].score + sub_penalty 158 | 159 | # Test for a substitution 160 | if sub_score < del_score and sub_score <= ins_score: 161 | dp_matrix[i, j].score = sub_score 162 | dp_matrix[i, j].align = "s" 163 | # Test for a deletion 164 | elif del_score <= ins_score: 165 | dp_matrix[i, j].score = del_score 166 | dp_matrix[i, 
j].align = "d" 167 | # Test for an insertion (only option left) 168 | else: 169 | dp_matrix[i, j].score = ins_score 170 | dp_matrix[i, j].align = "i" 171 | 172 | # Perform alignment by tracking through the dp_matrix 173 | dp_errors = DPError() 174 | dp_errors.n_total = len(ref_list) 175 | i = len(test_list) 176 | j = len(ref_list) 177 | edit_start = False 178 | edit_end = False 179 | edit_middle = False 180 | while i > 0 or j > 0: 181 | if dp_matrix[i, j].align == "m": 182 | i -= 1 183 | j -= 1 184 | dp_errors.n_match += 1 185 | elif dp_matrix[i, j].align == "s": 186 | if i == len(test_list) and j == len(ref_list): 187 | edit_end = True 188 | elif i == 1 and j == 1: 189 | edit_start = True 190 | else: 191 | edit_middle = True 192 | i -= 1 193 | j -= 1 194 | dp_errors.n_sub += 1 195 | elif dp_matrix[i, j].align == "d": 196 | if i == len(test_list) and j == len(ref_list): 197 | edit_end = True 198 | elif i == 0 and j == 1: 199 | edit_start = True 200 | else: 201 | edit_middle = True 202 | j -= 1 203 | dp_errors.n_del += 1 204 | elif dp_matrix[i, j].align == "i": 205 | if i == len(test_list) and j == len(ref_list): 206 | edit_end = True 207 | elif i == 1 and j == 0: 208 | edit_start = True 209 | else: 210 | edit_middle = True 211 | i -= 1 212 | dp_errors.n_ins += 1 213 | 214 | # Return the alignment and edit positions 215 | return dp_errors, edit_start, edit_middle, edit_end 216 | 217 | 218 | #-----------------------------------------------------------------------------# 219 | # MAIN FUNCTION # 220 | #-----------------------------------------------------------------------------# 221 | 222 | def main(): 223 | args = check_argv() 224 | 225 | print("Reading:", args.npz_fn) 226 | embeddings = np.load(args.npz_fn) 227 | 228 | # # Temp 229 | # data = {} 230 | # a = list(embeddings) 231 | # random.shuffle(a) 232 | # for key in a[:100]: 233 | # data[key] = embeddings[key] 234 | # embeddings = data 235 | 236 | print("Ordering embeddings:") 237 | n_embeds = 0 238 | X = [] 239 | utt_keys = [] 240 | labels = [] 241 | speakers = [] 242 | for utt_key in tqdm(sorted(embeddings)): 243 | utt_keys.append(utt_key) 244 | X.append(embeddings[utt_key]) 245 | utt_key = utt_key.split("_") 246 | label = utt_key[0] 247 | speaker = utt_key[1] 248 | labels.append(label) 249 | speakers.append(speaker) 250 | X = np.array(X) 251 | print("No. embeddings:", X.shape[0]) 252 | print("Embedding dimensionality:", X.shape[1]) 253 | 254 | # Normalise 255 | normed = (X - X.mean(axis=0)) / X.std(axis=0) 256 | X = normed 257 | 258 | print("Calculating distances") 259 | distances = pdist(X, metric="cosine") 260 | 261 | # Plot: Matching words 262 | print("Getting word matches") 263 | word_matches = samediff.generate_matches_array(labels) 264 | print("Total no. pairs:", word_matches.shape[0]) 265 | print("No. 
same-word pairs:", sum(word_matches)) 266 | distances_pos_avg = np.mean(distances[word_matches == True]) 267 | distances_neg_avg = np.mean(distances[word_matches == False]) 268 | distances_pos_std = np.std(distances[word_matches == True]) 269 | distances_neg_std = np.std(distances[word_matches == False]) 270 | plt.figure() 271 | plt.bar( 272 | [0, 1], [distances_neg_avg, distances_pos_avg], 273 | yerr=[distances_neg_std, distances_pos_std] 274 | ) 275 | plt.xticks([0, 1], ("No", "Yes")) 276 | plt.xlabel("Matching words") 277 | plt.ylabel("Cosine distance") 278 | plt.ylim([0, 1.2]) 279 | 280 | # Plot: Same speakers 281 | print("Getting speaker matches") 282 | speaker_matches = samediff.generate_matches_array(speakers) 283 | print("No. same-speaker pairs:", sum(speaker_matches)) 284 | distances_pos_avg = np.mean( 285 | distances[np.logical_and(word_matches, speaker_matches)] 286 | ) 287 | distances_neg_avg = np.mean( 288 | distances[np.logical_and(word_matches, speaker_matches == False)] 289 | ) 290 | distances_pos_std = np.std( 291 | distances[np.logical_and(word_matches, speaker_matches)] 292 | ) 293 | distances_neg_std = np.std( 294 | distances[np.logical_and(word_matches, speaker_matches == False)] 295 | ) 296 | # distances_pos_avg = np.mean(distances[speaker_matches == True]) 297 | # distances_neg_avg = np.mean(distances[speaker_matches == False]) 298 | # distances_pos_std = np.std(distances[speaker_matches == True]) 299 | # distances_neg_std = np.std(distances[speaker_matches == False]) 300 | plt.figure() 301 | plt.bar( 302 | [0, 1], [distances_neg_avg, distances_pos_avg], 303 | yerr=[distances_neg_std, distances_pos_std] 304 | ) 305 | plt.xticks([0, 1], ("No", "Yes")) 306 | plt.xlabel("Matching speakers") 307 | plt.ylabel("Cosine distance") 308 | plt.ylim([0, 1.2]) 309 | plt.title("Distances between same-word pairs") 310 | 311 | # Plot: Edit distances 312 | if args.pronunciation is not None: 313 | 314 | # Pronunciations 315 | pron_fn = path.join("lists", args.pronunciation, "dev.prons") 316 | print("Reading:", pron_fn) 317 | pronunciations = read_pronunciations(pron_fn) 318 | pron_labels = [] 319 | for utt_key in utt_keys: 320 | pron_labels.append(pronunciations[utt_key]) 321 | 322 | # Get distances 323 | print("Getting edit distances:") 324 | # edit_distances = editdistance_array(labels) 325 | edit_distances = editdistance_array(pron_labels) 326 | 327 | # Plot distances 328 | edits = sorted(set(edit_distances)) 329 | averages = [] 330 | stds = [] 331 | for edit in edits: 332 | averages.append(np.mean(distances[edit_distances == edit])) 333 | stds.append(np.std(distances[edit_distances == edit])) 334 | plt.figure() 335 | plt.bar(edits, averages, yerr=stds) 336 | plt.ylim([0, 1.2]) 337 | plt.xlabel("Phone edit distance") 338 | plt.ylabel("Cosine distance") 339 | 340 | plt.show() 341 | 342 | 343 | if __name__ == "__main__": 344 | main() 345 | -------------------------------------------------------------------------------- /qbe/eval_qbe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Evaluate QbE performance for a given costs directory. 
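The cost dictionary (a pickled dict of dicts) maps each query segment key to a cost per search utterance; lower costs indicate closer matches.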
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from collections import Counter 12 | from os import path 13 | from scipy.interpolate import interp1d 14 | from scipy.optimize import brentq 15 | from tqdm import tqdm 16 | import argparse 17 | import codecs 18 | import pickle 19 | import numpy as np 20 | import sklearn.metrics as metrics 21 | import sys 22 | 23 | sys.path.append("..") 24 | 25 | from paths import gp_alignments_dir 26 | 27 | 28 | #-----------------------------------------------------------------------------# 29 | # UTILITY FUNCTIONS # 30 | #-----------------------------------------------------------------------------# 31 | 32 | def check_argv(): 33 | """Check the command line arguments.""" 34 | parser = argparse.ArgumentParser( 35 | description=__doc__.strip().split("\n")[0], add_help=False 36 | ) 37 | parser.add_argument( 38 | "language", type=str, help="GlobalPhone language", 39 | choices=["HA"] 40 | ) 41 | parser.add_argument( 42 | "cost_dict_fn", type=str, 43 | help="filename of the cost dictionary" 44 | ) 45 | if len(sys.argv) == 1: 46 | parser.print_help() 47 | sys.exit(1) 48 | return parser.parse_args() 49 | 50 | 51 | 52 | #-----------------------------------------------------------------------------# 53 | # EVALUATION FUNCTIONS # 54 | #-----------------------------------------------------------------------------# 55 | 56 | def calculate_eer(y_true, y_score): 57 | # https://yangcha.github.io/EER-ROC/ 58 | fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) 59 | eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 60 | thresh = interp1d(fpr, thresholds)(eer) 61 | return eer 62 | 63 | 64 | def calculate_auc(y_true, y_score): 65 | fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) 66 | return metrics.auc(fpr, tpr) 67 | 68 | 69 | def eval_precision_recall_fscore(cost_dict, label_dict, threshold, 70 | analyse=False): 71 | """Evaluate precision and recall for a particular output.""" 72 | 73 | # # Get average scores 74 | # avg_keyword_scores = {} 75 | # for keyword in cost_dict: 76 | # scores = [] 77 | # for utt in cost_dict[keyword]: 78 | # scores.append(cost_dict[keyword][utt]) 79 | # avg_keyword_scores[keyword] = np.mean(scores) 80 | # print(avg_keyword_scores) 81 | 82 | # For each utterance, which keywords above threshold 83 | threshold_dict = {} 84 | for keyword in cost_dict: 85 | for utt in cost_dict[keyword]: 86 | if utt not in threshold_dict: 87 | threshold_dict[utt] = [] 88 | if cost_dict[keyword][utt] <= threshold: 89 | # if (cost_dict[keyword][utt] <= 90 | # avg_keyword_scores[keyword]*threshold): 91 | threshold_dict[utt].append(keyword) 92 | keywords = cost_dict.keys() 93 | 94 | # Calculate precision and recall 95 | n_tp = 0 96 | n_pred = 0 97 | n_true = 0 98 | word_tokens_correct = [] 99 | if analyse: 100 | print() 101 | for utt in sorted(threshold_dict): 102 | if utt not in label_dict: 103 | continue 104 | y_pred = threshold_dict[utt] 105 | y_true = [i for i in label_dict[utt].split() if i in keywords] 106 | cur_tokens_correct = set([i for i in y_true if i in y_pred]) 107 | word_tokens_correct.extend(cur_tokens_correct) 108 | n_tp += len(cur_tokens_correct) 109 | n_pred += len(y_pred) 110 | n_true += len(set(y_true)) 111 | if analyse: 112 | if len(y_pred) > 0: 113 | print("-"*79) 114 | print("Utterance:", utt) 115 | print("Predicted:", sorted(y_pred)) 116 | print("Ground truth:", y_true) 117 | if n_pred > 0: 118 | print( 119 | "Current precision: {} / {} = {:.4f}".format( n_tp, 120 | 
n_pred, float(n_tp)/n_pred*100.) 121 | ) 122 | if n_true > 0: 123 | print( 124 | "Current recall: {} / {} = {:.4f}".format( 125 | n_tp, n_true, float(n_tp)/n_true*100.) 126 | ) 127 | precision = float(n_tp)/n_pred if n_pred != 0 else 0 128 | recall = float(n_tp)/n_true 129 | f_score = ( 130 | 2*precision*recall/(precision + recall) if precision + recall != 0 else 131 | 0 132 | ) 133 | 134 | if analyse: 135 | print("-"*79) 136 | print 137 | print( 138 | "Most common correctly predicted words:", 139 | Counter(word_tokens_correct).most_common(15) 140 | ) 141 | 142 | return n_tp, n_pred, n_true, precision, recall, f_score 143 | 144 | 145 | def eval_qbe(cost_dict, label_dict, analyse=False): 146 | """ 147 | Return dictionaries of P@10, P@N and EER for each query item. 148 | 149 | The keys of each of the returned dictionaries are the unique keyword types, 150 | with the value a list of the scores for each of the queries of that keyword 151 | type. 152 | """ 153 | 154 | # Unique keywords with query keys 155 | keyword_dict = {} 156 | for query_key in cost_dict: 157 | keyword = query_key.split("_")[0] 158 | if keyword not in keyword_dict: 159 | keyword_dict[keyword] = [] 160 | keyword_dict[keyword].append(query_key) 161 | 162 | # For each keywords 163 | eer_dict = {} # `eer_dict[keyword]` is a list of EER scores for each query 164 | # of that keyword type 165 | auc_dict = {} 166 | p_at_10_dict = {} 167 | p_at_n_dict = {} 168 | if analyse: 169 | print() 170 | for keyword in tqdm(sorted(keyword_dict)): 171 | 172 | eer_dict[keyword] = [] 173 | auc_dict[keyword] = [] 174 | p_at_10_dict[keyword] = [] 175 | p_at_n_dict[keyword] = [] 176 | 177 | # For each query key 178 | for query_key in sorted(keyword_dict[keyword]): 179 | 180 | # Rank search keys 181 | utt_order = [ 182 | utt_key for utt_key in sorted(cost_dict[query_key], 183 | key=cost_dict[query_key].get) if utt_key in label_dict 184 | ] 185 | 186 | # EER 187 | y_true = [] 188 | for utt_key in utt_order: 189 | if keyword in label_dict[utt_key]: 190 | y_true.append(1) 191 | else: 192 | y_true.append(0) 193 | y_score = [cost_dict[query_key][utt_key] for utt_key in utt_order] 194 | cur_eer = calculate_eer(y_true, [-i for i in y_score]) 195 | cur_auc = calculate_auc(y_true, [-i for i in y_score]) 196 | eer_dict[keyword].append(cur_eer) 197 | auc_dict[keyword].append(cur_auc) 198 | 199 | # P@10 200 | cur_p_at_10 = float(sum(y_true[:10]))/10. 201 | p_at_10_dict[keyword].append(cur_p_at_10) 202 | 203 | # P@N 204 | cur_p_at_n = np.float64(sum(y_true[:sum(y_true)]))/sum(y_true) 205 | p_at_n_dict[keyword].append(cur_p_at_n) 206 | 207 | if analyse: 208 | print("-"*79) 209 | print("Query:", query_key) 210 | print("Current P@10: {:.4f}".format(cur_p_at_10)) 211 | print("Current P@N: {:.4f}".format(cur_p_at_n)) 212 | print("Current EER: {:.4f}".format(cur_eer)) 213 | print("Current AUC: {:.4f}".format(cur_auc)) 214 | # print("Top 10 utterances: ", utt_order[:10]) 215 | print("Top 10 utterances:") 216 | for i_utt, utt in enumerate(utt_order[:10]): 217 | print("{}: {}".format( 218 | # utt, " ".join(label_dict[utt])), end='' 219 | utt, label_dict[utt]), end='' 220 | ) 221 | if y_true[i_utt] == 0: 222 | print(" *") 223 | else: 224 | print() 225 | 226 | if analyse: 227 | print("-"*79) 228 | print() 229 | 230 | return eer_dict, auc_dict, p_at_10_dict, p_at_n_dict 231 | 232 | 233 | def get_avg_scores(score_dict): 234 | """ 235 | Return the overall average, and unweighted average, median and maximum 236 | scores over all keyword types. 
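The unweighted average of the per-keyword minimum scores is also computed and returned as a final avg_min_scores value.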
237 | 238 | Return 239 | ------ 240 | avg_all_scores, avg_avg_scores, avg_median_scores, avg_max_scores 241 | """ 242 | all_scores = [] 243 | avg_scores = [] 244 | median_scores = [] 245 | max_scores = [] 246 | min_scores = [] 247 | 248 | for keyword in score_dict: 249 | all_scores.extend(score_dict[keyword]) 250 | avg_scores.append(np.mean(score_dict[keyword])) 251 | median_scores.append(np.median(score_dict[keyword])) 252 | max_scores.append(np.max(score_dict[keyword])) 253 | min_scores.append(np.min(score_dict[keyword])) 254 | 255 | avg_all_scores = np.mean(all_scores) 256 | avg_avg_scores = np.mean(avg_scores) 257 | avg_median_scores = np.mean(median_scores) 258 | avg_max_scores = np.mean(max_scores) 259 | avg_min_scores = np.mean(min_scores) 260 | 261 | return ( 262 | avg_all_scores, avg_avg_scores, avg_median_scores, avg_max_scores, 263 | avg_min_scores 264 | ) 265 | 266 | def read_forced_alignment(globalphone_fa_fn): 267 | """ 268 | Return a dictionary of transcriptions obtained from a GlobalPhone forced 269 | alignment file. 270 | """ 271 | transcription_dict = {} 272 | with codecs.open(globalphone_fa_fn, "r", "utf-8") as f: 273 | for line in f: 274 | line = line.strip().split(" ") 275 | utterance_key = line[0] 276 | label = line[4].lower() 277 | if utterance_key not in transcription_dict: 278 | transcription_dict[utterance_key] = label 279 | # transcription_dict[utterance_key] = [] 280 | else: 281 | transcription_dict[utterance_key] += " " + label 282 | # transcription_dict[utterance_key].append(label) 283 | return transcription_dict 284 | 285 | 286 | #-----------------------------------------------------------------------------# 287 | # MAIN FUNCTION # 288 | #-----------------------------------------------------------------------------# 289 | 290 | def main(): 291 | args = check_argv() 292 | 293 | fn = path.join(args.cost_dict_fn) 294 | print("Reading:", fn) 295 | with open(fn, "rb") as f: 296 | cost_dict = pickle.load(f) 297 | print( 298 | "Keywords: " + ", ".join(sorted(set([i.split("_")[0] for i in 299 | cost_dict.keys()]))) 300 | ) 301 | 302 | globalphone_fa_fn = path.join(gp_alignments_dir, args.language, "eval.ctm") 303 | print("Reading:", globalphone_fa_fn) 304 | transcription_dict = read_forced_alignment(globalphone_fa_fn) 305 | # print(transcription_dict) 306 | 307 | print("Evaluating:") 308 | eer_dict, auc_dict, p_at_10_dict, p_at_n_dict = eval_qbe( 309 | cost_dict, transcription_dict 310 | ) 311 | 312 | eer_overall, eer_avg, eer_median, eer_max, eer_min = get_avg_scores( 313 | eer_dict 314 | ) 315 | auc_overall, auc_avg, auc_median, auc_max, auc_min = get_avg_scores( 316 | auc_dict 317 | ) 318 | p_at_10_overall, p_at_10_avg, p_at_10_median, p_at_10_max, p_at_10_min = ( 319 | get_avg_scores(p_at_10_dict) 320 | ) 321 | p_at_n_overall, p_at_n_avg, p_at_n_median, p_at_n_max, p_at_n_min = ( 322 | get_avg_scores(p_at_n_dict) 323 | ) 324 | 325 | print() 326 | print("-"*79) 327 | print( 328 | "EER: {:.4f}, avg: {:.4f}, median: {:.4f}, max: {:.4f}, " 329 | "min: {:.4f}".format(eer_overall, eer_avg, eer_median, eer_max, 330 | eer_min) 331 | ) 332 | print( 333 | "AUC: {:.4f}, avg: {:.4f}, median: {:.4f}, max: {:.4f}, " 334 | "min: {:.4f}".format(auc_overall, auc_avg, auc_median, auc_max, 335 | auc_min) 336 | ) 337 | print( 338 | "P@10: {:.4f}, avg: {:.4f}, median: {:.4f}, max: {:.4f}, " 339 | "min: {:.4f}".format(p_at_10_overall, p_at_10_avg, p_at_10_median, 340 | p_at_10_max, p_at_10_min) 341 | ) 342 | print( 343 | "P@N: {:.4f}, avg: {:.4f}, median: {:.4f}, max: {:.4f}, " 344 | 
"min: {:.4f}".format(p_at_n_overall, p_at_n_avg, p_at_n_median, 345 | p_at_n_max, p_at_n_min) 346 | ) 347 | print("-"*79) 348 | 349 | 350 | if __name__ == "__main__": 351 | main() 352 | --------------------------------------------------------------------------------