├── Makefile ├── environment.yml ├── .gitignore ├── data ├── breadcrumbs.md ├── HA │ └── keywords.txt ├── eval_spk.list ├── dev_spk.list └── train_spk.list ├── paths.py ├── install_local.sh ├── samediff ├── readme.md ├── get_npz_keys.py ├── create_labels.py ├── create_speakers.py ├── run_samediff.sh ├── run_local.py └── run_calcdists.sh ├── downsample ├── readme.md └── downsample.py ├── src ├── print_dict.py └── plotting.py ├── qbe ├── combine_model_output.py ├── .ipynb_checkpoints │ └── sandbox-checkpoint.ipynb ├── readme.md ├── extract_queries_link_search.py ├── dense_seg_mvn.py ├── get_dtw_costs.py ├── data_prep_dense_seg.py ├── get_dense_seg_costs.py ├── sandbox.ipynb ├── apply_model_dense.py └── eval_qbe.py ├── blackbox ├── npz_to_tsv.py ├── readme.md ├── logreg_speaker.py ├── hierarchical_clustering.py ├── logreg_pronlength.py ├── dp_align.py ├── extract_analysis_features.py └── analyse_pairs.py ├── embeddings ├── readme.md ├── eval_samediff.py ├── link_mfcc.py ├── apply_model_to_npz.py ├── data_io.py ├── apply_model.py └── analyse_embeds.py ├── readme.md ├── notebooks └── sandbox_splitnet.ipynb └── features ├── analyse_utd_pairs.py ├── utils.py └── features.py /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | nosetests -v 3 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: tf1.13 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.7 7 | - numpy 8 | - tensorflow-gpu=1.13.1 9 | - tqdm 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !*/ 3 | !.* 4 | !*.py 5 | !*.pyx 6 | !*.md 7 | !*.sh 8 | !*.yml 9 | !*.ipynb 10 | !*.conf 11 | !Makefile 12 | !docker/Dockerfile* 13 | !data/* 14 | !data/*/* 15 | notebooks/.ipynb_checkpoints/* 16 | -------------------------------------------------------------------------------- /data/breadcrumbs.md: -------------------------------------------------------------------------------- 1 | Breadcrumbs 2 | ----------- 3 | - Wordpairs obtained from `/disk/scratch/s1680167/zero/data/word_pairs/`. 4 | - Speaker lists obtained from the GlobalPhone Kaldi recipe; the training list 5 | was generated. 
6 | -------------------------------------------------------------------------------- /paths.py: -------------------------------------------------------------------------------- 1 | # gp_data_dir = "/group/corporapublic/global_phone/" 2 | # gp_alignments_dir = "/disk/scratch/v1hkampe/endgame/datasets/globalphone_alignments/" 3 | gp_data_dir = "/home/kamperh/endgame/datasets/globalphone/" 4 | gp_alignments_dir = "/home/kamperh/endgame/datasets/globalphone_alignments/" 5 | -------------------------------------------------------------------------------- /data/HA/keywords.txt: -------------------------------------------------------------------------------- 1 | amfani 2 | amurka 3 | arziki 4 | babban 5 | bayan 6 | bayyana 7 | birnin 8 | daban 9 | daular 10 | domin 11 | duniya 12 | hankali 13 | hanyar 14 | harkokin 15 | kasar 16 | kasashe 17 | kasashen 18 | lokacin 19 | majalisar 20 | mutane 21 | samun 22 | sarki 23 | sosai 24 | tattalin 25 | tsakanin 26 | wajen 27 | wanda 28 | wannan 29 | zaman 30 | zamanin 31 | -------------------------------------------------------------------------------- /install_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | if [ ! -d ../src ]; then 5 | mkdir ../src/ 6 | fi 7 | cd ../src/ 8 | 9 | # Install speech_dtw 10 | if [ ! -d speech_dtw ]; then 11 | git clone https://github.com/kamperh/speech_dtw.git 12 | cd speech_dtw 13 | make 14 | make test 15 | cd - 16 | fi 17 | 18 | # Install shorten 19 | if [ ! -d shorten-3.6.1 ]; then 20 | wget https://download.tuxfamily.org/xcfaudio/PROG_ABS_FRUGALWARE/SHORTEN/shorten-3.6.1.tar.gz 21 | # wget http://etree.org/shnutils/shorten/dist/src/shorten-3.6.1.tar.gz 22 | tar -zxf shorten-3.6.1.tar.gz 23 | cd shorten-3.6.1 24 | ./configure --prefix=`pwd` 25 | make 26 | make install 27 | fi 28 | 29 | set +e 30 | -------------------------------------------------------------------------------- /data/eval_spk.list: -------------------------------------------------------------------------------- 1 | AR TBA 2 | BG 040 059 063 068 095 109 110 3 | CR 037 038 039 040 041 042 043 044 045 047 4 | CZ 084 086 088 090 092 094 096 098 100 102 5 | FR 091 092 093 094 095 096 097 098 6 | GE 018 020 021 026 029 073 7 | HA 002 014 025 028 030 052 053 062 070 088 8 | JA XXX 9 | KO 019 029 032 042 051 064 069 080 082 088 10 | CH 080 081 082 083 084 085 086 087 088 089 11 | PO 135 137 138 139 142 143 312 12 | PL 050 001 031 043 023 004 098 009 044 033 13 | RU 002 027 036 063 069 092 102 104 109 112 14 | WU TBA 15 | SP 011 012 013 014 015 016 017 018 16 | SW 040 041 042 043 044 060 061 062 063 064 17 | TH 101 102 103 104 105 106 107 108 18 | TA TBA 19 | TU 025 030 031 032 037 039 041 046 056 063 20 | VN 092 094 096 098 102 103 106 110 113 21 | -------------------------------------------------------------------------------- /data/dev_spk.list: -------------------------------------------------------------------------------- 1 | AR TBA 2 | BG 051 055 058 084 090 100 106 3 | CR 033 034 035 036 046 048 051 053 054 057 4 | CZ 083 085 087 089 091 093 095 097 099 101 5 | FR 082 083 084 085 086 087 088 089 6 | GE 001 002 003 004 008 010 7 | HA 018 031 034 038 046 047 050 055 058 072 8 | JA XXX 9 | KO 006 012 025 040 045 061 084 086 091 098 10 | CH 028 029 030 031 032 039 040 041 042 043 044 11 | PO 064 072 102 103 104 132 133 134 12 | PL 097 046 041 005 012 063 040 030 090 011 13 | RU 005 033 042 065 078 097 103 106 110 122 14 | WU TBA 15 | SP 001 002 003 004 005 006 007 008 009 010 16 | SW 
045 046 047 048 049 066 067 068 069 17 | TH 023 025 028 037 045 061 073 085 18 | TA TBA 19 | TU 001 002 003 005 006 008 013 014 015 016 019 20 | VN 200 201 202 203 204 205 206 207 208 21 | -------------------------------------------------------------------------------- /samediff/readme.md: -------------------------------------------------------------------------------- 1 | Same-Different Evaluation 2 | ========================= 3 | 4 | Overview 5 | -------- 6 | Performs same-different evaluation on frame-level features using dynamic time 7 | warping (DTW) alignment. 8 | 9 | 10 | Evaluation 11 | ---------- 12 | This needs to be run on a multi-core machine. Change the `n_cpus` variable in 13 | `run_calcdists.sh` and `run_samediff.sh` to the number of CPUs on the machine. 14 | 15 | As an example, to evaluate the Spanish development MFCCs: 16 | 17 | ./run_calcdists.sh ../features/mfcc/KO/ko.dev.gt_words.npz # finish first 18 | ./run_samediff.sh ../features/mfcc/KO/ko.dev.gt_words.npz 19 | 20 | 21 | Results 22 | ------- 23 | *(Deprecated)* SWDP average precision: 24 | 25 | - CH dev: 0.15380600 26 | - CR dev: 0.13270483 27 | - HA dev: 0.21368697 28 | - SP dev: 0.19288643 29 | - SW dev: 0.10928384 30 | - TU dev: 0.18624635 31 | 32 | - GE dev: 0.22482616 33 | - KO dev: 0.15748395 34 | 35 | - SP eval: 0.29650854 36 | -------------------------------------------------------------------------------- /downsample/readme.md: -------------------------------------------------------------------------------- 1 | Downsampled Acoustic Word Embeddings 2 | ==================================== 3 | 4 | Overview 5 | -------- 6 | MFCCs are downsampled to obtain acoustic word embeddings. These are evaluated 7 | using same-different evaluation. 8 | 9 | 10 | Downsampling 11 | ------------ 12 | Perform downsampling on MFCCs without deltas: 13 | 14 | mkdir -p exp/SP 15 | ./downsample.py --technique resample --frame_dims 13 \ 16 | ../features/mfcc/CH/ch.eval.gt_words.npz \ 17 | exp/CH/mfcc.eval.gt_words.downsample_10.npz 10 18 | 19 | 20 | Evaluation 21 | ---------- 22 | Evaluate and analyse downsampled MFCCs without deltas: 23 | 24 | ../embeddings/eval_samediff.py --mvn \ 25 | exp/SP/mfcc.dev.gt_words.downsample_10.npz 26 | ../embeddings/analyse_embeds.py --normalize --word_type \ 27 | guatemala,presidente,autoridades,candidatos,asesinato,presupuesto,vicepresidente,negociaciones,netanyahu,social,explotaciones \ 28 | exp/SP/mfcc.dev.gt_words.downsample_10.npz 29 | 30 | 31 | Results 32 | ------- 33 | SWDP average precision: 34 | 35 | - SP dev: 0.14567458 36 | 37 | 38 | *(Deprecated)* SWDP average precision: 39 | 40 | - CH dev: 0.11420457 41 | - CR dev: 0.11620668 42 | - HA dev: 0.11831970 43 | - SP dev: 0.12301926 44 | - SW dev: 0.06808896 45 | - TU dev: 0.13914600 46 | 47 | - GE dev: 0.08031011 48 | - KO dev: 0.13563458 49 | - TH dev: 0.08781202 50 | - VN dev: 0.02734849 51 | 52 | - SP eval: 0.19438775 53 | -------------------------------------------------------------------------------- /src/print_dict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Print the contents of a pickled dictionary. 
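The dictionary entries are printed in sorted key order, one `key : value` pair per line.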
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2020 9 | """ 10 | 11 | from os import path 12 | import argparse 13 | import pickle 14 | import sys 15 | 16 | 17 | #-----------------------------------------------------------------------------# 18 | # UTILITY FUNCTIONS # 19 | #-----------------------------------------------------------------------------# 20 | 21 | def check_argv(): 22 | """Check the command line arguments.""" 23 | parser = argparse.ArgumentParser( 24 | description=__doc__.strip().split("\n")[0], add_help=False 25 | ) 26 | parser.add_argument("pickle_dict_fn", type=str, help="pickled dictionary") 27 | if len(sys.argv) == 1: 28 | parser.print_help() 29 | sys.exit(1) 30 | return parser.parse_args() 31 | 32 | 33 | #-----------------------------------------------------------------------------# 34 | # MAIN FUNCTION # 35 | #-----------------------------------------------------------------------------# 36 | 37 | def main(): 38 | args = check_argv() 39 | 40 | if path.isfile(args.pickle_dict_fn): 41 | with open(args.pickle_dict_fn, "rb") as f: 42 | d = pickle.load(f) 43 | for key in sorted(d): 44 | print(key, ":", d[key]) 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /samediff/get_npz_keys.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Write the keys in a given Numpy archive. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2015, 2018, 2019 9 | """ 10 | 11 | import argparse 12 | import sys 13 | import numpy as np 14 | 15 | #-----------------------------------------------------------------------------# 16 | # UTILITY FUNCTIONS # 17 | #-----------------------------------------------------------------------------# 18 | 19 | def check_argv(): 20 | """Check the command line arguments.""" 21 | parser = argparse.ArgumentParser( 22 | description=__doc__.strip().split("\n")[0], add_help=False 23 | ) 24 | parser.add_argument("npz_fn", type=str, help="the Numpy archive") 25 | parser.add_argument( 26 | "keys_fn", type=str, help="the file to write the keys to" 27 | ) 28 | if len(sys.argv) == 1: 29 | parser.print_help() 30 | sys.exit(1) 31 | return parser.parse_args() 32 | 33 | 34 | #-----------------------------------------------------------------------------# 35 | # MAIN FUNCTION # 36 | #-----------------------------------------------------------------------------# 37 | 38 | def main(): 39 | args = check_argv() 40 | 41 | npz = np.load(args.npz_fn) 42 | 43 | print("Writing keys:", args.keys_fn) 44 | open(args.keys_fn, "w").write("\n".join(npz.keys()) + "\n") 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /samediff/create_labels.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Create a list of the word labels from a list of utterance IDs. 
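Utterance IDs are assumed to take the form `<word>_<speaker>_...`, so the word label is the first underscore-separated field of each ID.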
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2014, 2018, 2019 9 | """ 10 | 11 | import argparse 12 | import codecs 13 | import sys 14 | 15 | 16 | #-----------------------------------------------------------------------------# 17 | # UTILITY FUNCTIONS # 18 | #-----------------------------------------------------------------------------# 19 | 20 | def check_argv(): 21 | """Check the command line arguments.""" 22 | parser = argparse.ArgumentParser( 23 | description=__doc__.strip().split("\n")[0], add_help=False 24 | ) 25 | parser.add_argument("utterance_ids_fn") 26 | parser.add_argument("labels_fn") 27 | if len(sys.argv) == 1: 28 | parser.print_help() 29 | sys.exit(1) 30 | return parser.parse_args() 31 | 32 | 33 | #-----------------------------------------------------------------------------# 34 | # MAIN FUNCTION # 35 | #-----------------------------------------------------------------------------# 36 | 37 | def main(): 38 | 39 | args = check_argv() 40 | 41 | utt_ids = [i.strip() for i in open(args.utterance_ids_fn)] 42 | labels = [] 43 | for utt_id in utt_ids: 44 | word = utt_id.split("_")[0] #"_".join(utt_id.split("_")[:-2]) 45 | labels.append(word) 46 | with codecs.open(args.labels_fn, "w", "utf-8") as f: 47 | for label in labels: 48 | f.write(label + "\n") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /samediff/create_speakers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Create a list of the speaker labels from a list of utterance IDs. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2014, 2018, 2019 9 | """ 10 | 11 | import argparse 12 | import codecs 13 | import sys 14 | 15 | 16 | #-----------------------------------------------------------------------------# 17 | # UTILITY FUNCTIONS # 18 | #-----------------------------------------------------------------------------# 19 | 20 | def check_argv(): 21 | """Check the command line arguments.""" 22 | parser = argparse.ArgumentParser( 23 | description=__doc__.strip().split("\n")[0], add_help=False 24 | ) 25 | parser.add_argument("utterance_ids_fn") 26 | parser.add_argument("labels_fn") 27 | if len(sys.argv) == 1: 28 | parser.print_help() 29 | sys.exit(1) 30 | return parser.parse_args() 31 | 32 | 33 | #-----------------------------------------------------------------------------# 34 | # MAIN FUNCTION # 35 | #-----------------------------------------------------------------------------# 36 | 37 | def main(): 38 | 39 | args = check_argv() 40 | 41 | utt_ids = [i.strip() for i in open(args.utterance_ids_fn)] 42 | labels = [] 43 | for utt_id in utt_ids: 44 | speaker = utt_id.split("_")[1] #"_".join(utt_id.split("_")[:-2]) 45 | labels.append(speaker) 46 | with codecs.open(args.labels_fn, "w", "utf-8") as f: 47 | for label in labels: 48 | f.write(label + "\n") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /qbe/combine_model_output.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Combine `apply_model_dense_seg.py` output into a single Numpy archive. 
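All `search.*.npz` files in the given experiment directory are read, merged into a single dictionary, and written back to one `search.npz` in the same directory.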
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from os import path 12 | from tqdm import tqdm 13 | import argparse 14 | import pickle 15 | import glob 16 | import numpy as np 17 | import sys 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument("exp_dir", type=str, help="experiments directory") 30 | if len(sys.argv) == 1: 31 | parser.print_help() 32 | sys.exit(1) 33 | return parser.parse_args() 34 | 35 | 36 | #-----------------------------------------------------------------------------# 37 | # MAIN FUNCTION # 38 | #-----------------------------------------------------------------------------# 39 | 40 | def main(): 41 | args = check_argv() 42 | 43 | features_dict = {} 44 | for fn in glob.glob(path.join(args.exp_dir, "search.*.npz")): 45 | print("Reading:", fn) 46 | split_features_dict = np.load(fn) 47 | for key in tqdm(split_features_dict): 48 | features_dict[key] = split_features_dict[key] 49 | # print(split_features_dict[key].shape) 50 | print("Total no. utterances:", len(features_dict)) 51 | 52 | fn = path.join(args.exp_dir, "search.npz") 53 | print("Writing:", fn) 54 | np.savez(fn, **features_dict) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /samediff/run_samediff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Calculate distances for same-different evaluation of autoencoder features. 4 | # Herman Kamper, h.kamper@sms.ed.ac.uk, 2014-2015, 2018. 5 | 6 | # General setup 7 | n_cpus=29 8 | 9 | # Input features 10 | features_npz=$1 # features_npz=../data/mfcc_test.npz 11 | if [ -z $features_npz ]; then 12 | echo "usage: ${0} features_npz" 13 | exit 1 14 | fi 15 | if [ ! -f $features_npz ]; then 16 | echo "Error: $features_npz does not exist" 17 | exit 1 18 | fi 19 | 20 | # Files and directories 21 | basename=`basename $features_npz` 22 | basename="${basename%.*}" 23 | samediff_dir=exp/$basename 24 | pairs=$samediff_dir/pairs.list 25 | pairs_split_dir=$samediff_dir/pairs_split 26 | labels=$samediff_dir/labels.list 27 | speakers=$samediff_dir/speakers.list 28 | distances_split_dir=$samediff_dir/distances_split 29 | distances=$samediff_dir/distances.dist 30 | samediff_result=$samediff_dir/samediff_result.txt 31 | 32 | # Make sure that all the jobs are done 33 | complete=`ls $distances_split_dir/distances.*.log | xargs grep "End time" \ 34 | | wc -l` 35 | echo "Number of splits completed: $complete out of $n_cpus" 36 | if [ "$n_cpus" -ne "$complete" ]; then 37 | echo "Error: wait for jobs to complete" 38 | exit 1 39 | fi 40 | 41 | # Concatenate distances 42 | if [ ! -f $distances ]; then 43 | touch $distances 44 | for JOB in $(seq 1 $n_cpus); do 45 | cat $distances_split_dir/distances.$JOB.dist >> $distances 46 | done 47 | fi 48 | 49 | if [ ! -f $samediff_result ]; then 50 | python ../../src/speech_dtw/utils/samediff.py --binary_dists $labels \ 51 | --speakers_fn $speakers $distances > $samediff_result 52 | echo 53 | cat $samediff_result 54 | echo 55 | 56 | if [ $? 
-ne 0 ]; then 57 | echo "Exiting" 58 | rm $samediff_result 59 | exit 1 60 | fi 61 | fi 62 | 63 | # Clean directories 64 | read -p "Clean distances (y/n)? " -n 1 -r 65 | echo 66 | if [[ $REPLY =~ ^[Yy]$ ]]; then 67 | rm -r $pairs $pairs_split_dir $distances $distances_split_dir 68 | fi 69 | -------------------------------------------------------------------------------- /samediff/run_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Run the same command in parallel on an SGE grid. 5 | 6 | As an example, run:: 7 | 8 | ./run_local.py 1 3 log.JOB "echo start;sleep 10;echo finished job JOB" 9 | 10 | The final line of output is the last spawned PID. 11 | 12 | Author: Herman Kamper 13 | Contact: kamperh@gmail.com 14 | Date: 2014, 2018, 2019 15 | """ 16 | 17 | import argparse 18 | import subprocess 19 | import sys 20 | import re 21 | 22 | shell = lambda command: subprocess.Popen( 23 | command, shell=True, stdout=subprocess.PIPE 24 | ).communicate()[0] 25 | 26 | 27 | def check_argv(): 28 | """Check the command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description=__doc__.strip().split("\n")[0], add_help=False 31 | ) 32 | parser.add_argument("JOB_start", type=int, help="JOB id start value") 33 | parser.add_argument( 34 | "JOB_end", type=int, help="JOB id end value (exclusive)" 35 | ) 36 | parser.add_argument( 37 | "log_fn", type=str, 38 | help="log file, substituting JOB for the current id" 39 | ) 40 | parser.add_argument( 41 | "command", type=str, 42 | help="execute this command, substituting JOB for the current" 43 | " id (enclose in quotes if using parameters)" 44 | ) 45 | if len(sys.argv) == 1: 46 | parser.print_help() 47 | sys.exit(1) 48 | return parser.parse_args() 49 | 50 | 51 | def main(): 52 | args = check_argv() 53 | job_start = args.JOB_start 54 | job_end = args.JOB_end 55 | log_fn = args.log_fn 56 | command = args.command 57 | 58 | pid = -1 59 | for i in range(job_start, job_end + 1): 60 | cur_command = re.sub("JOB", str(i), command) 61 | cur_log = re.sub("JOB", str(i), log_fn) 62 | pid = subprocess.Popen( 63 | cur_command, shell=True, stderr=subprocess.STDOUT, 64 | stdout=open(cur_log, "wb") 65 | ).pid 66 | print("Spawning job " + str(i) + " with PID:", pid) 67 | print(pid) 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /samediff/run_calcdists.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Calculate distances for same-different evaluation. 4 | # Herman Kamper, kamperh@gmail.com, 2014-2015, 2018-2019. 5 | 6 | set -e 7 | 8 | # General setup 9 | n_cpus=29 10 | cmd="python run_local.py" 11 | # cmd="./local/run_sge.py --extraargs -P inf_hcrc_cstr_students" 12 | export PYTHONUNBUFFERED="YOUR_SET" # flush after every Python print statement 13 | 14 | 15 | # Input features 16 | features_npz=$1 17 | if [ -z $features_npz ]; then 18 | echo "usage: ${0} features_npz" 19 | exit 1 20 | fi 21 | if [ ! 
-f $features_npz ]; then 22 | echo "Error: $features_npz does not exist" 23 | exit 1 24 | fi 25 | 26 | # Files and directories 27 | basename=`basename $features_npz` 28 | basename="${basename%.*}" 29 | samediff_dir=exp/$basename 30 | utterance_ids=$samediff_dir/utterance_ids.list 31 | pairs=$samediff_dir/pairs.list 32 | pairs_split_dir=$samediff_dir/pairs_split 33 | labels=$samediff_dir/labels.list 34 | speakers=$samediff_dir/speakers.list 35 | distances_split_dir=$samediff_dir/distances_split 36 | distances=$samediff_dir/distances.dist 37 | 38 | # Create samediff dir 39 | [ ! -d $samediff_dir ] && mkdir -p $samediff_dir 40 | 41 | # Create utterance IDs and label files 42 | [ ! -f $utterance_ids ] && python get_npz_keys.py $features_npz $utterance_ids 43 | [ ! -f $labels ] && python create_labels.py $utterance_ids $labels 44 | [ ! -f $speakers ] && python create_speakers.py $utterance_ids $speakers 45 | 46 | # Generate a list of all possible pairs and split for parallel processing 47 | [ ! -f $pairs ] && python ../../src/speech_dtw/utils/create_pair_file.py \ 48 | $utterance_ids $pairs 49 | [ ! -d $pairs_split_dir ] && ../../src/speech_dtw/utils/split_file.py \ 50 | $pairs $n_cpus $pairs_split_dir 51 | 52 | # Calculate DTW distances 53 | if [ ! -d $distances_split_dir ]; then 54 | mkdir -p $distances_split_dir 55 | dist_cmd="python ../../src/speech_dtw/utils/calculate_dtw_costs.py \ 56 | --binary_dists --input_fmt npz $pairs_split_dir/pairs.JOB.list \ 57 | $features_npz $distances_split_dir/distances.JOB.dist" 58 | $cmd 1 $n_cpus $distances_split_dir/distances.JOB.log "$dist_cmd" 59 | fi 60 | 61 | echo "Wait to complete, then run run_samediff.sh" 62 | -------------------------------------------------------------------------------- /blackbox/npz_to_tsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Convert a NumPy archive to a TSV file for visualising embeddings. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from os import path 12 | from tqdm import tqdm 13 | import argparse 14 | import numpy as np 15 | import sys 16 | 17 | 18 | #-----------------------------------------------------------------------------# 19 | # UTILITY FUNCTIONS # 20 | #-----------------------------------------------------------------------------# 21 | 22 | def check_argv(): 23 | """Check the command line arguments.""" 24 | parser = argparse.ArgumentParser( 25 | description=__doc__.strip().split("\n")[0], add_help=False 26 | ) 27 | parser.add_argument("npz_fn", type=str, help="input NumPy archive") 28 | parser.add_argument( 29 | "tsv_fn", type=str, help="output TSV file; if 'auto', then an output " 30 | "filename is generated automatically based on the input filename" 31 | ) 32 | if len(sys.argv) == 1: 33 | parser.print_help() 34 | sys.exit(1) 35 | return parser.parse_args() 36 | 37 | 38 | #-----------------------------------------------------------------------------# 39 | # MAIN FUNCTION # 40 | #-----------------------------------------------------------------------------# 41 | 42 | def main(): 43 | args = check_argv() 44 | 45 | print("Reading:", args.npz_fn) 46 | features = np.load(args.npz_fn) 47 | 48 | if args.tsv_fn == "auto": 49 | npz_fn_split = path.split(args.npz_fn) 50 | args.tsv_fn = ( 51 | path.split(npz_fn_split[-2])[-1] + "." 
+ 52 | path.splitext(npz_fn_split[-1])[0] + ".tsv" 53 | ) 54 | metadata_fn = args.tsv_fn + ".metadata" 55 | print("Writing:", args.tsv_fn) 56 | print("Writing:", metadata_fn) 57 | with open(args.tsv_fn, "w") as f_tsv, open(metadata_fn, "w") as f_metadata: 58 | f_metadata.write("word\tspeaker\n") 59 | for utt_key in tqdm(sorted(features)): 60 | f_tsv.write( 61 | "\t".join(["{:.5f}".format(i) for i in features[utt_key]]) + 62 | "\n" 63 | ) 64 | utt_key_split = utt_key.split("_") 65 | f_metadata.write(utt_key_split[0] + "\t" + utt_key_split[1] + "\n") 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /qbe/.ipynb_checkpoints/sandbox-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sandox: Hauso QbE" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Herman Kamper, Stellenbosch University, 2018-2019." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Preliminaries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "%matplotlib inline\n", 31 | "%load_ext autoreload\n", 32 | "%autoreload 2\n", 33 | "\n", 34 | "from os import path\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import numpy as np" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Keywords" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 8, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "dev_keywords_fn = \"../features/mfcc/HA/ha.dev.gt_words.npz\"\n", 53 | "test_fn = \"../features/mfcc/HA/ha.eval.npz\"\n", 54 | "dev_keywords_features = np.load(dev_keywords_fn)\n", 55 | "test_features = np.load(test_fn)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "'HA002_94'" 67 | ] 68 | }, 69 | "execution_count": 9, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "list(test_features)[0]" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "def read_forced_alignment(globalphone_fa_fn):\n", 85 | " \"\"\"Read a GlobalPhone forced alignment file.\"\"\"\n", 86 | "\n", 87 | "test_transcript = read_forced_alignment(\"/home/kamperh/endgame/datasets/globalphone_alignments/HA/eval.ctm\")" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.5.2" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /blackbox/readme.md: -------------------------------------------------------------------------------- 1 | Black-Box Analysis of Embedding Models 2 | ====================================== 3 | 4 | Extract features for analysis 5 | 
----------------------------- 6 | While the default evaluation data (typically including a `gt_words` tag) are 7 | extracted with a minimum duration of 0.5 seconds at at least 5 characters, it 8 | is useful to do analysis on a larger range of word segments. This is done in 9 | the script below. 10 | 11 | Extract features and perform intermediate analysis: 12 | 13 | ./extract_analysis_features.py --analyse RU 14 | 15 | 16 | Process features with model 17 | --------------------------- 18 | The extracted features would typically be passed through a model. 19 | 20 | For instance, to obtain downsampled embeddings, run: 21 | 22 | cd ../downsample 23 | ./downsample.py --technique resample --frame_dims 13 \ 24 | ../blackbox/mfcc/GE/ge.dev.filter1_gt.npz \ 25 | exp/GE/mfcc.dev.filter1_gt.downsample_10.npz 10 26 | ../embeddings/eval_samediff.py --mvn \ 27 | exp/GE/mfcc.dev.filter1_gt.downsample_10.npz 28 | cd - 29 | 30 | To obtain embeddings from a particular model, run: 31 | 32 | cd ../embeddings 33 | ./apply_model_to_npz.py \ 34 | models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ckpt \ 35 | ../blackbox/mfcc/GE/ge.dev.filter1_gt.npz 36 | ./eval_samediff.py --mvn \ 37 | models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ge.dev.filter1_gt.npz 38 | cd - 39 | 40 | 41 | t-SNE visualisation 42 | ------------------- 43 | To visualise embeddings, https://projector.tensorflow.org/ can be used. To 44 | generate the input required by this tool, run: 45 | 46 | ./npz_to_tsv.py \ 47 | ../embeddings/models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ge.dev.filter1_gt.npz 48 | 49 | and load the data into the tool. 50 | 51 | 52 | Agglomerative clustering 53 | ------------------------ 54 | Clustering can be applied and visualised by running: 55 | 56 | ./hierarchical_clustering.py --n_samples 1000 \ 57 | ../embeddings/models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ge.dev.filter1_gt.npz 58 | 59 | Here, the colouring in the labels indicate the speaker for that token. 60 | 61 | 62 | Classifier analysis 63 | ------------------- 64 | Perform speaker classification by training a multi-class logistic regression 65 | classifier on 80% of the data and then test on the remaining 20%: 66 | 67 | ./logreg_speaker.py \ 68 | ../downsample/exp/GE/mfcc.dev.gt_words.downsample_10.npz 69 | ./logreg_speaker.py \ 70 | ../embeddings/models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.GE.val.npz 71 | 72 | Perform length (number of phones) classification: 73 | 74 | # To-do: should train and test on different sets here 75 | ./logreg_pronlength.py \ 76 | ../downsample/exp/GE/mfcc.dev.filter1_gt.downsample_10.npz GE 77 | ./logreg_pronlength.py \ 78 | ../embeddings/models/GE.gt/train_cae_rnn/15b3ecce63/cae.best_val.ge.dev.filter1_gt.npz GE 79 | -------------------------------------------------------------------------------- /qbe/readme.md: -------------------------------------------------------------------------------- 1 | Query-by-Example Search on Hausa 2 | ================================ 3 | 4 | Overview 5 | -------- 6 | Queries are extracted from validation data and the evaluation data is treated 7 | as the search collection. 
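Both pipelines below end with a nested cost dictionary, indexed first by query key and then by search-utterance key, saved as a pickle file (this is the structure written by `get_dtw_costs.py`). The snippet below is only an illustrative sketch of how such a dictionary could be inspected once the steps in the following sections have been run; the pickle path is the one from the DTW example further down.

    import pickle

    # Load a QbE cost dictionary (produced by the steps in the sections below)
    with open("exp/HA/dtw/cost_dict.pkl", "rb") as f:
        cost_dict = pickle.load(f)

    # Rank the search utterances for one query by ascending DTW cost
    # (a lower cost indicates a better match)
    query_key = sorted(cost_dict)[0]
    ranked = sorted(cost_dict[query_key].items(), key=lambda kv: kv[1])
    print("Query:", query_key)
    for search_key, cost in ranked[:10]:
        print("{:.4f}  {}".format(cost, search_key))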
8 | 9 | 10 | Prepare and link data 11 | --------------------- 12 | Extract features and link the required speech features: 13 | 14 | ./extract_queries_link_search.py HA 15 | 16 | Extract the search intervals: 17 | 18 | ./data_prep_dense_seg.py --min_frames 20 --max_frames 60 --step 3 \ 19 | --n_splits 2 HA 20 | 21 | 22 | DTW-based QbE 23 | ------------- 24 | Get QbE costs and write these to file: 25 | 26 | ./get_dtw_costs.py --n_cpus 29 HA 27 | 28 | Evaluate QbE performance: 29 | 30 | ./eval_qbe.py HA exp/HA/dtw/cost_dict.pkl 31 | 32 | HA results: 33 | 34 | Avg. duration per comparison: 0.057 sec 35 | --------------------------------------------------------------------------- 36 | EER: 0.2655, avg: 0.2918, median: 0.2783, max: 0.4505, min: 0.1844 37 | AUC: 0.8002, avg: 0.7724, median: 0.7960, max: 0.8766, min: 0.5752 38 | P@10: 0.4139, avg: 0.3468, median: 0.3550, max: 0.5433, min: 0.0933 39 | P@N: 0.3257, avg: 0.2870, median: 0.2937, max: 0.4471, min: 0.0836 40 | --------------------------------------------------------------------------- 41 | 42 | 43 | Embedding-based QbE 44 | ------------------- 45 | Apply a CAE-RNN to the dense intervals for the different splits: 46 | 47 | ./apply_model_dense.py \ 48 | ../embeddings/models/HA.utd/train_cae_rnn/5addd62282/cae.best_val.ckpt \ 49 | HA search.0 50 | ./apply_model_dense.py \ 51 | ../embeddings/models/HA.utd/train_cae_rnn/5addd62282/cae.best_val.ckpt \ 52 | HA search.1 53 | 54 | Combine the splits: 55 | 56 | ./combine_model_output.py exp/HA/5addd62282.min_20.max_60.step_3 57 | 58 | Remove split files: 59 | 60 | rm exp/HA/*/search.?.npz 61 | 62 | Embed the queries: 63 | 64 | ../embeddings/apply_model_to_npz.py \ 65 | ../embeddings/models/HA.utd/train_cae_rnn/5addd62282/cae.best_val.ckpt \ 66 | data/HA/queries.npz \ 67 | --output_npz_fn exp/HA/5addd62282.min_20.max_60.step_3/queries.npz 68 | 69 | Apply normalisation: 70 | 71 | ./dense_seg_mvn.py exp/HA/5addd62282.min_20.max_60.step_3 72 | 73 | Calculate costs: 74 | 75 | # Unnormalised 76 | ./get_dense_seg_costs.py exp/HA/5addd62282.min_20.max_60.step_3 77 | # MVN 78 | ./get_dense_seg_costs.py exp/HA/mvn.5addd62282.min_20.max_60.step_3 79 | 80 | Evaluate QbE performance: 81 | 82 | # Unnormalised 83 | ./eval_qbe.py HA \ 84 | exp/HA/5addd62282.min_20.max_60.step_3/cost_dict.cosine.pkl 85 | # MVN 86 | ./eval_qbe.py HA \ 87 | exp/HA/mvn.5addd62282.min_20.max_60.step_3/cost_dict.cosine.pkl 88 | 89 | HA results with normalisation: 90 | 91 | Avg. duration per comparison: 0.00061147 sec 92 | --------------------------------------------------------------------------- 93 | EER: 0.2354, avg: 0.2720, median: 0.2419, max: 0.4669, min: 0.1669 94 | AUC: 0.8254, avg: 0.7879, median: 0.8284, max: 0.8813, min: 0.5541 95 | P@10: 0.3745, avg: 0.3045, median: 0.3133, max: 0.5233, min: 0.0600 96 | P@N: 0.3053, avg: 0.2515, median: 0.2618, max: 0.4180, min: 0.0528 97 | --------------------------------------------------------------------------- 98 | -------------------------------------------------------------------------------- /qbe/extract_queries_link_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Extract queries and link search datasets for a particular GlobalPhone language. 
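Development tokens whose word labels appear in `../data/<language>/keywords.txt` are extracted as queries, and the corresponding evaluation features are linked as the search collection.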
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from os import path 12 | from tqdm import tqdm 13 | import argparse 14 | import codecs 15 | import numpy as np 16 | import os 17 | import sys 18 | 19 | sys.path.append(path.join("..", "embeddings")) 20 | 21 | from link_mfcc import link_features 22 | 23 | 24 | #-----------------------------------------------------------------------------# 25 | # UTILITY FUNCTIONS # 26 | #-----------------------------------------------------------------------------# 27 | 28 | def check_argv(): 29 | """Check the command line arguments.""" 30 | parser = argparse.ArgumentParser( 31 | description=__doc__.strip().split("\n")[0], add_help=False 32 | ) 33 | parser.add_argument( 34 | "language", type=str, help="GlobalPhone language", 35 | choices=["HA"] 36 | ) 37 | if len(sys.argv) == 1: 38 | parser.print_help() 39 | sys.exit(1) 40 | return parser.parse_args() 41 | 42 | 43 | #-----------------------------------------------------------------------------# 44 | # MAIN FUNCTION # 45 | #-----------------------------------------------------------------------------# 46 | 47 | def main(): 48 | args = check_argv() 49 | 50 | # Create feature/link directory 51 | feat_dir = path.join("data", args.language) 52 | if not path.isdir(feat_dir): 53 | os.makedirs(feat_dir) 54 | 55 | # Read keywords 56 | keywords_fn = path.join("..", "data", args.language, "keywords.txt") 57 | with codecs.open(keywords_fn, "r", "utf-8") as f: 58 | keywords = [line.strip() for line in f] 59 | print("No. keywords:", len(keywords)) 60 | 61 | # Extract queries from development data 62 | queries_feat_fn = path.join(feat_dir, "queries.npz") 63 | if not path.isfile(queries_feat_fn): 64 | dev_feat_fn = path.join( 65 | "..", "features", "mfcc", args.language, args.language.lower() + 66 | ".dev.gt_words.npz" 67 | ) 68 | assert path.isfile(dev_feat_fn), "file not found: " + dev_feat_fn 69 | print("Reading:", dev_feat_fn) 70 | dev_feat_dict = np.load(dev_feat_fn) 71 | print("Extracting queries:") 72 | queries_feat_dict = {} 73 | for utterance_key in tqdm(dev_feat_dict): 74 | label = utterance_key.split("_")[0] 75 | if label in keywords: 76 | queries_feat_dict[utterance_key] = dev_feat_dict[utterance_key] 77 | print("No. queries tokens:", len(queries_feat_dict)) 78 | print("Writing:", queries_feat_fn) 79 | np.savez(queries_feat_fn, **queries_feat_dict) 80 | else: 81 | print("Using existing file:", queries_feat_fn) 82 | 83 | # Link test search utterances 84 | search_feat_fn = path.join( 85 | "..", "..", "..", "features", "mfcc", args.language, 86 | args.language.lower() + ".eval.npz" 87 | ) # relative path 88 | link_fn = path.join(feat_dir, "search.npz") 89 | link_features(search_feat_fn, link_fn, feat_dir) 90 | 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /qbe/dense_seg_mvn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Perform mean and variance normalisation. 
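The mean and variance are estimated on the search collection and applied to both the queries and the search segments; the normalised archives are written to a new `mvn.`-prefixed experiment directory.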
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from tqdm import tqdm 14 | import argparse 15 | import numpy as np 16 | import os 17 | import sys 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument("exp_dir", type=str, help="experiments directory") 30 | if len(sys.argv) == 1: 31 | parser.print_help() 32 | sys.exit(1) 33 | return parser.parse_args() 34 | 35 | 36 | #-----------------------------------------------------------------------------# 37 | # MAIN FUNCTION # 38 | #-----------------------------------------------------------------------------# 39 | 40 | def main(): 41 | args = check_argv() 42 | 43 | print(datetime.now()) 44 | 45 | # Read queries 46 | fn = path.join(args.exp_dir, "queries.npz") 47 | if not path.isfile(fn): 48 | import re 49 | fn = path.join( 50 | re.sub("min\_.*step\_\d*\.", "", args.exp_dir), "queries.npz" 51 | ) 52 | print("Reading:", fn) 53 | queries_dict = np.load(fn) 54 | print("No. queries:", len(queries_dict.keys())) 55 | 56 | # Read search collection 57 | fn = path.join(args.exp_dir, "search.npz") 58 | print("Reading:", fn) 59 | search_dict = np.load(fn) 60 | print("No. search utterances:", len(search_dict.keys())) 61 | 62 | # Calculate mean and variance 63 | search_stacked = np.vstack([search_dict[i] for i in search_dict]) 64 | mean = np.mean(search_stacked, axis=0) 65 | std = np.std(search_stacked, axis=0) 66 | std[std == 0] = np.mean(std) # hack 67 | 68 | # Apply normalisation 69 | mvn_queries_dict = {} 70 | print("Normalising queries:") 71 | for query_key in tqdm(queries_dict): 72 | mvn_queries_dict[query_key] = ( 73 | np.array(queries_dict[query_key]) - mean 74 | ) / std 75 | print("No. queries:", len(mvn_queries_dict)) 76 | mvn_search_dict = {} 77 | print("Normalising search utterances:") 78 | for search_key in tqdm(search_dict): 79 | mvn_search_dict[search_key] = ( 80 | np.array(search_dict[search_key]) - mean 81 | ) / std 82 | print("No. search utterances:", len(mvn_search_dict)) 83 | 84 | print(datetime.now()) 85 | 86 | # Create output directory 87 | exp_dir = path.normpath(args.exp_dir) 88 | output_dir = path.join( 89 | path.split(exp_dir)[0], "mvn." + path.split(exp_dir)[1] 90 | ) 91 | if not path.isdir(output_dir): 92 | os.makedirs(output_dir) 93 | 94 | # Write normalized Numpy archives 95 | fn = path.join(output_dir, "queries.npz") 96 | print("Writing:", fn) 97 | np.savez(fn, **mvn_queries_dict) 98 | fn = path.join(output_dir, "search.npz") 99 | print("Writing:", fn) 100 | np.savez(fn, **mvn_search_dict) 101 | 102 | print(datetime.now()) 103 | 104 | 105 | if __name__ == "__main__": 106 | main() 107 | -------------------------------------------------------------------------------- /qbe/get_dtw_costs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Obtain the QbE costs for a given set of queries and search utterances. 
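Each query is swept over every search utterance using DTW, and the resulting costs are written to `exp/<feature_label>/dtw/cost_dict.pkl` as a nested dictionary indexed by query key and then by search key.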
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | import argparse 14 | import pickle 15 | import numpy as np 16 | import os 17 | import sys 18 | import timeit 19 | 20 | sys.path.append(path.join("..", "..", "src", "speech_dtw")) 21 | 22 | from speech_dtw import qbe 23 | 24 | 25 | #-----------------------------------------------------------------------------# 26 | # UTILITY FUNCTIONS # 27 | #-----------------------------------------------------------------------------# 28 | 29 | def check_argv(): 30 | """Check the command line arguments.""" 31 | parser = argparse.ArgumentParser( 32 | description=__doc__.strip().split("\n")[0], add_help=False 33 | ) 34 | parser.add_argument( 35 | "--n_cpus", type=int, 36 | help="number of CPUs to parallelise over (default: %(default)s)", 37 | default=1 38 | ) 39 | parser.add_argument( 40 | "feature_label", type=str, 41 | help="identifier for the set of queries and search utterances" 42 | ) 43 | if len(sys.argv) == 1: 44 | parser.print_help() 45 | sys.exit(1) 46 | return parser.parse_args() 47 | 48 | 49 | #-----------------------------------------------------------------------------# 50 | # MAIN FUNCTION # 51 | #-----------------------------------------------------------------------------# 52 | 53 | def main(): 54 | args = check_argv() 55 | 56 | print(datetime.now()) 57 | 58 | # Read queries into a list 59 | fn = path.join("data", args.feature_label, "queries.npz") 60 | print("Reading:", fn) 61 | queries_dict = np.load(fn) 62 | queries_keys = sorted(queries_dict.keys()) 63 | queries_list = [ 64 | np.asarray(queries_dict[i], np.double) for i in queries_keys 65 | ] 66 | print("No. queries:", len(queries_list)) 67 | 68 | # Read search collection into a list 69 | fn = path.join("data", args.feature_label, "search.npz") 70 | print("Reading:", fn) 71 | search_dict = np.load(fn) 72 | search_keys = sorted(search_dict.keys()) 73 | search_list = [ 74 | np.asarray(search_dict[i], np.double) for i in search_keys 75 | ] 76 | print("No. search items:", len(search_list)) 77 | 78 | print(datetime.now()) 79 | 80 | # Perform QbE 81 | print("Calculating costs: {} cores".format(args.n_cpus)) 82 | start_time = timeit.default_timer() 83 | dtw_costs = qbe.parallel_dtw_sweep_min( 84 | queries_list, search_list, n_cpus=args.n_cpus 85 | ) 86 | end_time = timeit.default_timer() 87 | duration = end_time - start_time 88 | print(datetime.now()) 89 | print( 90 | "Avg. 
duration per comparison: {:.8f} sec".format(duration * 91 | args.n_cpus / (len(queries_list) * len(search_list))) 92 | ) 93 | 94 | # Write costs 95 | cost_dict = {} 96 | for i_query, key_query in enumerate(queries_keys): 97 | if key_query not in cost_dict: 98 | cost_dict[key_query] = {} 99 | for i_search, key_search in enumerate(search_keys): 100 | cost_dict[key_query][key_search] = dtw_costs[i_query][i_search] 101 | output_dir = path.join("exp", args.feature_label, "dtw") 102 | if not path.isdir(output_dir): 103 | os.makedirs(output_dir) 104 | fn = path.join(output_dir, "cost_dict.pkl") 105 | print("Writing:", fn) 106 | with open(fn, "wb") as f: 107 | pickle.dump(cost_dict, f, -1) 108 | 109 | print(datetime.now()) 110 | 111 | 112 | if __name__ == "__main__": 113 | main() 114 | -------------------------------------------------------------------------------- /downsample/downsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Downsample a given file using a particular technique and target dimensionality. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2015, 2018, 2019 9 | """ 10 | 11 | import argparse 12 | import numpy as np 13 | import scipy.interpolate as interpolate 14 | import scipy.signal as signal 15 | import sys 16 | 17 | flatten_order = "C" 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument("input_npz_fn", type=str, help="input speech file") 30 | parser.add_argument( 31 | "output_npz_fn", type=str, help="output embeddings file" 32 | ) 33 | parser.add_argument("n", type=int, help="number of samples") 34 | parser.add_argument( 35 | "--technique", choices=["interpolate", "resample", "rasanen"], 36 | default="resample" 37 | ) 38 | parser.add_argument( 39 | "--frame_dims", type=int, default=None, 40 | help="only keep these number of dimensions" 41 | ) 42 | if len(sys.argv) == 1: 43 | parser.print_help() 44 | sys.exit(1) 45 | return parser.parse_args() 46 | 47 | 48 | #-----------------------------------------------------------------------------# 49 | # MAIN FUNCTION # 50 | #-----------------------------------------------------------------------------# 51 | 52 | def main(): 53 | args = check_argv() 54 | 55 | print("Reading:", args.input_npz_fn) 56 | input_npz = np.load(args.input_npz_fn) 57 | d_frame = input_npz[sorted(input_npz.keys())[0]].shape[1] 58 | 59 | print("Frame dimensionality:", d_frame) 60 | if args.frame_dims is not None and args.frame_dims < d_frame: 61 | d_frame = args.frame_dims 62 | print("Reducing frame dimensionality:", d_frame) 63 | 64 | print("Downsampling:", args.technique) 65 | output_npz = {} 66 | for key in input_npz: 67 | 68 | # Limit input dimensionailty 69 | y = input_npz[key][:, :args.frame_dims].T 70 | 71 | # Downsample 72 | if args.technique == "interpolate": 73 | x = np.arange(y.shape[1]) 74 | f = interpolate.interp1d(x, y, kind="linear") 75 | x_new = np.linspace(0, y.shape[1] - 1, args.n) 76 | y_new = f(x_new).flatten(flatten_order) #.flatten("F") 77 | elif args.technique == "resample": 78 | y_new = signal.resample( 79 | y, args.n, axis=1 80 | ).flatten(flatten_order) #.flatten("F") 81 | elif 
args.technique == "rasanen": 82 | # Taken from Rasenen et al., Interspeech, 2015 83 | n_frames_in_multiple = int(np.floor(y.shape[1] / args.n)) * args.n 84 | y_new = np.mean( 85 | y[:, :n_frames_in_multiple].reshape((d_frame, args.n, -1)), 86 | axis=-1 87 | ).flatten(flatten_order) #.flatten("F") 88 | 89 | # This was done in Rasenen et al., 2015, but didn't help here 90 | # last_term = args.n/3. * np.log10(y.shape[1] * 10e-3) 91 | # Not sure if the above should be in frames or ms 92 | # y_new = np.hstack([y_new, last_term]) 93 | 94 | # Save result 95 | output_npz[key] = y_new 96 | 97 | print( 98 | "Output dimensionality:", 99 | output_npz[sorted(output_npz.keys())[0]].shape[0] 100 | ) 101 | 102 | print("Writing:", args.output_npz_fn) 103 | np.savez_compressed(args.output_npz_fn, **output_npz) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /blackbox/logreg_speaker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Use logistic regression for speaker classification. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.metrics import accuracy_score, classification_report 14 | from tqdm import tqdm 15 | import argparse 16 | import numpy as np 17 | import sys 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument("npz_fn", type=str, help="NumPy archive of embeddings") 30 | if len(sys.argv) == 1: 31 | parser.print_help() 32 | sys.exit(1) 33 | return parser.parse_args() 34 | 35 | 36 | #-----------------------------------------------------------------------------# 37 | # MAIN FUNCTION # 38 | #-----------------------------------------------------------------------------# 39 | 40 | def main(): 41 | args = check_argv() 42 | 43 | print("Reading:", args.npz_fn) 44 | embeddings = np.load(args.npz_fn) 45 | 46 | # # Temp 47 | # import random 48 | # data = {} 49 | # a = list(embeddings) 50 | # random.shuffle(a) 51 | # for key in a[:100]: 52 | # data[key] = embeddings[key] 53 | # embeddings = data 54 | 55 | print("Ordering embeddings:") 56 | n_embeds = 0 57 | X = [] 58 | utt_keys = [] 59 | words = [] 60 | speakers = [] 61 | for utt_key in tqdm(sorted(embeddings)): 62 | utt_keys.append(utt_key) 63 | X.append(embeddings[utt_key]) 64 | utt_key = utt_key.split("_") 65 | word = utt_key[0] 66 | speaker = utt_key[1] 67 | words.append(word) 68 | speakers.append(speaker) 69 | X = np.array(X) 70 | print("No. embeddings:", X.shape[0]) 71 | print("Embedding dimensionality:", X.shape[1]) 72 | 73 | # Convert words to IDs 74 | speaker_set = set(speakers) 75 | speaker_to_id = dict( 76 | zip(sorted(list(speaker_set)), range(len(speaker_set))) 77 | ) 78 | id_to_speaker = dict([[v,k] for k, v in speaker_to_id.items()]) 79 | y = [] 80 | for speaker in speakers: 81 | y.append(speaker_to_id[speaker]) 82 | y = np.array(y, dtype=int) 83 | print("No. 
speakers:", len(speaker_to_id)) 84 | 85 | # Split training and test sets 80/20 86 | indices = np.arange(X.shape[0]) 87 | np.random.seed(1) 88 | np.random.shuffle(indices) 89 | n_train = int(round(X.shape[0]*0.8)) 90 | X_train = X[indices[:n_train]] 91 | X_test = X[indices[n_train:]] 92 | y_train = y[indices[:n_train]] 93 | y_test = y[indices[n_train:]] 94 | print("Training data shape:", X_train.shape) 95 | print("Test data shape:", X_test.shape) 96 | 97 | # Multi-class logistic regression 98 | print(datetime.now()) 99 | print("Fitting multi-class logistic regression model") 100 | logreg = LogisticRegression( 101 | C=1e5, solver="lbfgs", multi_class="multinomial" 102 | # solver="lbfgs", multi_class="ovr", max_iter=200 103 | ) 104 | logreg.fit(X_train, y_train) 105 | print(datetime.now()) 106 | 107 | # Predict classes 108 | y_pred = logreg.predict(X_test) 109 | accuracy = accuracy_score(y_test, y_pred) 110 | 111 | print("Speaker classification accuracy: {:.2f}%".format(accuracy*100)) 112 | print( 113 | classification_report(y_test, y_pred, 114 | target_names=[id_to_speaker[i] for i in range(max(y) + 1)]) 115 | ) 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /embeddings/readme.md: -------------------------------------------------------------------------------- 1 | Acoustic Word Embedding Models and Evaluation 2 | ============================================= 3 | 4 | Overview 5 | -------- 6 | The examples below are intended for illustration purposes -- there are many 7 | different language-combinations and other settings which can be adjusted for 8 | the different models. But the most important command-line arguments are 9 | illustrated in the examples here. 10 | 11 | 12 | Data preparation 13 | ---------------- 14 | Create links to the MFCC NumPy archives: 15 | 16 | ./link_mfcc.py SP 17 | 18 | You need to run `link_mfcc.py` for all languages; run it without any arguments 19 | to see all 16 language codes. Alternatively, links can be greated for all 20 | languages by passing the "all" argument. 21 | 22 | 23 | Autoencoder RNN 24 | --------------- 25 | Train an AE-RNN on Spanish UTD segments: 26 | 27 | ./train_cae_rnn.py --extrinsic_usefinal --ae_n_val_interval 9 \ 28 | --ae_n_epochs 10 --cae_n_epochs 0 --train_tag utd --val_lang SP SP 29 | 30 | Train an AE-RNN on seven languages using ground truth segments and validate on 31 | German: 32 | 33 | ./train_cae_rnn.py --ae_n_epochs 25 --cae_n_epochs 0 \ 34 | --n_max_types 1000 --train_tag gt --val_lang GE RU+CZ+FR+PL+TH+PO 35 | 36 | 37 | Correspondence autoencoder RNN 38 | ------------------------------ 39 | Train a CAE-RNN on Spanish UTD segments: 40 | 41 | ./train_cae_rnn.py --pretrain_usefinal --extrinsic_usefinal \ 42 | --ae_n_val_interval 14 --ae_n_epochs 15 --cae_n_epochs 3 \ 43 | --cae_batch_size 600 --train_tag utd --val_lang SP SP 44 | 45 | Evaluate the model: 46 | 47 | ./apply_model.py \ 48 | models/SP.utd/train_cae_rnn/17b498a959/cae.best_val.ckpt SP val 49 | ./eval_samediff.py --mvn \ 50 | models/SP.utd/train_cae_rnn/17b498a959/cae.best_val.SP.val.npz 51 | 52 | Analyse embeddings: 53 | 54 | ./analyse_embeds.py --normalize --word_type \ 55 | guatemala,presidente,autoridades,candidatos,vicepresidente,social \ 56 | models/SP.utd/train_cae_rnn/17b498a959/cae.best_val.SP.val.npz 57 | 58 | All the models trained below can be applied, evaluated and analysed using the 59 | scripts above. 
60 | 61 | Train a CNN-RNN on Spanish ground truth segments: 62 | 63 | ./train_cae_rnn.py --pretrain_usefinal --n_max_pairs 100000 \ 64 | --ae_n_val_interval 14 --ae_n_epochs 15 --cae_n_epochs 25 \ 65 | --train_tag gt --val_lang SP SP 66 | 67 | Train a CAE-RNN jointly on multiple languages, limiting the maximum overall 68 | number of pairs, the maximum number of types per language and requiring a 69 | minimum number of tokens per type: 70 | 71 | ./train_cae_rnn.py --pretrain_usefinal --ae_n_val_interval 14 \ 72 | --ae_n_epochs 15 --cae_n_epochs 10 --n_max_pairs 300000 \ 73 | --n_min_tokens_per_type 2 --n_max_types 1000 --train_tag gt \ 74 | --val_lang GE RU+CZ+FR+PL+TH+PO 75 | 76 | 77 | Siamese RNN 78 | ----------- 79 | Train a Siamese RNN on ground truth segments: 80 | 81 | ./train_siamese_rnn.py --n_epochs 25 --train_tag gt --val_lang SP SP 82 | 83 | Train a Siamese RNN ensuring that each batch contains paired data, i.e., no 84 | batch will have a singleton token: 85 | 86 | ./train_siamese_rnn_pairbatch.py --n_epochs 15 --train_tag gt \ 87 | --margin 0.2 --val_lang GE GE 88 | 89 | 90 | Siamese CNN 91 | ----------- 92 | Train a Siamese CNN on ground truth segments: 93 | 94 | ./train_siamese_cnn.py --n_epochs 150 --train_tag gt --n_val_interval 5 SP 95 | 96 | 97 | Classifier CNN 98 | -------------- 99 | Train a word classifier CNN on ground truth segments: 100 | 101 | ./train_cnn.py --n_epochs 100 --train_tag gt --n_val_interval 5 SP 102 | 103 | 104 | Classifier RNN 105 | -------------- 106 | Train a word classifier RNN on ground truth segments: 107 | 108 | ./train_rnn.py --n_epochs 25 --train_tag gt --val_lang SP SP 109 | 110 | Train a word classifier RNN jointly on multiple languages: 111 | 112 | ./train_rnn.py --n_epochs 15 --train_tag gt --n_max_types 10000 \ 113 | --n_max_tokens_per_type 20 --val_lang GE RU+CZ+FR+PL+TH+PO 114 | 115 | -------------------------------------------------------------------------------- /blackbox/hierarchical_clustering.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Apply agglomerative clustering to embeddings and plot a labelled dendrogram. 
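Leaf labels give the word type of each embedding and are coloured according to the speaker of that token.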
5 | 6 | See 7 | https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/ 8 | 9 | Author: Herman Kamper 10 | Contact: kamperh@gmail.com 11 | Date: 2019 12 | """ 13 | 14 | from os import path 15 | from scipy.cluster.hierarchy import dendrogram, linkage 16 | from tqdm import tqdm 17 | import argparse 18 | import matplotlib.pyplot as plt 19 | import numpy as np 20 | import random 21 | import sys 22 | 23 | 24 | #-----------------------------------------------------------------------------# 25 | # UTILITY FUNCTIONS # 26 | #-----------------------------------------------------------------------------# 27 | 28 | def check_argv(): 29 | """Check the command line arguments.""" 30 | parser = argparse.ArgumentParser( 31 | description=__doc__.strip().split("\n")[0], add_help=False 32 | ) 33 | parser.add_argument("npz_fn", type=str, help="NumPy archive of embeddings") 34 | parser.add_argument( 35 | "--n_samples", type=int, 36 | help="if given, the embeddings are subsampled" 37 | ) 38 | if len(sys.argv) == 1: 39 | parser.print_help() 40 | sys.exit(1) 41 | return parser.parse_args() 42 | 43 | 44 | #-----------------------------------------------------------------------------# 45 | # MAIN FUNCTION # 46 | #-----------------------------------------------------------------------------# 47 | 48 | def main(): 49 | args = check_argv() 50 | 51 | print("Reading:", args.npz_fn) 52 | embeddings = np.load(args.npz_fn) 53 | 54 | if args.n_samples is not None: 55 | utt_keys = list(embeddings) 56 | random.seed(1) 57 | random.shuffle(utt_keys) 58 | new_embeddings = {} 59 | for utt_key in utt_keys[:args.n_samples]: 60 | new_embeddings[utt_key] = embeddings[utt_key] 61 | embeddings = new_embeddings 62 | 63 | print("Ordering embeddings:") 64 | n_embeds = 0 65 | X = [] 66 | utt_keys = [] 67 | labels = [] 68 | speakers = [] 69 | for utt_key in tqdm(sorted(embeddings)): 70 | utt_keys.append(utt_key) 71 | X.append(embeddings[utt_key]) 72 | utt_key = utt_key.split("_") 73 | label = utt_key[0] 74 | speaker = utt_key[1] 75 | labels.append(label) 76 | speakers.append(speaker) 77 | X = np.array(X) 78 | print("No. 
embeddings:", X.shape[0]) 79 | print("Embedding dimensionality:", X.shape[1]) 80 | 81 | # Normalise 82 | normed = (X - X.mean(axis=0)) / X.std(axis=0) 83 | X = normed 84 | 85 | # Get a speaker colour map 86 | # cmap = plt.cm.jet 87 | cmap = plt.cm.viridis 88 | cmaplist = [cmap(i) for i in range(cmap.N)] 89 | speakers_set = set(speakers) 90 | n_speakers = len(speakers_set) 91 | speaker_to_color = {} 92 | for i_speaker, speaker in enumerate(sorted(list(speakers_set))): 93 | speaker_to_color[speaker] = cmaplist[ 94 | int(i_speaker/n_speakers * (len(cmaplist) - 1)) 95 | ] 96 | # speakers_to_id = dict( 97 | # zip(sorted(list(speakers_set)), range(len(speakers_set))) 98 | # ) 99 | # speakers_to_color = {} 100 | # for speaker in speakers_to_id: 101 | # speakers_to_color[speaker] = 102 | 103 | # Cluster 104 | print("Clustering") 105 | Z = linkage(X, method="ward", metric="euclidean") 106 | 107 | # Plot dendrogram 108 | print("Plotting") 109 | plt.figure() 110 | R = dendrogram( 111 | Z, 112 | leaf_rotation=90, 113 | leaf_font_size=8, 114 | labels=labels 115 | ) 116 | leaves = R["leaves"] 117 | 118 | ax = plt.gca() 119 | x_labels = ax.get_xmajorticklabels() 120 | for i, x in enumerate(x_labels): 121 | x.set_color(speaker_to_color[speakers[leaves[i]]]) 122 | # c = 123 | # print(x.get_text(), labels[leaves[i]], speakers[leaves[i]]) 124 | # x.set_color(colorDict[x.get_text()]) 125 | 126 | plt.show() 127 | 128 | 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | Multilingual Acoustic Word Embeddings on GlobalPhone 2 | ==================================================== 3 | 4 | Overview 5 | -------- 6 | Multilingual acoustic word embedding approaches are implemented and evaluated 7 | on the GlobalPhone corpus. The experiments are described in: 8 | 9 | - H. Kamper, Y. Matusevych, and S.J. Goldwater, "Multilingual acoustic word 10 | embedding models for processing zero-resource languages," in *Proc. ICASSP*, 11 | 2020. [[arXiv](https://arxiv.org/abs/2002.02109)] 12 | - H. Kamper, Y. Matusevych, and S. J. Goldwater, "Improved acoustic word 13 | embeddings for zero-resource languages using multilingual transfer," 14 | *arXiv preprint arXiv:2006.02295*, 2020. 15 | [[arXiv](https://arxiv.org/abs/2006.02295)] 16 | 17 | Please cite these papers if you use the code. 18 | 19 | 20 | Disclaimer 21 | ---------- 22 | The code provided here is not pretty. But I believe that research should be 23 | reproducible. I provide no guarantees with the code, but please let me know if 24 | you have any problems, find bugs or have general comments. 25 | 26 | 27 | Download datasets 28 | ----------------- 29 | The [GlobalPhone](https://csl.anthropomatik.kit.edu/english/globalphone.php) 30 | corpus and forced alignments of the data needs to be obtained. GlobalPhone 31 | needs to be paid for. If you have proof of payment, we can give you access to 32 | the forced alignments. Save the data and forced alignments in a separate 33 | directory and update the `paths.py` file to point to the data directories. 
34 | 35 | 36 | Install dependencies 37 | -------------------- 38 | You will require the following: 39 | 40 | - [Python 3](https://www.python.org/downloads/) 41 | - [TensorFlow 1.13.1](https://www.tensorflow.org/) 42 | - [LibROSA](http://librosa.github.io/librosa/) 43 | - [Cython](https://cython.org/) 44 | - [tqdm](https://tqdm.github.io/) 45 | - [speech_dtw](https://github.com/kamperh/speech_dtw/) 46 | - [shorten](http://etree.org/shnutils/shorten/dist/src/shorten-3.6.1.tar.gz) 47 | 48 | To install `speech_dtw` (required for same-different evaluation) and `shorten` 49 | (required for processing audio), run `./install_local.sh`. 50 | 51 | You can install all the other dependencies in a conda environment by running: 52 | 53 | conda env create -f environment.yml 54 | conda activate tf1.13 55 | 56 | 57 | Extract speech features 58 | ----------------------- 59 | Update the paths in `paths.py` to point to the data directories. Extract MFCC 60 | features in the `features/` directory as follows: 61 | 62 | cd features 63 | ./extract_features.py SP 64 | 65 | You need to run `extract_features.py` for all languages; run it without any 66 | arguments to see all 16 language codes. 67 | 68 | UTD pairs can also be analysed here, by running e.g.: 69 | 70 | ./analyse_utd_pairs.py SP 71 | 72 | 73 | Evaluate frame-level features using the same-different task 74 | ----------------------------------------------------------- 75 | This is optional. To perform frame-level same-different evaluation based on 76 | dynamic time warping (DTW), follow [samediff/readme.md](samediff/readme.md). 77 | 78 | 79 | Obtain downsampled acoustic word embeddings 80 | ------------------------------------------- 81 | Extract and evaluate downsampled acoustic word embeddings by running the steps 82 | in [downsample/readme.md](downsample/readme.md). 83 | 84 | 85 | Train neural acoustic word embeddings 86 | ------------------------------------- 87 | Train and evaluate neural network acoustic word embedding models by running the 88 | steps in [embeddings/readme.md](embeddings/readme.md). 89 | 90 | 91 | Analyse embedding models 92 | ------------------------ 93 | Analyse different properties/aspects of the acoustic word embedding models by 94 | running the steps in [blackbox/readme.md](blackbox/readme.md). 95 | 96 | 97 | Query-by-example search 98 | ----------------------- 99 | Perform query-by-example search experiments by running the steps in 100 | [qbe/readme.md](qbe/readme.md). 101 | 102 | 103 | Unit tests 104 | ---------- 105 | In the root project directory, run `make test` to run unit tests. 106 | 107 | 108 | References 109 | ---------- 110 | - https://github.com/eginhard/cae-utd-utils 111 | 112 | 113 | Contributors 114 | ------------ 115 | - [Herman Kamper](http://www.kamperh.com/) 116 | - [Yevgen Matusevych](https://homepages.inf.ed.ac.uk/ymatusev/) 117 | - [Sharon Goldwater](https://homepages.inf.ed.ac.uk/sgwater/) 118 | 119 | 120 | License 121 | ------- 122 | The code is distributed under the Creative Commons Attribution-ShareAlike 123 | license ([CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)). 124 | -------------------------------------------------------------------------------- /embeddings/eval_samediff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Perform same-different evaluation of fixed-dimensional representations. 
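Embedding keys are assumed to have the format
label_speaker_utterance_interval. Average precision and precision-recall
breakeven are reported, together with their same-word different-speaker (SWDP)
variants.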
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2015, 2016, 2018, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from scipy.spatial.distance import pdist 14 | import argparse 15 | import numpy as np 16 | import sys 17 | 18 | sys.path.append(path.join("..", "..", "src", "speech_dtw", "utils")) 19 | 20 | import samediff 21 | 22 | 23 | #-----------------------------------------------------------------------------# 24 | # UTILITY FUNCTIONS # 25 | #-----------------------------------------------------------------------------# 26 | 27 | def check_argv(): 28 | """Check the command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description=__doc__.strip().split("\n")[0], add_help=False 31 | ) 32 | parser.add_argument("npz_fn", type=str, help="NumPy archive") 33 | parser.add_argument( 34 | "--metric", choices=["cosine", "euclidean", "hamming", "chebyshev", 35 | "kl"], default="cosine", help="distance metric (default: %(default)s)" 36 | ) 37 | parser.add_argument( 38 | "--mean_ap", dest="mean_ap", action="store_true", 39 | help="also compute mean average precision (this is significantly " 40 | "more resource intensive)" 41 | ) 42 | parser.add_argument( 43 | "--mvn", action="store_true", 44 | help="mean and variance normalise (default: False)" 45 | ) 46 | if len(sys.argv) == 1: 47 | parser.print_help() 48 | sys.exit(1) 49 | return parser.parse_args() 50 | 51 | 52 | #-----------------------------------------------------------------------------# 53 | # MAIN FUNCTION # 54 | #-----------------------------------------------------------------------------# 55 | 56 | def main(): 57 | args = check_argv() 58 | 59 | print(datetime.now()) 60 | 61 | print("Reading:", args.npz_fn) 62 | npz = np.load(args.npz_fn) 63 | 64 | print(datetime.now()) 65 | 66 | print("Ordering embeddings") 67 | n_embeds = 0 68 | X = [] 69 | ids = [] 70 | for label in sorted(npz): 71 | ids.append(label) 72 | X.append(npz[label]) 73 | n_embeds += 1 74 | X = np.array(X) 75 | print("No. embeddings:", n_embeds) 76 | print("Embedding dimensionality:", X.shape[1]) 77 | 78 | if args.mvn: 79 | normed = (X - X.mean(axis=0)) / X.std(axis=0) 80 | X = normed 81 | 82 | print(datetime.now()) 83 | 84 | print("Calculating distances") 85 | metric = args.metric 86 | if metric == "kl": 87 | import scipy.stats 88 | metric = scipy.stats.entropy 89 | distances = pdist(X, metric=metric) 90 | 91 | print(datetime.now()) 92 | 93 | print("Getting labels and speakers") 94 | labels = [] 95 | speakers = [] 96 | for utt_id in ids: 97 | utt_id = utt_id.split("_") 98 | word = utt_id[0] 99 | speaker = utt_id[1] 100 | labels.append(word) 101 | speakers.append(speaker) 102 | 103 | if args.mean_ap: 104 | print(datetime.now()) 105 | print("Calculating mean average precision") 106 | mean_ap, mean_prb, ap_dict = samediff.mean_average_precision( 107 | distances, labels 108 | ) 109 | print("Mean average precision:", mean_ap) 110 | print("Mean precision-recall breakeven:", mean_prb) 111 | 112 | print(datetime.now()) 113 | 114 | print("Calculating average precision") 115 | # matches = samediff.generate_matches_array(labels) # Temp 116 | word_matches = samediff.generate_matches_array(labels) 117 | speaker_matches = samediff.generate_matches_array(speakers) 118 | print("No. same-word pairs:", sum(word_matches)) 119 | print("No. 
same-speaker pairs:", sum(speaker_matches)) 120 | 121 | sw_ap, sw_prb, swdp_ap, swdp_prb = samediff.average_precision_swdp( 122 | distances[np.logical_and(word_matches, speaker_matches)], 123 | distances[np.logical_and(word_matches, speaker_matches == False)], 124 | distances[word_matches == False] 125 | ) 126 | print("-"*79) 127 | print("Average precision: {:.8f}".format(sw_ap)) 128 | print("Precision-recall breakeven: {:.8f}".format(sw_prb)) 129 | print("SWDP average precision: {:.8f}".format(swdp_ap)) 130 | print("SWDP precision-recall breakeven: {:.8f}".format(swdp_prb)) 131 | print("-"*79) 132 | 133 | print(datetime.now()) 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /qbe/data_prep_dense_seg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Prepare the data for dense segmental QbE search. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | import argparse 14 | import pickle 15 | import numpy as np 16 | import os 17 | import sys 18 | 19 | 20 | #-----------------------------------------------------------------------------# 21 | # UTILITY FUNCTIONS # 22 | #-----------------------------------------------------------------------------# 23 | 24 | def check_argv(): 25 | """Check the command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description=__doc__.strip().split("\n")[0], add_help=False 28 | ) 29 | parser.add_argument( 30 | "language", type=str, help="GlobalPhone language", 31 | choices=["HA"] 32 | ) 33 | parser.add_argument( 34 | "--min_frames", type=int, 35 | help="minimum number of frames (default: %(default)s)", default=20 36 | ) 37 | parser.add_argument( 38 | "--max_frames", type=int, 39 | help="maximum number of frames (default: %(default)s)", default=60 40 | ) 41 | parser.add_argument( 42 | "--step", type=int, 43 | help="frame step (default: %(default)s)", default=3 44 | ) 45 | parser.add_argument( 46 | "--n_splits", type=int, 47 | help="number of search collection splits (default: %(default)s)", 48 | default=2 49 | ) 50 | if len(sys.argv) == 1: 51 | parser.print_help() 52 | sys.exit(1) 53 | return parser.parse_args() 54 | 55 | 56 | #-----------------------------------------------------------------------------# 57 | # MAIN FUNCTION # 58 | #-----------------------------------------------------------------------------# 59 | 60 | def main(): 61 | args = check_argv() 62 | 63 | print(datetime.now()) 64 | 65 | output_dir = path.join("data", args.language) 66 | if not path.isdir(output_dir): 67 | os.makedirs(output_dir) 68 | segtag = "min_{}.max_{}.step_{}".format( 69 | args.min_frames, args.max_frames, args.step 70 | ) 71 | 72 | # Subset search collection 73 | search_dict_fn = path.join("data", args.language, "search.npz") 74 | print("Reading:", search_dict_fn) 75 | search_dict = np.load(search_dict_fn) 76 | search_keys = sorted(search_dict.keys()) 77 | print("No. search utterances:", len(search_keys)) 78 | 79 | # Dense search segments list 80 | seglist_fn = path.join( 81 | output_dir, "search.seglist." 
+ segtag + ".pkl" 82 | ) 83 | if not path.isfile(seglist_fn): 84 | print("Getting segmentation lists") 85 | seglist_dict = {} 86 | n_intervals = 0 87 | for utt_key in search_keys: 88 | seglist = [] 89 | length = search_dict[utt_key].shape[0] 90 | i_start = 0 91 | while i_start < length: 92 | i_end = i_start + args.min_frames 93 | while i_end <= length and i_end - i_start <= args.max_frames: 94 | seglist.append((i_start, i_end)) 95 | i_end += args.step 96 | n_intervals += 1 97 | i_start += args.step 98 | seglist_dict[utt_key] = seglist 99 | print("No. segmentation intervals:", n_intervals) 100 | print("Writing:", seglist_fn) 101 | with open(seglist_fn, "wb") as f: 102 | pickle.dump(seglist_dict, f, -1) 103 | else: 104 | print("Using existing file:", seglist_fn) 105 | 106 | # Split the search collection 107 | split_dict_fn = path.join( 108 | "data", args.language, "search." + str(args.n_splits - 1) + ".npz" 109 | ) 110 | if not path.isfile(split_dict_fn): 111 | n_items = int(np.ceil(np.float(len(search_keys)) / args.n_splits)) 112 | n_total = 0 113 | for i_split in range(args.n_splits): 114 | split_search_keys = search_keys[i_split*n_items:(i_split + 1)*n_items] 115 | split_dict = {} 116 | for utt_key in split_search_keys: 117 | split_dict[utt_key] = search_dict[utt_key] 118 | split_dict_fn = path.join( 119 | "data", args.language, "search." + str(i_split) + ".npz" 120 | ) 121 | print("Writing:", split_dict_fn) 122 | np.savez(split_dict_fn, **split_dict) 123 | n_total += len(split_dict) 124 | print( 125 | "Wrote {} out of {} utterances".format(len(search_dict.keys()), 126 | n_total) 127 | ) 128 | else: 129 | print("Using existing splits:", split_dict_fn) 130 | 131 | print(datetime.now()) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /blackbox/logreg_pronlength.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Use logistic regression for classifying the number of phones in a word. 
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.metrics import accuracy_score, classification_report 15 | from tqdm import tqdm 16 | import argparse 17 | import numpy as np 18 | import sys 19 | 20 | from analyse_pairs import read_pronunciations 21 | 22 | 23 | #-----------------------------------------------------------------------------# 24 | # UTILITY FUNCTIONS # 25 | #-----------------------------------------------------------------------------# 26 | 27 | def check_argv(): 28 | """Check the command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description=__doc__.strip().split("\n")[0], add_help=False 31 | ) 32 | parser.add_argument("npz_fn", type=str, help="NumPy archive of embeddings") 33 | parser.add_argument( 34 | "language", type=str, help="the pronunciations for this GlobalPhone " 35 | "language is used", choices=["BG", "CH", "CR", "CZ", "FR", "GE", "HA", 36 | "KO", "PL", "PO", "RU", "SP", "SW", "TH", "TU", "VN"] 37 | ) 38 | if len(sys.argv) == 1: 39 | parser.print_help() 40 | sys.exit(1) 41 | return parser.parse_args() 42 | 43 | 44 | #-----------------------------------------------------------------------------# 45 | # MAIN FUNCTION # 46 | #-----------------------------------------------------------------------------# 47 | 48 | def main(): 49 | args = check_argv() 50 | 51 | print("Reading:", args.npz_fn) 52 | embeddings = np.load(args.npz_fn) 53 | 54 | assert False, "need to test on different data, since here we have speaker" 55 | " overlap (read in additional test file)" 56 | 57 | # # Temp 58 | # import random 59 | # data = {} 60 | # a = list(embeddings) 61 | # random.shuffle(a) 62 | # for key in a[:100]: 63 | # data[key] = embeddings[key] 64 | # embeddings = data 65 | 66 | print("Ordering embeddings:") 67 | n_embeds = 0 68 | X = [] 69 | utt_keys = [] 70 | words = [] 71 | speakers = [] 72 | for utt_key in tqdm(sorted(embeddings)): 73 | utt_keys.append(utt_key) 74 | X.append(embeddings[utt_key]) 75 | utt_key = utt_key.split("_") 76 | word = utt_key[0] 77 | speaker = utt_key[1] 78 | words.append(word) 79 | speakers.append(speaker) 80 | X = np.array(X) 81 | print("No. 
embeddings:", X.shape[0]) 82 | print("Embedding dimensionality:", X.shape[1]) 83 | 84 | # Pronunciations 85 | pron_fn = path.join("lists", args.language, "dev.prons") 86 | print("Reading:", pron_fn) 87 | pronunciations = read_pronunciations(pron_fn) 88 | pron_labels = [] 89 | pron_lengths = [] 90 | for utt_key in utt_keys: 91 | pron_labels.append(pronunciations[utt_key]) 92 | pron_lengths.append(len(pronunciations[utt_key])) 93 | print("Minimum length:", min(pron_lengths)) 94 | print("Maximum length:", max(pron_lengths)) 95 | 96 | # Convert words to IDs 97 | length_set = set(pron_lengths) 98 | length_to_id = dict( 99 | zip(sorted(list(length_set)), range(len(length_set))) 100 | ) 101 | id_to_length = dict([[v,k] for k, v in length_to_id.items()]) 102 | y = [] 103 | for length in pron_lengths: 104 | y.append(length_to_id[length]) 105 | y = np.array(y, dtype=int) 106 | 107 | # Split training and test sets 80/20 108 | indices = np.arange(X.shape[0]) 109 | np.random.seed(2) 110 | np.random.shuffle(indices) 111 | n_train = int(round(X.shape[0]*0.8)) 112 | X_train = X[indices[:n_train]] 113 | X_test = X[indices[n_train:]] 114 | y_train = y[indices[:n_train]] 115 | y_test = y[indices[n_train:]] 116 | print("Training data shape:", X_train.shape) 117 | print("Test data shape:", X_test.shape) 118 | print(id_to_length, max(y_test), min(y_test)) 119 | 120 | # Multi-class logistic regression 121 | print(datetime.now()) 122 | print("Fitting multi-class logistic regression model") 123 | logreg = LogisticRegression( 124 | C=1e5, solver="lbfgs", multi_class="multinomial" 125 | # solver="lbfgs", multi_class="ovr", max_iter=200 126 | ) 127 | logreg.fit(X_train, y_train) 128 | print(datetime.now()) 129 | 130 | # Predict classes 131 | y_pred = logreg.predict(X_test) 132 | accuracy = accuracy_score(y_test, y_pred) 133 | 134 | print("Length classification accuracy: {:.2f}%".format(accuracy*100)) 135 | print( 136 | classification_report(y_test, y_pred, 137 | target_names=[str(id_to_length[i]) + " phone" for i in range(max(y) + 138 | 1)]) 139 | ) 140 | 141 | 142 | if __name__ == "__main__": 143 | main() 144 | -------------------------------------------------------------------------------- /qbe/get_dense_seg_costs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Calculate costs for dense search. 
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from scipy.spatial.distance import cdist 14 | from tqdm import tqdm 15 | import argparse 16 | import pickle 17 | import numpy as np 18 | import os 19 | import sys 20 | import timeit 21 | 22 | 23 | #-----------------------------------------------------------------------------# 24 | # UTILITY FUNCTIONS # 25 | #-----------------------------------------------------------------------------# 26 | 27 | def check_argv(): 28 | """Check the command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description=__doc__.strip().split("\n")[0], add_help=False 31 | ) 32 | parser.add_argument("eval_dir", type=str, help="evaluation directory") 33 | parser.add_argument( 34 | "--metric", 35 | choices=["cosine", "euclidean", "hamming", "chebyshev", 36 | "symsumxentropy"], 37 | default="cosine", help="distance metric (default: %(default)s)" 38 | ) 39 | if len(sys.argv) == 1: 40 | parser.print_help() 41 | sys.exit(1) 42 | return parser.parse_args() 43 | 44 | 45 | def sweep_min(query_vec, search_array, metric): 46 | """ 47 | Return the minimum cost between `query_vec` and rows of `search_array`. 48 | """ 49 | if metric == "symsumxentropy": 50 | return np.min( 51 | cdist_sumxentropy(np.array([query_vec]), search_array, True) 52 | ) 53 | else: 54 | return np.min(cdist(np.array([query_vec]), search_array, metric)) 55 | 56 | 57 | 58 | def cdist_sumxentropy(queries_array, search_array, symmetric=False): 59 | distances = np.zeros((queries_array.shape[0], search_array.shape[0])) 60 | for i_query, query_vec in enumerate(queries_array): 61 | for i_search, search_vec in enumerate(search_array): 62 | if symmetric: 63 | distances[i_query, i_search] = sumxentroy( 64 | search_vec, query_vec 65 | ) + sumxentroy(query_vec, search_vec) 66 | else: 67 | distances[i_query, i_search] = sumxentroy( 68 | search_vec, query_vec 69 | ) 70 | return distances 71 | 72 | 73 | #-----------------------------------------------------------------------------# 74 | # MAIN FUNCTION # 75 | #-----------------------------------------------------------------------------# 76 | 77 | def main(): 78 | args = check_argv() 79 | 80 | print(datetime.now()) 81 | 82 | # Read queries 83 | fn = path.join(args.eval_dir, "queries.npz") 84 | if not path.isfile(fn): 85 | import re 86 | fn = path.join( 87 | re.sub("min\_.*step\_\d*\.", "", args.eval_dir), "queries.npz" 88 | ) 89 | print("Reading:", fn) 90 | queries_dict = np.load(fn) 91 | queries_keys = sorted(list(queries_dict)) 92 | queries_list = [queries_dict[i] for i in queries_keys] 93 | print("No. queries:", len(queries_list)) 94 | print("Query array shape:", queries_dict[list(queries_dict)[0]].shape) 95 | 96 | # Read search collection 97 | fn = path.join(args.eval_dir, "search.npz") 98 | print("Reading:", fn) 99 | search_dict = np.load(fn) 100 | search_keys = sorted(list(search_dict)) 101 | search_list = [search_dict[i] for i in search_keys] 102 | print("No. 
search utterances:", len(search_list)) 103 | print("Search array shape:", search_dict[list(search_dict)[0]].shape) 104 | 105 | # print(datetime.now()) 106 | 107 | print("Calculating costs:") 108 | start_time = timeit.default_timer() 109 | costs = [] 110 | for query_vec in tqdm(queries_list): 111 | for search_array in search_list: 112 | costs.append(sweep_min( 113 | query_vec, search_array, args.metric 114 | )) 115 | end_time = timeit.default_timer() 116 | n_search = len(search_list) 117 | costs = [ 118 | costs[i*n_search:(i + 1)*n_search] for i in 119 | range(int(np.floor(len(costs)/n_search))) 120 | ] 121 | duration = end_time - start_time 122 | print( 123 | "Avg. duration per comparison: {:.8f} sec".format(duration / 124 | (len(queries_list) * len(search_list))) 125 | ) 126 | 127 | # Write costs 128 | fn = path.join(args.eval_dir, "cost_dict." + args.metric + ".pkl") 129 | print("Writing:", fn) 130 | cost_dict = {} 131 | for i_query, key_query in enumerate(queries_keys): 132 | if key_query not in cost_dict: 133 | cost_dict[key_query] = {} 134 | for i_search, key_search in enumerate(search_keys): 135 | cost_dict[key_query][key_search] = costs[i_query][i_search] 136 | # print(datetime.now()) 137 | with open(fn, "wb") as f: 138 | pickle.dump(cost_dict, f, -1) 139 | 140 | print(datetime.now()) 141 | 142 | 143 | if __name__ == "__main__": 144 | main() 145 | -------------------------------------------------------------------------------- /data/train_spk.list: -------------------------------------------------------------------------------- 1 | BG 018 020 021 023 025 026 027 032 035 039 041 042 043 045 046 047 048 049 050 052 053 054 056 060 062 064 065 066 067 069 070 071 072 073 075 077 078 079 080 082 083 085 087 088 089 091 092 093 094 096 097 098 099 101 102 103 104 105 107 111 112 113 114 2 | CH 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 033 034 035 036 037 038 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 3 | CR 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 022 023 024 025 026 027 028 029 030 031 032 049 050 052 055 056 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 092 093 094 4 | CZ 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 5 | FR 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 090 099 100 6 | GE 005 006 007 009 011 012 013 014 015 016 017 019 022 023 024 025 027 028 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 074 075 076 077 7 | HA 001 003 004 005 006 007 008 009 010 
011 012 013 015 016 017 019 020 021 022 023 024 026 027 029 032 033 035 036 037 039 040 041 042 043 044 045 048 049 051 054 056 057 059 060 061 063 064 065 066 067 068 069 071 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 089 090 091 092 093 094 095 096 097 098 099 100 101 102 103 8 | KO 001 002 003 004 005 007 008 009 010 011 013 014 015 016 017 018 020 021 022 023 024 026 027 028 030 031 033 034 035 036 037 038 039 041 043 044 046 047 048 049 050 052 053 054 055 056 057 058 059 060 062 063 065 066 067 068 070 071 072 073 074 075 076 077 078 079 081 083 085 087 089 090 092 093 094 095 096 097 099 100 9 | PL 002 003 006 007 008 010 013 014 015 016 017 018 019 020 021 022 024 025 026 028 029 032 034 035 036 037 038 039 042 045 047 048 049 051 052 053 054 055 056 057 058 059 060 061 062 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 091 092 093 094 095 096 099 100 10 | PO 001 002 003 004 005 006 007 008 009 010 011 013 014 015 016 017 018 019 021 022 023 024 025 026 029 030 031 033 034 036 037 042 043 044 045 047 048 049 051 053 054 055 056 058 059 060 068 069 070 071 101 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 125 126 127 128 129 130 131 136 140 141 144 145 146 147 148 149 150 212 11 | RU 001 003 004 006 007 008 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 028 029 030 031 032 034 035 037 038 039 040 041 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 064 066 067 068 070 071 072 073 074 075 076 077 079 080 081 082 083 084 085 086 087 088 089 090 091 093 094 095 096 098 099 100 101 105 114 115 116 117 119 120 121 123 12 | SP 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 13 | SW 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 050 051 052 053 054 055 056 057 058 059 070 071 072 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 14 | TH 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 024 026 027 029 030 031 032 033 034 035 036 038 039 040 041 042 043 044 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 062 063 064 065 066 067 068 069 070 071 072 074 075 076 077 078 079 080 081 082 083 084 086 087 088 089 090 15 | TU 004 007 009 010 011 012 017 018 020 021 022 023 024 026 027 028 029 033 034 035 036 038 040 042 043 044 045 047 048 049 050 051 052 053 054 055 057 058 059 060 061 062 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 16 | VN 001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 019 020 021 022 023 024 025 026 027 028 029 030 031 033 034 035 036 037 038 039 040 041 042 043 044 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 070 071 073 074 075 076 077 080 081 082 083 084 085 086 087 088 089 090 091 093 095 097 099 100 101 104 105 108 109 111 112 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 17 | -------------------------------------------------------------------------------- 
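Each line in the speaker lists above gives a language code followed by the
speaker IDs assigned to that split. A minimal sketch of how such a list could
be parsed (the `read_speaker_list` helper below is illustrative only and not
part of the repository):

    def read_speaker_list(list_fn):
        """Return a dict mapping each language code to its list of speaker IDs."""
        speakers = {}
        with open(list_fn) as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue
                speakers[parts[0]] = parts[1:]
        return speakers

    train_speakers = read_speaker_list("data/train_spk.list")
    print("No. Spanish training speakers:", len(train_speakers["SP"]))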
/embeddings/link_mfcc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Create links to the MFCC files. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from os import path 12 | import numpy as np 13 | import os 14 | 15 | import argparse 16 | import sys 17 | 18 | relative_features_dir = path.join("..", "..", "..", "features") 19 | sixteen_languages = [ 20 | "BG", "CH", "CR", "CZ", "FR", "GE", "HA", "KO", "PL", "PO", "RU", "SP", 21 | "SW", "TH", "TU", "VN" 22 | ] 23 | 24 | #-----------------------------------------------------------------------------# 25 | # UTILITY FUNCTIONS # 26 | #-----------------------------------------------------------------------------# 27 | 28 | def check_argv(): 29 | """Check the command line arguments.""" 30 | parser = argparse.ArgumentParser( 31 | description=__doc__.strip().split("\n")[0], add_help=False 32 | ) 33 | parser.add_argument( 34 | "language", type=str, help="GlobalPhone language", 35 | choices=sixteen_languages + ["all"] 36 | ) 37 | if len(sys.argv) == 1: 38 | parser.print_help() 39 | sys.exit(1) 40 | return parser.parse_args() 41 | 42 | 43 | def link_features(npz_fn, link_fn, link_dir): 44 | assert ( 45 | path.isfile(path.join(link_dir, npz_fn)) 46 | ), "missing file: {}".format(path.join(link_dir, npz_fn)) 47 | if not path.isfile(link_fn): 48 | print("Linking:", npz_fn, "to", link_fn) 49 | os.symlink(npz_fn, link_fn) 50 | else: 51 | print("Using existing link:", link_fn) 52 | 53 | 54 | #-----------------------------------------------------------------------------# 55 | # MAIN FUNCTION # 56 | #-----------------------------------------------------------------------------# 57 | 58 | def main(): 59 | args = check_argv() 60 | 61 | if args.language == "all": 62 | languages = sixteen_languages 63 | else: 64 | languages = [args.language] 65 | 66 | for language in languages: 67 | 68 | print("Linking features for", language) 69 | 70 | # Create link directory 71 | link_dir = path.join("data", language) 72 | if not path.isdir(link_dir): 73 | os.makedirs(link_dir) 74 | 75 | # Training: All features 76 | npz_fn = path.join( 77 | relative_features_dir, "mfcc", language, language.lower() + 78 | ".train.npz" 79 | ) 80 | link_fn = path.join(link_dir, "train.all.npz") 81 | link_features(npz_fn, link_fn, link_dir) 82 | 83 | # Training: Ground truth words 84 | npz_fn = path.join( 85 | relative_features_dir, "mfcc", language, language.lower() + 86 | ".train.gt_words.npz" 87 | ) 88 | link_fn = path.join(link_dir, "train.gt.npz") 89 | link_features(npz_fn, link_fn, link_dir) 90 | 91 | # Training: UTD words 92 | npz_fn = path.join( 93 | relative_features_dir, "mfcc", language, language.lower() + 94 | ".train.utd_terms.npz" 95 | ) 96 | if path.isfile(path.join(link_dir, npz_fn)): 97 | # Not all languages have UTD output 98 | link_fn = path.join(link_dir, "train.utd.npz") 99 | link_features(npz_fn, link_fn, link_dir) 100 | 101 | # Training: UTD words with fixed labels 102 | npz_fn = path.join( 103 | relative_features_dir, "mfcc", language, language.lower() + 104 | ".train.utd_terms.fixed_labels.npz" 105 | ) 106 | if path.isfile(path.join(link_dir, npz_fn)): 107 | # Not all languages have UTD output 108 | link_fn = path.join(link_dir, "train.utd.fixed_labels.npz") 109 | link_features(npz_fn, link_fn, link_dir) 110 | 111 | # Training: UTD words with fixed segment intervals 112 | npz_fn = path.join( 113 | relative_features_dir, "mfcc", language, 
language.lower() + 114 | ".train.utd_terms.fixed_segs.npz" 115 | ) 116 | if path.isfile(path.join(link_dir, npz_fn)): 117 | # Not all languages have UTD output 118 | link_fn = path.join(link_dir, "train.utd.fixed_segs.npz") 119 | link_features(npz_fn, link_fn, link_dir) 120 | 121 | # Training: UTD words with fixed labels and segment intervals 122 | npz_fn = path.join( 123 | relative_features_dir, "mfcc", language, language.lower() + 124 | ".train.utd_terms.fixed_labels_segs.npz" 125 | ) 126 | if path.isfile(path.join(link_dir, npz_fn)): 127 | # Not all languages have UTD output 128 | link_fn = path.join(link_dir, "train.utd.fixed_labels_segs.npz") 129 | link_features(npz_fn, link_fn, link_dir) 130 | 131 | # Validation: Ground truth words 132 | npz_fn = path.join( 133 | relative_features_dir, "mfcc", language, language.lower() + 134 | ".dev.gt_words.npz" 135 | ) 136 | link_fn = path.join(link_dir, "val.npz") 137 | link_features(npz_fn, link_fn, link_dir) 138 | 139 | # Testing: Ground truth words 140 | npz_fn = path.join( 141 | relative_features_dir, "mfcc", language, language.lower() + 142 | ".eval.gt_words.npz" 143 | ) 144 | link_fn = path.join(link_dir, "test.npz") 145 | link_features(npz_fn, link_fn, link_dir) 146 | 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /notebooks/sandbox_splitnet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sandbox" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Herman Kamper, Stellenbosch University, 2019." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Preliminaries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "The autoreload extension is already loaded. 
To reload it, use:\n", 34 | " %reload_ext autoreload\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "%load_ext autoreload\n", 41 | "%autoreload 2\n", 42 | "\n", 43 | "from os import path\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "import numpy as np\n", 46 | "import os\n", 47 | "import sys\n", 48 | "import tensorflow as tf\n", 49 | "\n", 50 | "sys.path.append(path.join(\"..\", \"src\"))\n", 51 | "import tflego\n", 52 | "\n", 53 | "from tflego import NP_DTYPE, TF_DTYPE, NP_ITYPE, TF_ITYPE" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "import warnings\n", 63 | "warnings.filterwarnings(\"ignore\")\n", 64 | "os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"2\"\n", 65 | "tf.logging.set_verbosity(tf.logging.ERROR)\n", 66 | "if type(tf.contrib) != type(tf):\n", 67 | " tf.contrib._warning = None" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Split network based on condition" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "tf.reset_default_graph()\n", 84 | "\n", 85 | "# Random seed\n", 86 | "np.random.seed(1)\n", 87 | "tf.set_random_seed(1)\n", 88 | "\n", 89 | "# Parameters\n", 90 | "n_data = 7\n", 91 | "d_in = 5\n", 92 | "n_languages = 3\n", 93 | "n_classes = 2\n", 94 | "test_data = np.asarray(np.random.randn(n_data, d_in), dtype=NP_DTYPE)\n", 95 | "test_language = np.asarray([0, 0, 0, 1, 1, 2, 2], dtype=NP_ITYPE) # want to split accordingly\n", 96 | "test_class = np.asarray([0, 1, 0, 1, 0, 0, 1], dtype=NP_ITYPE) # output class\n", 97 | "\n", 98 | "# Model\n", 99 | "x = tf.placeholder(TF_DTYPE, [None, d_in])\n", 100 | "language = tf.placeholder(TF_ITYPE, [None])\n", 101 | "y = tf.placeholder(TF_ITYPE, [None])\n", 102 | "ff = tflego.build_feedforward(\n", 103 | " x, [10, 9]\n", 104 | " )\n", 105 | "split_networks = []\n", 106 | "for i_lang in range(n_languages):\n", 107 | " with tf.variable_scope(\"split_{}\".format(i_lang)):\n", 108 | " split_network = tflego.build_feedforward(\n", 109 | " ff, [6, n_classes]\n", 110 | " )\n", 111 | " if i_lang == 0:\n", 112 | " split_network *= 0\n", 113 | " elif i_lang == 1:\n", 114 | " split_network *= np.inf\n", 115 | " elif i_lang == 2:\n", 116 | " pass\n", 117 | " split_networks.append(split_network)\n", 118 | " \n", 119 | "output = tf.where(tf.equal(language, 0), split_networks[0], \n", 120 | " tf.where(tf.equal(language, 1), split_networks[1], split_networks[2])\n", 121 | " )\n", 122 | "\n", 123 | "# output = tf.where(tf.equal(language, 0), (language + 1)*55, language*0)\n", 124 | "# output = tf.where(tf.equal(language, 0), 55, \n", 125 | "# tf.where(tf.equal(language, 1), 66,\n", 126 | "# tf.where(tf.equal(language, 2), 9, -1\n", 127 | "# )))\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "init = tf.global_variables_initializer()\n", 137 | "with tf.Session() as session:\n", 138 | " session.run(init)\n", 139 | " np_output = output.eval({x: test_data, language: test_language})" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 6, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "array([[-0. , -0. ],\n", 151 | " [-0. , -0. ],\n", 152 | " [-0. , -0. 
],\n", 153 | " [ inf, -inf],\n", 154 | " [ inf, -inf],\n", 155 | " [ 1.7773362 , -0.6447714 ],\n", 156 | " [ 1.8913155 , -0.20415437]], dtype=float32)" 157 | ] 158 | }, 159 | "execution_count": 6, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "np_output" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.5.2" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 2 197 | } 198 | -------------------------------------------------------------------------------- /embeddings/apply_model_to_npz.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Encode the given NumPy archive using the specified model. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2018, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | import argparse 14 | import pickle 15 | import numpy as np 16 | import os 17 | import sys 18 | import tensorflow as tf 19 | 20 | sys.path.append(path.join("..", "src")) 21 | 22 | from apply_model import build_model 23 | from link_mfcc import sixteen_languages 24 | from tflego import NP_DTYPE, TF_DTYPE, NP_ITYPE, TF_ITYPE 25 | import batching 26 | import data_io 27 | 28 | 29 | #-----------------------------------------------------------------------------# 30 | # APPLY MODEL FUNCTIONS # 31 | #-----------------------------------------------------------------------------# 32 | 33 | 34 | def apply_model(model_fn, npz_fn): 35 | 36 | # Load the model options 37 | model_dir = path.split(model_fn)[0] 38 | options_dict_fn = path.join(model_dir, "options_dict.pkl") 39 | print("Reading:", options_dict_fn) 40 | with open(options_dict_fn, "rb") as f: 41 | options_dict = pickle.load(f) 42 | 43 | # Load data 44 | x_data, labels, lengths, keys, speakers = data_io.load_data_from_npz( 45 | npz_fn 46 | ) 47 | 48 | if "cnn" in options_dict["script"]: 49 | 50 | # Pad and flatten data 51 | x_data, _ = data_io.pad_sequences( 52 | x_data, options_dict["max_length"], True 53 | ) 54 | x_data = np.transpose(x_data, (0, 2, 1)) 55 | x_data = x_data.reshape((-1, options_dict["d_in"])) 56 | 57 | # Build model 58 | x = tf.placeholder(TF_DTYPE, [None, options_dict["d_in"]]) 59 | model = build_model(x, None, options_dict) 60 | 61 | # Embed data 62 | batch_iterator = batching.LabelledIterator( 63 | x_data, None, x_data.shape[0], False 64 | ) 65 | saver = tf.train.Saver() 66 | with tf.Session() as session: 67 | saver.restore(session, model_fn) 68 | for batch_x in batch_iterator: 69 | np_z = session.run( 70 | [model["encoding"]], feed_dict={x: batch_x})[0] 71 | break # single batch 72 | 73 | else: # rnn 74 | 75 | # Truncate and limit dimensionality 76 | data_io.trunc_and_limit_dim( 77 | x_data, lengths, options_dict["n_input"], 78 | options_dict["max_length"] 79 | ) 80 | 81 | # Build model 82 | x = tf.placeholder(TF_DTYPE, [None, None, options_dict["n_input"]]) 83 | x_lengths = tf.placeholder(TF_ITYPE, [None]) 84 | 
model = build_model(x, x_lengths, options_dict) 85 | 86 | # Embed data 87 | batch_iterator = batching.SimpleIterator(x_data, len(x_data), False) 88 | saver = tf.train.Saver() 89 | with tf.Session() as session: 90 | saver.restore(session, model_fn) 91 | for batch_x_padded, batch_x_lengths in batch_iterator: 92 | np_x = batch_x_padded 93 | np_x_lengths = batch_x_lengths 94 | np_z = session.run( 95 | [model["encoding"]], feed_dict={x: np_x, x_lengths: 96 | np_x_lengths} 97 | )[0] 98 | break # single batch 99 | 100 | embed_dict = {} 101 | for i, utt_key in enumerate([keys[i] for i in batch_iterator.indices]): 102 | embed_dict[utt_key] = np_z[i] 103 | 104 | return embed_dict 105 | 106 | 107 | #-----------------------------------------------------------------------------# 108 | # UTILITY FUNCTIONS # 109 | #-----------------------------------------------------------------------------# 110 | 111 | def check_argv(): 112 | """Check the command line arguments.""" 113 | parser = argparse.ArgumentParser( 114 | description=__doc__.strip().split("\n")[0], add_help=False 115 | ) 116 | parser.add_argument("model_fn", type=str, help="model checkpoint filename") 117 | parser.add_argument("npz_fn", type=str, help="the NumPy archive to encode") 118 | parser.add_argument( 119 | "--output_npz_fn", type=str, 120 | help="if provided, the output is written to this NumPy archive " 121 | "instead of the model directory" 122 | ) 123 | if len(sys.argv) == 1: 124 | parser.print_help() 125 | sys.exit(1) 126 | return parser.parse_args() 127 | 128 | 129 | #-----------------------------------------------------------------------------# 130 | # MAIN FUNCTION # 131 | #-----------------------------------------------------------------------------# 132 | 133 | def main(): 134 | args = check_argv() 135 | 136 | # Do not output TensorFlow info and warning messages 137 | import warnings 138 | warnings.filterwarnings("ignore") 139 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" 140 | tf.logging.set_verbosity(tf.logging.ERROR) 141 | if type(tf.contrib) != type(tf): 142 | tf.contrib._warning = None 143 | 144 | # Embed data 145 | embed_dict = apply_model(args.model_fn, args.npz_fn) 146 | 147 | # Save embeddings 148 | model_dir, model_fn = path.split(args.model_fn) 149 | if args.output_npz_fn is None: 150 | npz_fn = path.join( 151 | model_dir, path.splitext(model_fn)[0] + "." + 152 | path.split(args.npz_fn)[-1] 153 | ) 154 | else: 155 | npz_fn = args.output_npz_fn 156 | print("Writing:", npz_fn) 157 | np.savez_compressed(npz_fn, **embed_dict) 158 | print(datetime.now()) 159 | 160 | 161 | if __name__ == "__main__": 162 | main() 163 | -------------------------------------------------------------------------------- /features/analyse_utd_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Analyse UTD pairs for the indicated language. 
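UTD terms and pairs are read from lists/<language>/train.utd_terms.list and
lists/<language>/train.utd_pairs.list, each UTD term is matched against the
forced alignments, and the percentage of pairs whose terms overlap with the
same ground truth word type is reported.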
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2020 9 | """ 10 | 11 | from os import path 12 | from tqdm import tqdm 13 | import argparse 14 | import codecs 15 | import glob 16 | import numpy as np 17 | import os 18 | import shutil 19 | import sys 20 | 21 | sys.path.append("..") 22 | 23 | from extract_features import get_overlap 24 | from paths import gp_data_dir, gp_alignments_dir 25 | 26 | 27 | #-----------------------------------------------------------------------------# 28 | # UTILITY FUNCTIONS # 29 | #-----------------------------------------------------------------------------# 30 | 31 | def check_argv(): 32 | """Check the command line arguments.""" 33 | parser = argparse.ArgumentParser( 34 | description=__doc__.strip().split("\n")[0], add_help=False 35 | ) 36 | parser.add_argument( 37 | "language", type=str, help="GlobalPhone language", 38 | choices=["BG", "CH", "CR", "CZ", "FR", "GE", "HA", "KO", "PL", "PO", 39 | "RU", "SP", "SW", "TH", "TU", "VN"] 40 | ) 41 | if len(sys.argv) == 1: 42 | parser.print_help() 43 | sys.exit(1) 44 | return parser.parse_args() 45 | 46 | 47 | #-----------------------------------------------------------------------------# 48 | # MAIN FUNCTION # 49 | #-----------------------------------------------------------------------------# 50 | 51 | def main(): 52 | args = check_argv() 53 | subset = "train" 54 | 55 | # Read UTD terms 56 | utd_list_fn = path.join("lists", args.language, "train.utd_terms.list") 57 | print("Reading:", utd_list_fn) 58 | # overlap_dict[speaker_utt][(start, end)] is list a tuples of 59 | # (label, (start, end), overlap, cluster_label) 60 | overlap_dict = {} 61 | with codecs.open(utd_list_fn, "r", "utf-8") as utd_list_f: 62 | for line in utd_list_f: 63 | term, speaker, utt, start_end = line.strip().split("_") 64 | start, end = start_end.split("-") 65 | start = int(start) 66 | end = int(end) 67 | if not speaker + "_" + utt in overlap_dict: 68 | overlap_dict[speaker + "_" + utt] = {} 69 | overlap_dict[speaker + "_" + utt][(start, end, term)] = [] 70 | 71 | # Read forced alignments 72 | fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm") 73 | print("Reading:", fa_fn) 74 | fa_dict = {} 75 | with codecs.open(fa_fn, "r", "utf-8") as fa_f: 76 | for line in fa_f: 77 | utt_key, _, start, duration, label = line.strip().split() 78 | start = float(start) 79 | duration = float(duration) 80 | end = start + duration 81 | start_frame = int(round(start*100)) 82 | end_frame = int(round(end*100)) 83 | if (label != "" and label != "sil" and label != "?" 
and 84 | label != "spn"): 85 | if not utt_key in fa_dict: 86 | fa_dict[utt_key] = {} 87 | fa_dict[utt_key][start_frame, end_frame] = label 88 | 89 | # Find ground truth terms with maximal overlap 90 | print("Getting ground truth terms with overlap:") 91 | overlap_label_dict = {} 92 | for utt_key in tqdm(fa_dict): 93 | # print(utt_key) 94 | if utt_key not in overlap_dict: 95 | continue 96 | for (fa_start, fa_end) in fa_dict[utt_key]: 97 | for (utd_start, utd_end, utd_term) in overlap_dict[utt_key]: 98 | overlap = get_overlap( 99 | utd_start, utd_end, fa_start, fa_end 100 | ) 101 | if overlap == 0: 102 | continue 103 | overlap_dict[utt_key][(utd_start, utd_end, utd_term)].append(( 104 | fa_dict[utt_key][(fa_start, fa_end)], 105 | (fa_start, fa_end), overlap 106 | )) 107 | term_key = "{}_{}_{:06d}-{:06d}".format( 108 | utd_term, utt_key, utd_start, utd_end 109 | ) 110 | if not term_key in overlap_label_dict: 111 | overlap_label_dict[term_key] = set() 112 | overlap_label_dict[term_key].add( 113 | fa_dict[utt_key][(fa_start, fa_end)] 114 | ) 115 | 116 | # Read UTD pairs 117 | pairs_fn = path.join("lists", args.language, "train.utd_pairs.list") 118 | pairs = [] 119 | n_pairs = 0 120 | n_correct = 0 121 | n_missing = 0 122 | with codecs.open(pairs_fn, "r", "utf-8") as pairs_f: 123 | for line in pairs_f: 124 | term1, term2 = line.strip().split(" ") 125 | pairs.append((term1, term2)) 126 | if (term1 not in overlap_label_dict or term2 not in 127 | overlap_label_dict): 128 | n_missing += 1 129 | continue 130 | if (len(overlap_label_dict[term1].intersection( 131 | overlap_label_dict[term2])) > 0): 132 | n_correct += 1 133 | n_pairs += 1 134 | print("Correct pairs: {:.2f}%".format(n_correct/n_pairs*100.0)) 135 | print("No. missing pairs: {} out of {}".format(n_missing, n_pairs)) 136 | 137 | 138 | 139 | # # Construct list of UTD labels and list of list of overlapping GT terms 140 | # labels = [] 141 | # overlap_lists = [] 142 | # for utt_key in tqdm(overlap_dict): 143 | # for (utd_start, utd_end, utd_term) in overlap_dict[utt_key]: 144 | # overlap_list = overlap_dict[utt_key][ 145 | # (utd_start, utd_end, utd_term) 146 | # ] 147 | # if len(overlap_list) == 0: 148 | # continue 149 | # labels.append(utd_term) 150 | # overlap_lists.append([i[0] for i in overlap_list]) 151 | 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /src/plotting.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some of these functions are based on 3 | http://deeplearning.net/tutorial/code/utils.py. 
4 | 5 | Author: Herman Kamper 6 | Contact: kamperh@gmail.com 7 | Date: 2015, 2016 8 | """ 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | def scale_unit_interval(mat, eps=1e-8): 15 | """Scales all values in `mat` to be between 0 and 1.""" 16 | mat = mat.copy() 17 | mat -= mat.min() 18 | mat *= 1.0 / (mat.max() + eps) 19 | return mat 20 | 21 | 22 | def array_to_pixels(mat): 23 | """Convert the given array to pixel values after scaling.""" 24 | mat = scale_unit_interval(mat) 25 | out_array = np.zeros(mat.shape, dtype="uint8") 26 | for i in range(mat.shape[0]): 27 | for j in range(mat.shape[1]): 28 | out_array[i, j] = mat[i, j] * 255 29 | return out_array 30 | 31 | 32 | def tile_images(X, image_shape, tile_shape, tile_spacing=(1, 1), 33 | scale_rows_unit_interval=True): 34 | """ 35 | Transform the 2-D matrix `X`, which has one flattened data instance or 36 | filter per row, into a matrix of pixel values with the data instances or 37 | filters layed out as tiles. 38 | 39 | Parameters 40 | ---------- 41 | X : 2-D matrix or 4-D tensor 42 | The data to transform. If the tensor is given, the data from the 43 | last two dimensions are tiled. 44 | image_shape : (height, width) 45 | Each row is reshaped to this dimensionality. 46 | tile_shape : (n_rows, n_columns) 47 | Number of rows and columns to have in the output. 48 | scale_rows_unit_interval : bool 49 | Should each row be scaled to interval of [0, 1] before plotting 50 | 51 | Return 52 | ------ 53 | out_array : matrix of type int 54 | Can be passed directly to `PIL.Image.fromarray`. 55 | """ 56 | 57 | assert len(image_shape) == 2 58 | assert len(tile_shape) == 2 59 | assert len(tile_spacing) == 2 60 | assert len(X.shape) == 2 or len(X.shape) == 4 61 | 62 | if len(X.shape) == 4: 63 | n_filters_out, n_channels_in, image_h, image_w = X.shape 64 | image_shape = image_h, image_w 65 | X = X.copy() 66 | X = X.reshape(n_filters_out*n_channels_in, image_h*image_w) 67 | 68 | # Dimensions 69 | image_h, image_w = image_shape 70 | spacing_h, spacing_w = tile_spacing 71 | n_tiles_h, n_tiles_w = tile_shape 72 | 73 | # Output dimensionality 74 | out_shape = [0, 0] 75 | out_shape[0] = (image_h + spacing_h) * n_tiles_h - spacing_h 76 | out_shape[1] = (image_w + spacing_w) * n_tiles_w - spacing_w 77 | 78 | # Output matrix 79 | out_array = np.zeros(out_shape, dtype="uint8") 80 | 81 | # Lay out tiles 82 | for i_tile in xrange(n_tiles_h): 83 | for j_tile in xrange(n_tiles_w): 84 | cur_image = X[i_tile * n_tiles_w + j_tile].reshape(image_shape) 85 | if scale_rows_unit_interval: 86 | cur_image = scale_unit_interval(cur_image) 87 | i = i_tile * (image_h + spacing_h) 88 | j = j_tile * (image_w + spacing_w) 89 | out_array[i:i + image_h, j:j + image_w] = cur_image * 255 90 | 91 | return out_array 92 | 93 | 94 | def make_patch_spines_invisible(ax): 95 | ax.set_frame_on(True) 96 | ax.patch.set_visible(False) 97 | for sp in ax.spines.values(): 98 | sp.set_visible(False) 99 | 100 | 101 | def plot_raw_embeds(embed_dict, types=None, mvn=False, **kwargs): 102 | """Plot all the embeddings of type `types`; if None, plot everything.""" 103 | 104 | # Get embeddings 105 | embeddings = [] 106 | labels = [] 107 | for key in embed_dict: 108 | if "_" in key: 109 | label = key.split("_")[0] 110 | else: 111 | label = key 112 | if types is None: 113 | labels.append(label) 114 | embeddings.append(embed_dict[key]) 115 | elif label in types: 116 | labels.append(label) 117 | embeddings.append(embed_dict[key]) 118 | n_embeds = len(embeddings) 119 | embeddings = 
np.array(embeddings) 120 | 121 | # Mean and variance normalise 122 | if mvn: 123 | embeddings = ( 124 | embeddings - embeddings.mean(axis=0) 125 | )/embeddings.std(axis=0) 126 | 127 | # Now sort by label 128 | sort_order = np.argsort(np.array(labels)) 129 | sorted_labels = np.array(labels)[sort_order] 130 | 131 | # Get cluster tick positions 132 | type_ticks = [0] 133 | for i in range(len(sorted_labels) - 1): 134 | if sorted_labels[i] != sorted_labels[i + 1]: 135 | type_ticks.append(i + 1) 136 | type_ticks.append(n_embeds) 137 | 138 | # Get label positions and labels 139 | type_label_ticks = [] 140 | type_labels = [] 141 | for i in sorted(list(set(labels))): 142 | where = np.where(sorted_labels == i)[0] 143 | if len(where) == 0: 144 | continue 145 | pos = int(np.mean(where)) 146 | type_label_ticks.append(pos) 147 | type_labels.append(i) 148 | 149 | # Variables used for plotting 150 | labels_offset = 1.04 151 | par2_linewidth = 0.5 152 | 153 | fig, host = plt.subplots(**kwargs) 154 | par2 = host.twinx() 155 | par2.spines["right"].set_position(("axes", labels_offset)) 156 | make_patch_spines_invisible(par2) 157 | par2.spines["right"].set_visible(True) 158 | par2.set_ylim([0, n_embeds]) 159 | par2.invert_yaxis() 160 | par2.set_yticks(type_ticks) 161 | par2.set_yticklabels([]) 162 | par2.tick_params(axis="y", width=par2_linewidth, length=10) 163 | par2.spines["right"].set_linewidth(par2_linewidth) 164 | par2.set_yticks(type_label_ticks, minor=True) 165 | par2.set_yticklabels(type_labels, minor=True) 166 | par2.set_ylabel("Word types") 167 | for line in par2.yaxis.get_minorticklines(): 168 | line.set_visible(False) 169 | 170 | cax = host.imshow( 171 | embeddings[sort_order], interpolation="nearest", 172 | aspect="auto" 173 | ) 174 | host.set_yticks([]) 175 | host.set_ylabel("Word embedding vectors") 176 | host.set_xlabel("Embedding dimensions") 177 | -------------------------------------------------------------------------------- /features/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions. 3 | 4 | Author: Herman Kamper 5 | Contact: kamperh@gmail.com 6 | Date: 2019 7 | """ 8 | 9 | from tqdm import tqdm 10 | import codecs 11 | import numpy as np 12 | import subprocess 13 | 14 | 15 | shell = lambda command: subprocess.Popen( 16 | command, shell=True, stdout=subprocess.PIPE 17 | ).communicate()[0] 18 | 19 | 20 | def filter_words(fa_fn, output_fn, min_frames=50, min_chars=5): 21 | """ 22 | Find words of at least `min_frames` frames and `min_chars` characters. 23 | 24 | Ground truth words are extracted from the forced alignment file `fa_fn` and 25 | written to the word list file `output_fn`. 26 | """ 27 | print("Reading:", fa_fn) 28 | print("Writing:", output_fn) 29 | n_tokens = 0 30 | with codecs.open(fa_fn, "r", "utf-8") as fa_f: 31 | with codecs.open(output_fn, "w", "utf-8") as output_f: 32 | for line in fa_f: 33 | utt_key, _, start, duration, label = line.strip().split() 34 | start = float(start) 35 | duration = float(duration) 36 | end = start + duration 37 | start_frame = int(round(start*100)) 38 | end_frame = int(round(end*100)) 39 | if (end_frame - start_frame >= min_frames and len(label) >= 40 | min_chars and label != "" and label != "sil" 41 | and label != "?" and label != "spn"): 42 | output_f.write( 43 | "{}_{}_{:06d}-{:06d}\n".format(label, utt_key, 44 | start_frame, end_frame + 1) 45 | ) 46 | n_tokens += 1 47 | print("No. 
tokens:", n_tokens) 48 | 49 | 50 | def segments_from_npz(input_npz_fn, segments_fn, output_npz_fn): 51 | """ 52 | Cut segments from a NumPy archive and save in a new archive. 53 | 54 | As keys, the archives use the format "label_spkr_utterance_start-end". 55 | """ 56 | 57 | # Read the .npz file 58 | print("Reading npz:", input_npz_fn) 59 | input_npz = np.load(input_npz_fn) 60 | 61 | # Create input npz segments dict 62 | utterance_segs = {} # utterance_segs["s08_02b_029657-029952"] 63 | # is (29657, 29952) 64 | for key in input_npz.keys(): 65 | s = key.split("_") 66 | if len(s) == 3: 67 | # Format: s08_02b_029657-029952 68 | utterance_segs[key] = tuple([int(i) for i in s[-1].split("-")]) 69 | elif len(s) == 2: 70 | # Format: s08_02b 71 | utterance_segs[key] = (0, input_npz[key].shape[0]) 72 | 73 | # Create target segments dict 74 | print("Reading segments:", segments_fn) 75 | target_segs = {} # target_segs["years_s01_01a_004951-005017"] 76 | # is ("s01_01a", 4951, 5017) 77 | for line in open(segments_fn): 78 | line_split = line.split("_") 79 | utterance = line_split[-3] + "_" + line_split[-2] 80 | start, end = line_split[-1].split("-") 81 | start = int(start) 82 | end = int(end) 83 | target_segs[line.strip()] = (utterance, start, end) 84 | 85 | print("Extracting segments:") 86 | output_npz = {} 87 | n_target_segs = 0 88 | for target_seg_key in tqdm(sorted(target_segs)): 89 | utterance, target_start, target_end = target_segs[target_seg_key] 90 | for utterance_key in [ 91 | i for i in utterance_segs.keys() if (i + 92 | "_").startswith(utterance + "_")]: 93 | # If like below: "GE008_128" also matches "GE008_12" 94 | # i for i in utterance_segs.keys() if i.startswith(utterance)]: 95 | utterance_start, utterance_end = utterance_segs[utterance_key] 96 | if (target_start >= utterance_start and target_start < 97 | utterance_end): 98 | start = target_start - utterance_start 99 | end = target_end - utterance_start 100 | output_npz[target_seg_key] = input_npz[ 101 | utterance_key 102 | ][start:end] 103 | n_target_segs += 1 104 | break 105 | 106 | print( 107 | "Extracted " + str(n_target_segs) + " out of " + str(len(target_segs)) 108 | + " segments" 109 | ) 110 | print("Writing:", output_npz_fn) 111 | np.savez(output_npz_fn, **output_npz) 112 | 113 | 114 | def terms_from_pairs(pairs_fn, output_list_fn): 115 | 116 | print("Reading:", pairs_fn) 117 | terms = set() 118 | with open(pairs_fn) as f: 119 | for line in f: 120 | line = line.replace("###", " ") 121 | (cluster, utt1, start1, end1, cluster2, utt2, start2, end2) = ( 122 | line.strip().split(" ") 123 | ) 124 | start1 = int(start1) 125 | end1 = int(end1) 126 | start2 = int(start2) 127 | end2 = int(end2) 128 | terms.add((cluster, utt1, start1, end1)) 129 | terms.add((cluster, utt2, start2, end2)) 130 | 131 | print("Writing:", output_list_fn) 132 | with open(output_list_fn, "w") as f: 133 | for cluster, utt, start, end in terms: 134 | f.write( 135 | cluster + "_" + utt + "_" + "%06d" % start + "-" + "%06d" % end 136 | + "\n" 137 | ) 138 | 139 | 140 | def format_enno_pairs(enno_pairs_fn, output_pairs_fn): 141 | print("Reading:", enno_pairs_fn) 142 | print("Writing:", output_pairs_fn) 143 | with codecs.open(enno_pairs_fn, "r", "utf-8") as enno_f: 144 | with codecs.open(output_pairs_fn, "w", "utf-8") as output_f: 145 | for line in enno_f: 146 | line = line.replace("###", " ") 147 | (cluster1, utt1, start1, end1, cluster2, utt2, start2, end2) = ( 148 | line.strip().split(" ") 149 | ) 150 | start1 = int(start1) 151 | end1 = int(end1) 152 | start2 = 
int(start2) 153 | end2 = int(end2) 154 | output_f.write( 155 | "{}_{}_{:06d}-{:06d} " 156 | "{}_{}_{:06d}-{:06d}\n".format(cluster1, utt1, start1, 157 | end1, cluster2, utt2, start2, end2) 158 | ) 159 | -------------------------------------------------------------------------------- /features/features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for extracting filterbank and MFCC features. 3 | 4 | Author: Herman Kamper 5 | Contact: kamperh@gmail.com 6 | Date: 2019 7 | """ 8 | 9 | from os import path 10 | from tqdm import tqdm 11 | import glob 12 | import numpy as np 13 | import scipy.io.wavfile as wav 14 | 15 | 16 | def extract_fbank_dir(dir): 17 | """ 18 | Extract filterbanks for all audio files in `dir` and return a dictionary. 19 | 20 | Each dictionary key will be the filename of the associated audio file 21 | without the extension. Mel-scale log filterbanks are extracted. 22 | """ 23 | import librosa 24 | feat_dict = {} 25 | for wav_fn in tqdm(sorted(glob.glob(path.join(dir, "*.wav")))): 26 | signal, sample_rate = librosa.core.load(wav_fn, sr=None) 27 | signal = preemphasis(signal, coeff=0.97) 28 | fbank = np.log(librosa.feature.melspectrogram( 29 | signal, sr=sample_rate, n_mels=40, 30 | n_fft=int(np.floor(0.025*sample_rate)), 31 | hop_length=int(np.floor(0.01*sample_rate)), fmin=64, fmax=8000, 32 | )) 33 | # from python_speech_features import logfbank 34 | # samplerate, signal = wav.read(wav_fn) 35 | # fbanks = logfbank( 36 | # signal, samplerate=samplerate, winlen=0.025, winstep=0.01, 37 | # nfilt=45, nfft=2048, lowfreq=0, highfreq=None, preemph=0, 38 | # winfunc=np.hamming 39 | # ) 40 | key = path.splitext(path.split(wav_fn)[-1])[0] 41 | feat_dict[key] = fbank.T 42 | return feat_dict 43 | 44 | 45 | def extract_mfcc_dir(dir): 46 | """ 47 | Extract MFCCs for all audio files in `dir` and return a dictionary. 48 | 49 | Each dictionary key will be the filename of the associated audio file 50 | without the extension. Deltas and double deltas are also extracted. 
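    Each value in the returned dictionary is an array of shape (n_frames, 39):
    13 MFCCs followed by their deltas and double-deltas, stacked column-wise.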
51 | """ 52 | import librosa 53 | feat_dict = {} 54 | for wav_fn in tqdm(sorted(glob.glob(path.join(dir, "*.wav")))): 55 | signal, sample_rate = librosa.core.load(wav_fn, sr=None) 56 | if len(signal) == 0: 57 | continue 58 | signal = preemphasis(signal, coeff=0.97) 59 | mfcc = librosa.feature.mfcc( 60 | signal, sr=sample_rate, n_mfcc=13, n_mels=24, #dct_type=3, 61 | n_fft=int(np.floor(0.025*sample_rate)), 62 | hop_length=int(np.floor(0.01*sample_rate)), fmin=64, fmax=8000, 63 | #htk=True 64 | ) 65 | # mfcc = librosa.feature.mfcc( 66 | # signal, sr=sample_rate, n_mfcc=13, 67 | # n_fft=int(np.floor(0.025*sample_rate)), 68 | # hop_length=int(np.floor(0.01*sample_rate)) 69 | # ) 70 | if mfcc.shape[1] < 9: # need at least 9 frames for deltas 71 | continue 72 | mfcc_delta = librosa.feature.delta(mfcc) 73 | mfcc_delta_delta = librosa.feature.delta(mfcc, order=2) 74 | key = path.splitext(path.split(wav_fn)[-1])[0] 75 | feat_dict[key] = np.hstack([mfcc.T, mfcc_delta.T, mfcc_delta_delta.T]) 76 | 77 | # # Temp 78 | # if "SP005_49" in wav_fn: 79 | # print(key) 80 | # print(feat_dict[key].shape) 81 | # assert False 82 | 83 | # from python_speech_features import delta 84 | # from python_speech_features import mfcc 85 | # sample_rate, signal = wav.read(wav_fn) 86 | # mfccs = mfcc( 87 | # signal, samplerate=sample_rate, winlen=0.025, winstep=0.01, 88 | # numcep=13, nfilt=24, nfft=None, lowfreq=0, highfreq=None, 89 | # preemph=0.97, ceplifter=22, appendEnergy=True, winfunc=np.hamming 90 | # ) 91 | # d_mfccs = delta(mfccs, 2) 92 | # dd_mfccs = delta(d_mfccs, 2) 93 | # key = path.splitext(path.split(wav_fn)[-1])[0] 94 | # feat_dict[key] = np.hstack([mfccs, d_mfccs, dd_mfccs]) 95 | 96 | # import matplotlib.pyplot as plt 97 | # plt.imshow(feat_dict[key][2000:2200,:]) 98 | # plt.show() 99 | # assert False 100 | return feat_dict 101 | 102 | 103 | def extract_vad(feat_dict, vad_dict): 104 | """ 105 | Remove silence based on voice activity detection (VAD). 106 | 107 | The `vad_dict` should have the same keys as `feat_dict` with the active 108 | speech regions given as lists of tuples of (start, end) frame, with the end 109 | excluded. 110 | """ 111 | output_dict = {} 112 | for utt_key in tqdm(sorted(feat_dict)): 113 | if utt_key not in vad_dict: 114 | print("Warning: Missing VAD for utterance", utt_key) 115 | continue 116 | for (start, end) in vad_dict[utt_key]: 117 | segment_key = utt_key + "_{:06d}-{:06d}".format(start, end) 118 | output_dict[segment_key] = feat_dict[utt_key][start:end, :] 119 | return output_dict 120 | 121 | 122 | def speaker_mvn(feat_dict): 123 | """ 124 | Perform per-speaker mean and variance normalisation. 125 | 126 | It is assumed that each of the keys in `feat_dict` starts with a speaker 127 | identifier followed by an underscore. 
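    For a speaker s with per-dimension mean mu_s and standard deviation
    sigma_s (pooled over all of that speaker's utterances), each frame x is
    mapped to (x - mu_s) / sigma_s.

    Illustrative sketch (the keys and shapes below are made up):

    >>> feats = {"HA001_utt1": np.random.randn(100, 39),
    ...          "HA001_utt2": np.random.randn(80, 39)}
    >>> normed = speaker_mvn(feats)  # statistics pooled over both utterances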
128 | """ 129 | 130 | speakers = set([key.split("_")[0] for key in feat_dict]) 131 | 132 | # Separate features per speaker 133 | speaker_features = {} 134 | for utt_key in sorted(feat_dict): 135 | speaker = utt_key.split("_")[0] 136 | if speaker not in speaker_features: 137 | speaker_features[speaker] = [] 138 | speaker_features[speaker].append(feat_dict[utt_key]) 139 | 140 | # Determine means and variances per speaker 141 | speaker_mean = {} 142 | speaker_std = {} 143 | for speaker in speakers: 144 | features = np.vstack(speaker_features[speaker]) 145 | speaker_mean[speaker] = np.mean(features, axis=0) 146 | speaker_std[speaker] = np.std(features, axis=0) 147 | 148 | # Normalise per speaker 149 | output_dict = {} 150 | for utt_key in tqdm(sorted(feat_dict)): 151 | speaker = utt_key.split("_")[0] 152 | output_dict[utt_key] = ( 153 | (feat_dict[utt_key] - speaker_mean[speaker]) / 154 | speaker_std[speaker] 155 | ) 156 | 157 | return output_dict 158 | 159 | 160 | def preemphasis(signal, coeff=0.97): 161 | """Perform preemphasis on the input `signal`.""" 162 | return np.append(signal[0], signal[1:] - coeff*signal[:-1]) 163 | -------------------------------------------------------------------------------- /blackbox/dp_align.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions and classes for aligning two lists using dynamic programming. 3 | 4 | The algorithm is based on on a slight variation of the method given at: 5 | http://www.avatar.se/molbioinfo2001/dynprog/adv_dynamic.html. By default NIST 6 | insertion, deletion and substitution penalties are used. 7 | 8 | Author: Herman Kamper 9 | Contact: kamperh@gmail.com 10 | Date: 2011, 2014, 2015, 2019 11 | """ 12 | 13 | import numpy as np 14 | 15 | 16 | #-----------------------------------------------------------------------------# 17 | # DYNAMIC PROGRAMMING CLASSES # 18 | #-----------------------------------------------------------------------------# 19 | 20 | class DPEntry: 21 | """Alignment type ("d", "i", "s", or "m") and an integer score.""" 22 | def __init__(self, align="m", score=0): 23 | self.align = align 24 | self.score = score 25 | 26 | 27 | class DPError(object): 28 | """ 29 | Attributes 30 | ---------- 31 | n_del : int 32 | n_ins : int 33 | n_sub : int 34 | n_match : int 35 | n_total : int 36 | """ 37 | 38 | def __init__(self, n_del=0, n_ins=0, n_sub=0, n_match=0, n_total=0): 39 | self.n_del = n_del 40 | self.n_ins = n_ins 41 | self.n_sub = n_sub 42 | self.n_match = n_match 43 | self.n_total = n_total 44 | 45 | def __add__(self, other): 46 | """Add this DPError to another.""" 47 | if type(other) == DPError: 48 | self.n_del += other.n_del 49 | self.n_ins += other.n_ins 50 | self.n_sub += other.n_sub 51 | self.n_match += other.n_match 52 | self.n_total += other.n_total 53 | return self 54 | 55 | __radd__ = __add__ 56 | __iadd__ = __add__ 57 | 58 | def __str__(self): 59 | """Returns a string representation of the alignment error.""" 60 | return ( 61 | "H = " + str(self.n_match) + ", D = " + str(self.n_del) + ", S = " 62 | + str(self.n_sub) + ", I = " + str(self.n_ins)+ ", N = " + 63 | str(self.n_total) 64 | ) 65 | 66 | def get_levenshtein(self): 67 | """Returns the Levenshtein distance of the alignment.""" 68 | return self.n_del + self.n_sub + self.n_ins 69 | 70 | def get_accuracy(self): 71 | """ 72 | Calculates the accuracy given the stored errors using the formula: 73 | Accuracy = (Matches - Insertions) / Total 74 | """ 75 | return float(self.n_match - self.n_ins) / self.n_total 76 
| 77 | def get_wer(self): 78 | """ 79 | Calculates the word error rate (WER) using: 80 | WER = (Substitutions + Deletions + Insertions) / Total 81 | """ 82 | return float(self.n_sub + self.n_del + self.n_ins) / self.n_total 83 | 84 | 85 | #-----------------------------------------------------------------------------# 86 | # DYNAMIC PROGRAMMING ALIGNMENT FUNCTION # 87 | #-----------------------------------------------------------------------------# 88 | 89 | def dp_align(ref_list, test_list, ins_penalty=3, del_penalty=3, sub_penalty=4): 90 | """ 91 | Performs dynamic programming alignment of `ref_list` to `test_list`. 92 | 93 | Parameters 94 | ---------- 95 | ref_list : list 96 | test_list : list 97 | """ 98 | 99 | # Initialise the alignment matrix 100 | dp_matrix = np.empty( 101 | [len(test_list) + 1, len(ref_list) + 1], dtype = object 102 | ) 103 | for i in range(len(test_list) + 1): 104 | for j in range(len(ref_list) + 1): 105 | dp_matrix[i][j] = DPEntry() 106 | 107 | # Initialise the origin 108 | dp_matrix[0][0].score = 0 109 | dp_matrix[0][0].align = "m" 110 | 111 | # The first row is all delections: 112 | for j in range(1, len(ref_list) + 1): 113 | dp_matrix[0][j].score = j*del_penalty 114 | dp_matrix[0][j].align = "d" 115 | 116 | # Fill dp_matrix 117 | for i in range(1, len(test_list) + 1): 118 | 119 | # First column is all insertions 120 | dp_matrix[i][0].score = i*ins_penalty 121 | dp_matrix[i][0].align = "i" 122 | 123 | for j in range(1, len(ref_list) + 1): 124 | del_score = dp_matrix[i, j - 1].score + del_penalty 125 | ins_score = dp_matrix[i - 1, j].score + ins_penalty 126 | 127 | if test_list[i - 1] == ref_list[j - 1]: 128 | 129 | # Considering a match 130 | match_score = dp_matrix[i - 1, j - 1].score 131 | 132 | # Test for a match 133 | if match_score <= del_score and match_score <= ins_score: 134 | dp_matrix[i, j].score = match_score 135 | dp_matrix[i, j].align = "m" 136 | # Test for a deletion 137 | elif del_score <= ins_score: 138 | dp_matrix[i, j].score = del_score 139 | dp_matrix[i, j].align = "d" 140 | # Test for an insertion (only option left) 141 | else: 142 | dp_matrix[i, j].score = ins_score 143 | dp_matrix[i, j].align = "i" 144 | 145 | else: 146 | 147 | # Considering a substitution 148 | sub_score = dp_matrix[i - 1, j - 1].score + sub_penalty 149 | 150 | # Test for a substitution 151 | if sub_score < del_score and sub_score <= ins_score: 152 | dp_matrix[i, j].score = sub_score 153 | dp_matrix[i, j].align = "s" 154 | # Test for a deletion 155 | elif del_score <= ins_score: 156 | dp_matrix[i, j].score = del_score 157 | dp_matrix[i, j].align = "d" 158 | # Test for an insertion (only option left) 159 | else: 160 | dp_matrix[i, j].score = ins_score 161 | dp_matrix[i, j].align = "i" 162 | 163 | # Perform alignment by tracking through the dp_matrix 164 | dp_errors = DPError() 165 | dp_errors.n_total = len(ref_list) 166 | i = len(test_list) 167 | j = len(ref_list) 168 | while i > 0 or j > 0: 169 | if dp_matrix[i, j].align == "m": 170 | #print test_list[i - 1], ref_list[j - 1] 171 | i -= 1 172 | j -= 1 173 | dp_errors.n_match += 1 174 | elif dp_matrix[i, j].align == "s": 175 | #print test_list[i - 1], ref_list[j - 1] 176 | i -= 1 177 | j -= 1 178 | dp_errors.n_sub += 1 179 | elif dp_matrix[i, j].align == "d": 180 | #print "-", ref_list[j - 1] 181 | j -= 1 182 | dp_errors.n_del += 1 183 | elif dp_matrix[i, j].align == "i": 184 | #print test_list[i - 1], "-" 185 | i -= 1 186 | dp_errors.n_ins += 1 187 | 188 | # Return the alignment results 189 | return dp_errors 190 | 191 | 
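# Illustrative usage sketch (not part of the original module): the function
# name and word lists below are made-up examples, complementing the
# character-level example in main() further down.

def example_word_alignment():
    """Align a made-up hypothesis against a reference with NIST penalties."""
    # The hypothesis has one extra word ("down"), giving a single insertion.
    errors = dp_align(["the", "cat", "sat"], ["the", "cat", "sat", "down"])
    print(errors)                 # H = 3, D = 0, S = 0, I = 1, N = 3
    print(errors.get_wer())       # (S + D + I) / N = 1/3
    print(errors.get_accuracy())  # (H - I) / N = 2/3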
192 | #-----------------------------------------------------------------------------# 193 | # MAIN FUNCTION # 194 | #-----------------------------------------------------------------------------# 195 | 196 | def main(): 197 | a = dp_align( 198 | "recycling", "recycle", ins_penalty=1, del_penalty=1, sub_penalty=1 199 | ) 200 | print( 201 | "Levenshtein distance between recycling and recycle:", 202 | a.get_levenshtein() 203 | ) 204 | 205 | 206 | if __name__ == "__main__": 207 | main() 208 | -------------------------------------------------------------------------------- /embeddings/data_io.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data input and output functions. 3 | 4 | Author: Herman Kamper 5 | Contact: kamperh@gmail.com 6 | Date: 2018, 2019 7 | """ 8 | 9 | from collections import Counter 10 | from os import path 11 | from tqdm import tqdm 12 | import numpy as np 13 | import random 14 | import sys 15 | 16 | sys.path.append(path.join("..", "src")) 17 | 18 | from tflego import NP_DTYPE 19 | 20 | 21 | def load_data_from_npz(npz_fn, min_length=None): 22 | print("Reading:", npz_fn) 23 | npz = np.load(npz_fn) 24 | x = [] 25 | labels = [] 26 | speakers = [] 27 | lengths = [] 28 | keys = [] 29 | n_items = 0 30 | for utt_key in tqdm(sorted(npz)): 31 | cur_x = npz[utt_key] 32 | if min_length is not None and len(cur_x) <= min_length: 33 | continue 34 | keys.append(utt_key) 35 | x.append(cur_x) 36 | utt_key_split = utt_key.split("_") 37 | word = utt_key_split[0] 38 | speaker = utt_key_split[1] 39 | labels.append(word) 40 | speakers.append(speaker) 41 | lengths.append(len(cur_x)) 42 | n_items += 1 43 | # print("No. items:", n_items) 44 | print("E.g. item shape:", x[0].shape) 45 | return (x, labels, lengths, keys, speakers) 46 | 47 | 48 | def filter_data(data, labels, lengths, keys, speakers, 49 | n_min_tokens_per_type=None, n_max_types=None, n_max_tokens=None, 50 | n_max_tokens_per_type=None): 51 | """ 52 | Filter the output from `load_data_from_npz` based on specifications. 53 | 54 | Each filter is applied independelty, so they could influence each other. 55 | E.g. `n_max_tokens` could further reduce the number of types if it is used 56 | in conjunction with `n_max_types`. 57 | 58 | Return 59 | ------ 60 | data, labels, lengths keys, speakers : list, list, list, list 61 | The filtered lists. 62 | """ 63 | 64 | random.seed(1) 65 | 66 | if n_max_types is not None: 67 | 68 | print("Maximum no. 
of types:", n_max_types) 69 | 70 | # Find valid types 71 | types = [i[0] for i in Counter(labels).most_common(n_max_types)] 72 | 73 | # Filter 74 | filtered_data = [] 75 | filtered_labels = [] 76 | filtered_lengths = [] 77 | filtered_keys = [] 78 | filtered_speakers = [] 79 | for i in range(len(data)): 80 | if labels[i] in types: 81 | filtered_data.append(data[i]) 82 | filtered_labels.append(labels[i]) 83 | filtered_lengths.append(lengths[i]) 84 | filtered_keys.append(keys[i]) 85 | filtered_speakers.append(speakers[i]) 86 | 87 | data = filtered_data 88 | labels = filtered_labels 89 | lengths = filtered_lengths 90 | keys = filtered_keys 91 | speakers = filtered_speakers 92 | 93 | if n_max_tokens_per_type is not None: 94 | 95 | print("Maximum tokens per type:", n_max_tokens_per_type) 96 | 97 | # Filter 98 | filtered_data = [] 99 | filtered_labels = [] 100 | filtered_lengths = [] 101 | filtered_keys = [] 102 | filtered_speakers = [] 103 | indices = list(range(len(data))) 104 | random.shuffle(indices) 105 | tokens_per_type = Counter() 106 | for i in indices: 107 | if tokens_per_type[labels[i]] < n_max_tokens_per_type: 108 | filtered_data.append(data[i]) 109 | filtered_labels.append(labels[i]) 110 | filtered_lengths.append(lengths[i]) 111 | filtered_keys.append(keys[i]) 112 | filtered_speakers.append(speakers[i]) 113 | tokens_per_type[labels[i]] += 1 114 | 115 | data = filtered_data 116 | labels = filtered_labels 117 | lengths = filtered_lengths 118 | keys = filtered_keys 119 | speakers = filtered_speakers 120 | 121 | if n_max_tokens is not None: 122 | 123 | print("Maximum no. of tokens:", n_max_tokens) 124 | 125 | # Filter 126 | filtered_data = [] 127 | filtered_labels = [] 128 | filtered_lengths = [] 129 | filtered_keys = [] 130 | filtered_speakers = [] 131 | indices = list(range(len(data))) 132 | random.shuffle(indices) 133 | # for i in range(len(data)): 134 | for i in indices[:n_max_tokens]: 135 | filtered_data.append(data[i]) 136 | filtered_labels.append(labels[i]) 137 | filtered_lengths.append(lengths[i]) 138 | filtered_keys.append(keys[i]) 139 | filtered_speakers.append(speakers[i]) 140 | 141 | data = filtered_data 142 | labels = filtered_labels 143 | lengths = filtered_lengths 144 | keys = filtered_keys 145 | speakers = filtered_speakers 146 | 147 | if n_min_tokens_per_type is not None: 148 | 149 | print("Minimum tokens per type:", n_min_tokens_per_type) 150 | 151 | # Find valid types 152 | types = [] 153 | counts = Counter(labels) 154 | for key in counts: 155 | if counts[key] >= n_min_tokens_per_type: 156 | types.append(key) 157 | 158 | # Filter 159 | filtered_data = [] 160 | filtered_labels = [] 161 | filtered_lengths = [] 162 | filtered_keys = [] 163 | filtered_speakers = [] 164 | for i in range(len(data)): 165 | if labels[i] in types: 166 | filtered_data.append(data[i]) 167 | filtered_labels.append(labels[i]) 168 | filtered_lengths.append(lengths[i]) 169 | filtered_keys.append(keys[i]) 170 | filtered_speakers.append(speakers[i]) 171 | 172 | data = filtered_data 173 | labels = filtered_labels 174 | lengths = filtered_lengths 175 | keys = filtered_keys 176 | speakers = filtered_speakers 177 | 178 | print("No. types:", len(Counter(labels))) 179 | print("No. 
tokens:", len(labels)) 180 | return (data, labels, lengths, keys, speakers) 181 | 182 | 183 | def trunc_and_limit_dim(x, lengths, d_frame, max_length): 184 | for i, seq in enumerate(x): 185 | x[i] = x[i][:max_length, :d_frame] 186 | if max_length is not None: 187 | lengths[i] = min(lengths[i], max_length) 188 | 189 | 190 | def pad_sequences(x, n_padded, center_padded=True, return_mask=False): 191 | """Return the padded sequences and their original lengths.""" 192 | padded_x = np.zeros((len(x), n_padded, x[0].shape[1]), dtype=NP_DTYPE) 193 | if return_mask: 194 | mask_x = np.zeros((len(x), n_padded), dtype=NP_DTYPE) 195 | lengths = [] 196 | for i_data, cur_x in enumerate(x): 197 | length = cur_x.shape[0] 198 | if center_padded: 199 | padding = int(np.round((n_padded - length) / 2.)) 200 | if length <= n_padded: 201 | padded_x[i_data, padding:padding + length, :] = cur_x 202 | if return_mask: 203 | mask_x[i_data, padding:padding + length] = 1 204 | else: 205 | # Cut out snippet from sequence exceeding n_padded 206 | padded_x[i_data, :, :] = cur_x[-padding:-padding + n_padded] 207 | if return_mask: 208 | mask_x[i_data, :] = 1 209 | lengths.append(min(length, n_padded)) 210 | else: 211 | length = min(length, n_padded) 212 | padded_x[i_data, :length, :] = cur_x[:length, :] 213 | if return_mask: 214 | mask_x[i_data, :length] = 1 215 | lengths.append(length) 216 | if return_mask: 217 | return padded_x, lengths, mask_x 218 | else: 219 | return padded_x, lengths 220 | -------------------------------------------------------------------------------- /embeddings/apply_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Encode the set using the specified model. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2018, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | import argparse 14 | import pickle 15 | import numpy as np 16 | import os 17 | import sys 18 | import tensorflow as tf 19 | 20 | sys.path.append(path.join("..", "src")) 21 | 22 | from link_mfcc import sixteen_languages 23 | from tflego import NP_DTYPE, TF_DTYPE, NP_ITYPE, TF_ITYPE 24 | import batching 25 | import data_io 26 | 27 | 28 | #-----------------------------------------------------------------------------# 29 | # APPLY MODEL FUNCTIONS # 30 | #-----------------------------------------------------------------------------# 31 | 32 | def build_model(x, x_lengths, options_dict): 33 | model_dict = {} 34 | if options_dict["script"] == "train_cae_rnn": 35 | import train_cae_rnn 36 | cae = train_cae_rnn.build_cae_from_options_dict( 37 | x, x_lengths, x_lengths, options_dict 38 | ) 39 | model_dict["output"] = cae["y"] 40 | model_dict["encoding"] = cae["z"] 41 | model_dict["mask"] = cae["mask"] 42 | elif options_dict["script"] == "train_vae": 43 | import train_vae 44 | vae = train_vae.build_vae_from_options_dict(x, x_lengths, options_dict) 45 | model_dict["output"] = vae["decoder_output"] 46 | model_dict["encoding"] = vae["latent_layer"]["z_mean"] 47 | model_dict["mask"] = vae["mask"] 48 | elif options_dict["script"] == "train_siamese_rnn": 49 | import train_siamese_rnn 50 | siamese = train_siamese_rnn.build_siamese_from_options_dict( 51 | x, x_lengths, options_dict 52 | ) 53 | model_dict["encoding"] = siamese["output"] 54 | elif options_dict["script"] == "train_siamese_cnn": 55 | import train_siamese_cnn 56 | siamese = train_siamese_cnn.build_siamese_cnn_from_options_dict( 57 | x, options_dict 58 | ) 59 | 
model_dict["encoding"] = siamese["output"] 60 | elif options_dict["script"] == "train_rnn": 61 | import train_rnn 62 | rnn = train_rnn.build_rnn_from_options_dict( 63 | x, x_lengths, options_dict 64 | ) 65 | model_dict["encoding"] = rnn["encoding"] 66 | elif options_dict["script"] == "train_rnn_split": 67 | import train_rnn_split 68 | rnn = train_rnn_split.build_rnn_from_options_dict( 69 | x, x_lengths, options_dict 70 | ) 71 | model_dict["encoding"] = rnn["encoding"] 72 | else: 73 | assert False, "model type not supported" 74 | return model_dict 75 | 76 | 77 | def apply_model(model_fn, subset, language): 78 | 79 | # assert language is None # to-do 80 | 81 | # Load the model options 82 | model_dir = path.split(model_fn)[0] 83 | options_dict_fn = path.join(model_dir, "options_dict.pkl") 84 | print("Reading:", options_dict_fn) 85 | with open(options_dict_fn, "rb") as f: 86 | options_dict = pickle.load(f) 87 | 88 | # Load data 89 | npz_fn = path.join("data", language, subset + ".npz") 90 | x_data, labels, lengths, keys, speakers = data_io.load_data_from_npz( 91 | npz_fn 92 | ) 93 | 94 | if "cnn" in options_dict["script"]: 95 | 96 | # Pad and flatten data 97 | x_data, _ = data_io.pad_sequences( 98 | x_data, options_dict["max_length"], True 99 | ) 100 | x_data = np.transpose(x_data, (0, 2, 1)) 101 | x_data = x_data.reshape((-1, options_dict["d_in"])) 102 | 103 | # Build model 104 | x = tf.placeholder(TF_DTYPE, [None, options_dict["d_in"]]) 105 | model = build_model(x, None, options_dict) 106 | 107 | # Embed data 108 | batch_iterator = batching.LabelledIterator( 109 | x_data, None, x_data.shape[0], False 110 | ) 111 | saver = tf.train.Saver() 112 | with tf.Session() as session: 113 | saver.restore(session, model_fn) 114 | for batch_x in batch_iterator: 115 | np_z = session.run( 116 | [model["encoding"]], feed_dict={x: batch_x})[0] 117 | break # single batch 118 | 119 | else: # rnn 120 | 121 | # Truncate and limit dimensionality 122 | data_io.trunc_and_limit_dim( 123 | x_data, lengths, options_dict["n_input"], 124 | options_dict["max_length"] 125 | ) 126 | 127 | # Build model 128 | x = tf.placeholder(TF_DTYPE, [None, None, options_dict["n_input"]]) 129 | x_lengths = tf.placeholder(TF_ITYPE, [None]) 130 | model = build_model(x, x_lengths, options_dict) 131 | 132 | # Embed data 133 | batch_iterator = batching.SimpleIterator(x_data, len(x_data), False) 134 | saver = tf.train.Saver() 135 | with tf.Session() as session: 136 | saver.restore(session, model_fn) 137 | for batch_x_padded, batch_x_lengths in batch_iterator: 138 | np_x = batch_x_padded 139 | np_x_lengths = batch_x_lengths 140 | np_z = session.run( 141 | [model["encoding"]], feed_dict={x: np_x, x_lengths: 142 | np_x_lengths} 143 | )[0] 144 | break # single batch 145 | 146 | embed_dict = {} 147 | for i, utt_key in enumerate([keys[i] for i in batch_iterator.indices]): 148 | embed_dict[utt_key] = np_z[i] 149 | 150 | return embed_dict 151 | 152 | 153 | #-----------------------------------------------------------------------------# 154 | # UTILITY FUNCTIONS # 155 | #-----------------------------------------------------------------------------# 156 | 157 | def check_argv(): 158 | """Check the command line arguments.""" 159 | parser = argparse.ArgumentParser( 160 | description=__doc__.strip().split("\n")[0], add_help=False 161 | ) 162 | parser.add_argument("model_fn", type=str, help="model checkpoint filename") 163 | parser.add_argument( 164 | "language", type=str, help="language to apply model to", 165 | choices=sixteen_languages 166 | ) 167 | 
parser.add_argument( 168 | "subset", type=str, help="subset to apply model to", 169 | choices=["val", "test"] 170 | ) 171 | if len(sys.argv) == 1: 172 | parser.print_help() 173 | sys.exit(1) 174 | return parser.parse_args() 175 | 176 | 177 | #-----------------------------------------------------------------------------# 178 | # MAIN FUNCTION # 179 | #-----------------------------------------------------------------------------# 180 | 181 | def main(): 182 | args = check_argv() 183 | 184 | # Do not output TensorFlow info and warning messages 185 | import warnings 186 | warnings.filterwarnings("ignore") 187 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" 188 | tf.logging.set_verbosity(tf.logging.ERROR) 189 | if type(tf.contrib) != type(tf): 190 | tf.contrib._warning = None 191 | 192 | # Embed data 193 | embed_dict = apply_model(args.model_fn, args.subset, args.language) 194 | 195 | # Save embeddings 196 | model_dir, model_fn = path.split(args.model_fn) 197 | if args.language is None: 198 | npz_fn = args.subset + ".npz" 199 | else: 200 | npz_fn = args.language + "." + args.subset + ".npz" 201 | npz_fn = path.join(model_dir, path.splitext(model_fn)[0] + "." + npz_fn) 202 | print("Writing:", npz_fn) 203 | np.savez_compressed(npz_fn, **embed_dict) 204 | print(datetime.now()) 205 | 206 | 207 | if __name__ == "__main__": 208 | main() 209 | -------------------------------------------------------------------------------- /qbe/sandbox.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sandox: QbE keyword lists" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Herman Kamper, Stellenbosch University, 2018-2019." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Preliminaries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 11, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "The autoreload extension is already loaded. 
To reload it, use:\n", 34 | " %reload_ext autoreload\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "%load_ext autoreload\n", 41 | "%autoreload 2\n", 42 | "\n", 43 | "from collections import Counter\n", 44 | "from os import path\n", 45 | "import codecs\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "import numpy as np\n", 48 | "import random" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Keywords" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "dev_keywords_fn = \"../features/mfcc/HA/ha.dev.gt_words.npz\"\n", 65 | "test_fn = \"../features/mfcc/HA/ha.eval.npz\"\n", 66 | "dev_keywords_features = np.load(dev_keywords_fn)\n", 67 | "test_features = np.load(test_fn)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def read_forced_alignment(globalphone_fa_fn):\n", 77 | " \"\"\"Return a dictionary of transcriptions obtained from a GlobalPhone forced alignment file.\"\"\"\n", 78 | " transcription_dict = {}\n", 79 | " with codecs.open(globalphone_fa_fn, \"r\", \"utf-8\") as f:\n", 80 | " for line in f:\n", 81 | " line = line.strip().split(\" \")\n", 82 | " utterance_key = line[0]\n", 83 | " label = line[4].lower()\n", 84 | " if utterance_key not in transcription_dict:\n", 85 | " transcription_dict[utterance_key] = []\n", 86 | " transcription_dict[utterance_key].append(label)\n", 87 | " return transcription_dict \n", 88 | "\n", 89 | "test_transcription = read_forced_alignment(\"/home/kamperh/endgame/datasets/globalphone_alignments/HA/eval.ctm\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "test_counter = Counter()\n", 99 | "for utterance_key in test_transcription:\n", 100 | " for word in test_transcription[utterance_key]:\n", 101 | " test_counter[word] += 1" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 43, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "No. words more than 9: 111\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "n = 9\n", 119 | "more_than_n = set()\n", 120 | "for word, count in test_counter.most_common():\n", 121 | " if count >= n:\n", 122 | " more_than_n.add(word)\n", 123 | "print(\"No. words more than {}: {}\".format(n, len(more_than_n)))" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 44, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "dev_counter = Counter()\n", 133 | "dev_words = set()\n", 134 | "for segment_key in dev_keywords_features:\n", 135 | " word = segment_key.split(\"_\")[0].lower()\n", 136 | " dev_counter[word] += 1\n", 137 | " dev_words.add(word)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 45, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "No. 
words overlap: 33\n", 150 | "aikin: 6 times in dev\n", 151 | "amfani: 8 times in dev\n", 152 | "amurka: 6 times in dev\n", 153 | "arziki: 2 times in dev\n", 154 | "babban: 2 times in dev\n", 155 | "bayan: 6 times in dev\n", 156 | "bayyana: 5 times in dev\n", 157 | "birnin: 7 times in dev\n", 158 | "cikin: 3 times in dev\n", 159 | "daban: 1 times in dev\n", 160 | "daular: 6 times in dev\n", 161 | "domin: 2 times in dev\n", 162 | "duniya: 8 times in dev\n", 163 | "hankali: 7 times in dev\n", 164 | "hanyar: 5 times in dev\n", 165 | "harkokin: 12 times in dev\n", 166 | "kasance: 6 times in dev\n", 167 | "kasar: 14 times in dev\n", 168 | "kasashe: 4 times in dev\n", 169 | "kasashen: 13 times in dev\n", 170 | "lokacin: 12 times in dev\n", 171 | "majalisar: 8 times in dev\n", 172 | "mutane: 18 times in dev\n", 173 | "samun: 5 times in dev\n", 174 | "sarki: 7 times in dev\n", 175 | "sosai: 25 times in dev\n", 176 | "tattalin: 4 times in dev\n", 177 | "tsakanin: 11 times in dev\n", 178 | "wajen: 2 times in dev\n", 179 | "wanda: 1 times in dev\n", 180 | "wannan: 5 times in dev\n", 181 | "zaman: 1 times in dev\n", 182 | "zamanin: 9 times in dev\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "overlap = more_than_n.intersection(dev_words)\n", 188 | "print(\"No. words overlap:\", len(overlap))\n", 189 | "for word in sorted(overlap):\n", 190 | " print(\"{}: {} times in dev\".format(word, dev_counter[word]))" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 48, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "n_keywords = 30\n", 200 | "keywords = list(overlap)\n", 201 | "random.seed(1)\n", 202 | "random.shuffle(keywords)\n", 203 | "keywords = keywords[:n_keywords]" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 49, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "Keywords: ['amfani', 'amurka', 'arziki', 'babban', 'bayan', 'bayyana', 'birnin', 'daban', 'daular', 'domin', 'duniya', 'hankali', 'hanyar', 'harkokin', 'kasar', 'kasashe', 'kasashen', 'lokacin', 'majalisar', 'mutane', 'samun', 'sarki', 'sosai', 'tattalin', 'tsakanin', 'wajen', 'wanda', 'wannan', 'zaman', 'zamanin']\n", 216 | "No. keywords: 30\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "print(\"Keywords:\", sorted(keywords))\n", 222 | "print(\"No. 
keywords:\", len(keywords))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 50, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "with codecs.open(\"keywords.txt\", \"w\", \"utf-8\") as f:\n", 232 | " for keyword in sorted(keywords):\n", 233 | " f.write(keyword + \"\\n\")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.5.2" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /embeddings/analyse_embeds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Analyse a given file with embedding tokens. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2015, 2018, 2019 9 | """ 10 | 11 | from os import path 12 | from sklearn import decomposition, ensemble, manifold 13 | import argparse 14 | import matplotlib.pyplot as plt 15 | import numpy as np 16 | import random 17 | import sys 18 | 19 | basedir = path.dirname(path.abspath(__file__)) 20 | sys.path.append(path.join(basedir, "..", "src")) 21 | 22 | import plotting 23 | 24 | 25 | #-----------------------------------------------------------------------------# 26 | # PLOTTING # 27 | #-----------------------------------------------------------------------------# 28 | 29 | def plot_raw_embeds(npz, types=None): 30 | """Plot all the embeddings of type `types`, if None plot everything.""" 31 | 32 | # Get embeddings 33 | embeddings = [] 34 | labels = [] 35 | for key in npz: 36 | if "_" in key: 37 | label = key.split("_")[0] 38 | else: 39 | label = key 40 | if types is None: 41 | labels.append(label) 42 | embeddings.append(npz[key]) 43 | elif label in types: 44 | labels.append(label) 45 | embeddings.append(npz[key]) 46 | n_embeds = len(embeddings) 47 | 48 | # Now sort by label 49 | sort_order = np.argsort(np.array(labels)) 50 | sorted_labels = np.array(labels)[sort_order] 51 | 52 | # Get cluster tick positions 53 | type_ticks = [0] 54 | for i in range(len(sorted_labels) - 1): 55 | if sorted_labels[i] != sorted_labels[i + 1]: 56 | type_ticks.append(i + 1) 57 | type_ticks.append(n_embeds) 58 | 59 | # Get label positions and labels 60 | type_label_ticks = [] 61 | type_labels = [] 62 | for i in sorted(list(set(labels))): 63 | where = np.where(sorted_labels == i)[0] 64 | if len(where) == 0: 65 | continue 66 | pos = int(np.mean(where)) 67 | type_label_ticks.append(pos) 68 | type_labels.append(i) 69 | 70 | # print("Plotting all embeddings") 71 | 72 | # Variables used for plotting 73 | labels_offset = 1.04 74 | par2_linewidth = 0.5 75 | 76 | fig, host = plt.subplots() 77 | par2 = host.twinx() 78 | par2.spines["right"].set_position(("axes", labels_offset)) 79 | plotting.make_patch_spines_invisible(par2) 80 | par2.spines["right"].set_visible(True) 81 | par2.set_ylim([0, n_embeds]) 82 | par2.invert_yaxis() 83 | par2.set_yticks(type_ticks) 84 | par2.set_yticklabels([]) 85 | 
par2.tick_params(axis="y", width=par2_linewidth, length=10) 86 | par2.spines["right"].set_linewidth(par2_linewidth) 87 | par2.set_yticks(type_label_ticks, minor=True) 88 | par2.set_yticklabels(type_labels, minor=True) 89 | par2.set_ylabel("Word types") 90 | for line in par2.yaxis.get_minorticklines(): 91 | line.set_visible(False) 92 | cax = host.imshow( 93 | np.array(embeddings)[sort_order], interpolation="nearest", 94 | aspect="auto" 95 | ) 96 | host.set_yticks([]) 97 | # host.set_xticklabels([]) 98 | host.set_ylabel("Word embedding vector") 99 | host.set_xlabel("Embedding dimensions") 100 | # fig.colorbar(cax, orientation="horizontal") 101 | 102 | 103 | # From http://scikit-learn.org/stable/_downloads/plot_lle_digits.py. 104 | def plot_embeds_2d(embeds_dict, types=None): 105 | print("Computing PCA projection") 106 | embeddings, labels = get_embeds_and_labels(embeds_dict, types) 107 | X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform( 108 | embeddings 109 | ) 110 | plot_labelled_2d_data(X_pca, labels, "PCA") 111 | 112 | print("Computing t-SNE embedding") 113 | embeddings, labels = get_embeds_and_labels(embeds_dict, types) 114 | tsne = manifold.TSNE( 115 | n_components=2, perplexity=20, init="random", random_state=1 116 | ) 117 | X_tsne = tsne.fit_transform(embeddings) 118 | plot_labelled_2d_data(X_tsne, labels, "t-SNE") 119 | 120 | # print("Computing Spectral embedding") 121 | # embedder = manifold.SpectralEmbedding(n_components=2, random_state=0, 122 | # eigen_solver="arpack") 123 | # X_se = embedder.fit_transform(embeddings) 124 | # plot_labelled_2d_data(X_se, labels) 125 | 126 | # print("Computing Totally Random Trees embedding") 127 | # hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0, 128 | # max_depth=5) 129 | # X_transformed = hasher.fit_transform(embeddings) 130 | # pca = decomposition.TruncatedSVD(n_components=2) 131 | # X_reduced = pca.fit_transform(X_transformed) 132 | # plot_labelled_2d_data(X_reduced, labels) 133 | 134 | # print("Computing MDS embedding") 135 | # clf = manifold.MDS(n_components=2, n_init=1, max_iter=100) 136 | # X_mds = clf.fit_transform(embeddings) 137 | # plot_labelled_2d_data(X_mds, labels) 138 | 139 | print("Computing Isomap embedding") 140 | n_neighbors = 10 141 | X_iso = manifold.Isomap( 142 | n_neighbors, n_components=2).fit_transform(embeddings 143 | ) 144 | plot_labelled_2d_data( 145 | X_iso, labels, "Isomap (" + str(n_neighbors) + " neighbours)" 146 | ) 147 | 148 | 149 | def plot_labelled_2d_data(X, labels, title=None): 150 | plt.figure() 151 | classes = set(labels) 152 | for label in sorted(classes): 153 | indices = np.where(np.array(labels) == label)[0] 154 | plt.scatter(X[indices, 0], X[indices, 1], label=label) 155 | if title is not None: 156 | plt.title(title) 157 | plt.legend(loc="best", ncol=2) 158 | 159 | 160 | def plot_data_labelled(X, labels, title=None): 161 | ordered_labels = sorted(set(labels)) 162 | n_labels = len(set(labels)) 163 | 164 | x_min, x_max = np.min(X, 0), np.max(X, 0) 165 | X = (X - x_min) / (x_max - x_min) 166 | 167 | plt.figure() 168 | ax = plt.subplot(111) 169 | for i in range(X.shape[0]): 170 | plt.text( 171 | X[i, 0], X[i, 1], str(labels[i]), 172 | color=plt.cm.Set1(1.0*ordered_labels.index(labels[i]) / n_labels), 173 | fontdict={"weight": "bold", "size": 9} 174 | ) 175 | 176 | if title is not None: 177 | plt.title(title) 178 | 179 | plt.xticks([]), plt.yticks([]) 180 | 181 | 182 | #-----------------------------------------------------------------------------# 183 | # UTILITY 
FUNCTIONS # 184 | #-----------------------------------------------------------------------------# 185 | 186 | def check_argv(): 187 | """Check the command line arguments.""" 188 | parser = argparse.ArgumentParser( 189 | description=__doc__.strip().split("\n")[0], add_help=False 190 | ) 191 | parser.add_argument("npz_fn", type=str, help="") 192 | parser.add_argument( 193 | "--word_type", type=str, 194 | help="show a plot for these word types, given as " 195 | "comma-seperated values" 196 | ) 197 | parser.add_argument( 198 | "--plot_rnd", type=int, 199 | help="plot this number of randomly selected embeddings" 200 | ) 201 | parser.add_argument( 202 | "--plot_all", action="store_true", help="plot all embeddings" 203 | ) 204 | parser.add_argument( 205 | "--normalize", dest="normalize", action="store_true", 206 | help="normalize embeddings to unit sphere before calculating " 207 | "distances (default is not to do this)" 208 | ) 209 | parser.set_defaults(normalize=False) 210 | if len(sys.argv) == 1: 211 | parser.print_help() 212 | sys.exit(1) 213 | return parser.parse_args() 214 | 215 | 216 | def get_embeds_and_labels(embeds_dict, types=None): 217 | embeddings = [] 218 | labels = [] 219 | for utt in embeds_dict: 220 | if "_" in utt: 221 | label = buckeye_utt_to_label(utt) 222 | else: 223 | label = utt 224 | if types is None: 225 | labels.append(label) 226 | embeddings.append(embeds_dict[utt]) 227 | elif label in types: 228 | labels.append(label) 229 | embeddings.append(embeds_dict[utt]) 230 | embeddings = np.array(embeddings) 231 | return embeddings, labels 232 | 233 | 234 | def buckeye_utt_to_label(utt): 235 | return utt.split("_")[0] 236 | 237 | 238 | #-----------------------------------------------------------------------------# 239 | # MAIN FUNCTION # 240 | #-----------------------------------------------------------------------------# 241 | 242 | def main(): 243 | args = check_argv() 244 | 245 | print("Reading:", args.npz_fn) 246 | npz = np.load(args.npz_fn) 247 | 248 | if args.normalize: 249 | print("Normalizing embeddings") 250 | norm_npz = {} 251 | for key in npz: 252 | embed = npz[key] 253 | norm_npz[key] = embed/np.linalg.norm(embed) 254 | npz = norm_npz 255 | 256 | print( 257 | "Minimum embedding value:", np.min([np.min(npz[key]) for key in npz]) 258 | ) 259 | print( 260 | "Maximum embedding value:", np.max([np.max(npz[key]) for key in npz]) 261 | ) 262 | 263 | if args.word_type: 264 | if not "," in args.word_type: 265 | # A single word type 266 | print("Plotting embeddings for type:", args.word_type) 267 | embeddings = [] 268 | for key in npz: 269 | if args.word_type in key: 270 | embed = npz[key] 271 | embeddings.append(embed) 272 | print("No. 
embeddings matching type:", len(embeddings)) 273 | plt.imshow(embeddings, interpolation="nearest", aspect="auto") 274 | else: 275 | # Multiple word types 276 | # plot_embeds_tsne(npz, args.word_type.split(",")) 277 | plot_raw_embeds(npz, args.word_type.split(",")) 278 | plot_embeds_2d(npz, args.word_type.split(",")) 279 | 280 | # print("Example embedding:", npz[npz.keys()[0]]) 281 | 282 | if args.plot_all: 283 | plot_raw_embeds(npz) 284 | # plot_embeds_2d(npz) 285 | 286 | if args.plot_rnd is not None: 287 | print("Analysing", args.plot_rnd, "randomly sampled embeddings") 288 | random.seed(42) 289 | sample_keys = random.sample(npz.keys(), args.plot_rnd) 290 | npz_sampled = {} 291 | for key in sample_keys: 292 | npz_sampled[key] = npz[key] 293 | plot_raw_embeds(npz_sampled) 294 | plot_embeds_2d(npz_sampled) 295 | 296 | if args.word_type or args.plot_all or args.plot_rnd: 297 | plt.show() 298 | 299 | 300 | if __name__ == "__main__": 301 | main() 302 | -------------------------------------------------------------------------------- /qbe/apply_model_dense.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Apply a model to dense segmentationi intervals. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2018, 2019 9 | """ 10 | 11 | from datetime import datetime 12 | from os import path 13 | from tqdm import tqdm 14 | import argparse 15 | import pickle 16 | import numpy as np 17 | import os 18 | import sys 19 | import tensorflow as tf 20 | 21 | sys.path.append(path.join("..", "src")) 22 | sys.path.append(path.join("..", "embeddings")) 23 | 24 | from apply_model import build_model 25 | from tflego import NP_DTYPE, TF_DTYPE, NP_ITYPE, TF_ITYPE 26 | import batching 27 | import data_io 28 | 29 | 30 | #-----------------------------------------------------------------------------# 31 | # APPLY MODEL FUNCTIONS # 32 | #-----------------------------------------------------------------------------# 33 | 34 | """ 35 | def build_model(x, x_lengths, options_dict): 36 | model_dict = {} 37 | if options_dict["script"] == "train_cae_rnn": 38 | import train_cae_rnn 39 | cae = train_cae_rnn.build_cae_from_options_dict( 40 | x, x_lengths, x_lengths, options_dict 41 | ) 42 | model_dict["output"] = cae["y"] 43 | model_dict["encoding"] = cae["z"] 44 | model_dict["mask"] = cae["mask"] 45 | elif options_dict["script"] == "train_vae": 46 | import train_vae 47 | vae = train_vae.build_vae_from_options_dict(x, x_lengths, options_dict) 48 | model_dict["output"] = vae["decoder_output"] 49 | model_dict["encoding"] = vae["latent_layer"]["z_mean"] 50 | model_dict["mask"] = vae["mask"] 51 | elif options_dict["script"] == "train_siamese_rnn": 52 | import train_siamese_rnn 53 | siamese = train_siamese_rnn.build_siamese_from_options_dict( 54 | x, x_lengths, options_dict 55 | ) 56 | model_dict["encoding"] = siamese["output"] 57 | elif options_dict["script"] == "train_siamese_cnn": 58 | import train_siamese_cnn 59 | siamese = train_siamese_cnn.build_siamese_cnn_from_options_dict( 60 | x, options_dict 61 | ) 62 | model_dict["encoding"] = siamese["output"] 63 | elif options_dict["script"] == "train_rnn": 64 | import train_rnn 65 | rnn = train_rnn.build_rnn_from_options_dict( 66 | x, x_lengths, options_dict 67 | ) 68 | model_dict["encoding"] = rnn["encoding"] 69 | else: 70 | assert False, "model type not supported" 71 | return model_dict 72 | """ 73 | 74 | 75 | def apply_model(model_fn, language, subset, segtag): 76 | 77 | # Load the model 
options 78 | model_dir = path.split(model_fn)[0] 79 | options_dict_fn = path.join(model_dir, "options_dict.pkl") 80 | print("Reading:", options_dict_fn) 81 | with open(options_dict_fn, "rb") as f: 82 | options_dict = pickle.load(f) 83 | 84 | # Load data and intervals 85 | npz_fn = path.join("data", language, subset + ".npz") 86 | x_data, labels, lengths, keys, speakers = data_io.load_data_from_npz( 87 | npz_fn 88 | ) 89 | seglist_fn = path.join( 90 | "data", language, "search.seglist." + segtag + ".pkl" 91 | ) 92 | print("Reading:", seglist_fn) 93 | with open(seglist_fn, "rb") as f: 94 | seglist_dict = pickle.load(f) 95 | seglists = [seglist_dict[i] for i in keys] 96 | print("No. utterances:", len(x_data)) 97 | n_intervals = sum([len(i) for i in seglists]) 98 | print("No. intervals:", n_intervals) 99 | 100 | # assert False 101 | # print("Reading:", input_npz_fn) 102 | # features_dict = np.load(input_npz_fn) 103 | # seglist_fn = path.join( 104 | # "data", language, "search.seglist." + segtag + ".pkl" 105 | # ) 106 | # print("Reading:", seglist_fn) 107 | # with open(seglist_fn, "rb") as f: 108 | # seglist_dict = pickle.load(f) 109 | # utterances = sorted(features_dict.keys()) 110 | # input_sequences = [features_dict[i] for i in utterances] 111 | # seglists = [seglist_dict[i] for i in utterances] 112 | # print("No. utterances:", len(input_sequences)) 113 | # n_intervals = sum([len(i) for i in seglists]) 114 | # print("No. intervals:", n_intervals) 115 | 116 | # if "cnn" in options_dict["script"]: 117 | # assert False, "to-do" 118 | # else: # rnn 119 | 120 | # print("No. utterances:", len(input_sequences)) 121 | # n_intervals = sum([len(i) for i in seglists]) 122 | # print("No. intervals:", n_intervals) 123 | 124 | 125 | # # Load data 126 | # npz_fn = path.join("data", language, subset + ".npz") 127 | # x_data, labels, lengths, keys, speakers = data_io.load_data_from_npz( 128 | # npz_fn 129 | # ) 130 | 131 | 132 | if "cnn" in options_dict["script"]: 133 | 134 | assert False, "to-do" 135 | 136 | # Pad and flatten data 137 | x_data, _ = data_io.pad_sequences( 138 | x_data, options_dict["max_length"], True 139 | ) 140 | x_data = np.transpose(x_data, (0, 2, 1)) 141 | x_data = x_data.reshape((-1, options_dict["d_in"])) 142 | 143 | # Build model 144 | x = tf.placeholder(TF_DTYPE, [None, options_dict["d_in"]]) 145 | model = build_model(x, None, options_dict) 146 | 147 | # Embed data 148 | batch_iterator = batching.LabelledIterator( 149 | x_data, None, x_data.shape[0], False 150 | ) 151 | saver = tf.train.Saver() 152 | with tf.Session() as session: 153 | saver.restore(session, model_fn) 154 | for batch_x in batch_iterator: 155 | np_z = session.run( 156 | [model["encoding"]], feed_dict={x: batch_x})[0] 157 | break # single batch 158 | 159 | else: # rnn 160 | 161 | # Truncate and limit dimensionality 162 | data_io.trunc_and_limit_dim( 163 | x_data, lengths, options_dict["n_input"], None 164 | ) 165 | 166 | class DenseBatchFeedIterator(object): 167 | 168 | def __init__(self, input_sequences, seglists): 169 | self.input_sequences = input_sequences 170 | self.n_input = self.input_sequences[0].shape[-1] 171 | self.seglists = seglists 172 | 173 | def __iter__(self): 174 | for i_utt in range(len(self.input_sequences)): 175 | 176 | # Get intervals 177 | seglist = self.seglists[i_utt] 178 | input_sequence = self.input_sequences[i_utt] 179 | 180 | # Get segments for intervals 181 | segments = [] 182 | for i, j in seglist: 183 | segments.append(input_sequence[i:j, :]) 184 | 185 | batch_x_lengths = [i.shape[0] for 
i in segments] 186 | 187 | # Pad to maximum length in batch 188 | batch_x_padded = np.zeros( 189 | (len(batch_x_lengths), np.max(batch_x_lengths), 190 | self.n_input), dtype=NP_DTYPE 191 | ) 192 | for i, length in enumerate(batch_x_lengths): 193 | seq = segments[i] 194 | batch_x_padded[i, :length, :] = seq 195 | 196 | yield (batch_x_padded, batch_x_lengths) 197 | 198 | batch_iterator = DenseBatchFeedIterator(x_data, seglists) 199 | 200 | # Build model 201 | x = tf.placeholder(TF_DTYPE, [None, None, options_dict["n_input"]]) 202 | x_lengths = tf.placeholder(TF_ITYPE, [None]) 203 | model = build_model(x, x_lengths, options_dict) 204 | 205 | # Embed data 206 | # batch_iterator = batching.SimpleIterator(x_data, len(x_data), False) 207 | saver = tf.train.Saver() 208 | n_outputs = 0 209 | embed_dict = {} 210 | with tf.Session() as session: 211 | saver.restore(session, model_fn) 212 | # print(datetime.now()) 213 | print( 214 | "Applying model to segments ({} iterations):".format( 215 | len(x_data)) 216 | ) 217 | for i_batch, (batch_x_padded, batch_x_lengths) in \ 218 | tqdm(enumerate(batch_iterator)): 219 | cur_output = session.run( 220 | [model["encoding"]], feed_dict={x: batch_x_padded, 221 | x_lengths: batch_x_lengths} 222 | )[0] 223 | utt_key = keys[i_batch] 224 | seglist = seglists[i_batch] 225 | embeddings = [] 226 | for i in range(cur_output.shape[0]): 227 | embeddings.append(cur_output[i, :]) 228 | n_outputs += 1 229 | embed_dict[utt_key] = np.array(embeddings) 230 | # print(datetime.now()) 231 | 232 | # for batch_x_padded, batch_x_lengths in batch_iterator: 233 | # np_x = batch_x_padded 234 | # np_x_lengths = batch_x_lengths 235 | # np_z = session.run( 236 | # [model["encoding"]], feed_dict={x: np_x, x_lengths: 237 | # np_x_lengths} 238 | # )[0] 239 | # break # single batch 240 | 241 | print("Processed {} out of {} inputs".format(n_outputs, n_intervals)) 242 | 243 | return embed_dict 244 | 245 | 246 | #-----------------------------------------------------------------------------# 247 | # UTILITY FUNCTIONS # 248 | #-----------------------------------------------------------------------------# 249 | 250 | def check_argv(): 251 | """Check the command line arguments.""" 252 | parser = argparse.ArgumentParser( 253 | description=__doc__.strip().split("\n")[0], add_help=False 254 | ) 255 | parser.add_argument("model_fn", type=str, help="model checkpoint filename") 256 | parser.add_argument( 257 | "language", type=str, help="GlobalPhone language", 258 | choices=["HA"] 259 | ) 260 | parser.add_argument( 261 | "subset", type=str, help="subset to apply model to", 262 | choices=["search.0", "search.1", "search.test"] 263 | ) 264 | parser.add_argument( 265 | "--segtag", type=str, 266 | help="a tag to identify the dense segments lists " 267 | "(default: %(default)s)", default="min_20.max_60.step_3" 268 | ) 269 | if len(sys.argv) == 1: 270 | parser.print_help() 271 | sys.exit(1) 272 | return parser.parse_args() 273 | 274 | 275 | #-----------------------------------------------------------------------------# 276 | # MAIN FUNCTION # 277 | #-----------------------------------------------------------------------------# 278 | 279 | def main(): 280 | args = check_argv() 281 | 282 | # Do not output TensorFlow info and warning messages 283 | import warnings 284 | warnings.filterwarnings("ignore") 285 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" 286 | tf.logging.set_verbosity(tf.logging.ERROR) 287 | if type(tf.contrib) != type(tf): 288 | tf.contrib._warning = None 289 | 290 | # Embed data 291 | embed_dict = 
apply_model( 292 | args.model_fn, args.language, args.subset, args.segtag 293 | ) 294 | 295 | # Save embeddings 296 | model_dir, model_fn = path.split(args.model_fn) 297 | model_key = path.split(path.normpath(model_dir))[1] 298 | output_dir = path.join("exp", args.language, model_key + "." + args.segtag) 299 | if not path.isdir(output_dir): 300 | os.makedirs(output_dir) 301 | npz_fn = path.join(output_dir, args.subset + ".npz") 302 | print("Writing:", npz_fn) 303 | np.savez_compressed(npz_fn, **embed_dict) 304 | print(datetime.now()) 305 | 306 | 307 | if __name__ == "__main__": 308 | main() 309 | -------------------------------------------------------------------------------- /blackbox/extract_analysis_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Extract MFCC features for a GlobalPhone language for further analysis. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from collections import Counter 12 | from os import path 13 | import argparse 14 | import codecs 15 | import numpy as np 16 | import os 17 | import random 18 | import sys 19 | 20 | sys.path.append("..") 21 | sys.path.append(path.join("..", "features")) 22 | 23 | from paths import gp_alignments_dir 24 | import utils 25 | 26 | 27 | #-----------------------------------------------------------------------------# 28 | # UTILITY FUNCTIONS # 29 | #-----------------------------------------------------------------------------# 30 | 31 | def check_argv(): 32 | """Check the command line arguments.""" 33 | parser = argparse.ArgumentParser( 34 | description=__doc__.strip().split("\n")[0], add_help=False 35 | ) 36 | parser.add_argument( 37 | "language", type=str, help="GlobalPhone language", 38 | choices=["BG", "CH", "CR", "CZ", "FR", "GE", "HA", "KO", "PL", "PO", 39 | "RU", "SP", "SW", "TH", "TU", "VN"] 40 | ) 41 | parser.add_argument( 42 | "--analyse", action="store_true", 43 | help="intermediate list analysis", default=False 44 | ) 45 | if len(sys.argv) == 1: 46 | parser.print_help() 47 | sys.exit(1) 48 | return parser.parse_args() 49 | 50 | 51 | def read_fa(fa_fn): 52 | """ 53 | Return a dict of list of (start_time, end_time, label) with utterance keys. 54 | """ 55 | fa_dict = {} 56 | with codecs.open(fa_fn) as f: 57 | for line in f: 58 | utt_key, _, start, duration, label = line.strip().split() 59 | start = float(start) 60 | duration = float(duration) 61 | end = start + duration 62 | if not utt_key in fa_dict: 63 | fa_dict[utt_key] = [] 64 | fa_dict[utt_key].append((start, end, label)) 65 | return fa_dict 66 | 67 | 68 | def pronunciations_from_fa(word_fa_fn, phone_fa_fn): 69 | """ 70 | Return a dict of word tokens with pronunciations using forced alignments. 71 | 72 | The dictionary keys are the word token keys and the values are lists of 73 | phone labels. 
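    Keys follow the segment-key convention used elsewhere in the repository,
    "<word>_<utt_key>_<start_frame>-<end_frame>", with the frame indices
    zero-padded to six digits.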
74 | """ 75 | 76 | # Read forced alignments 77 | # phone_fa[utt_key] is list of (start_time, end_time, phone) 78 | print("Reading:", phone_fa_fn) 79 | phone_fa = read_fa(phone_fa_fn) 80 | print("Reading:", word_fa_fn) 81 | word_fa = read_fa(word_fa_fn) 82 | 83 | # For each word 84 | pronunciations_dict = {} 85 | for utt_key in sorted(word_fa): 86 | for word_start, word_end, word in word_fa[utt_key]: 87 | 88 | if word == "": 89 | continue 90 | 91 | # Find phone sequence 92 | phone_sequence = [] 93 | for (phone_start, phone_end, phone) in phone_fa[utt_key]: 94 | if (phone_start >= word_start and phone_start < word_end and 95 | phone != "sil"): 96 | # Phone is in word 97 | phone = phone.split("_")[0] 98 | phone_sequence.append(phone) 99 | assert len(phone_sequence) != 0, "pronunciation not found" 100 | word_start_frame = int(round(word_start*100)) 101 | word_end_frame = int(round(word_end*100)) 102 | segment_key = "{}_{}_{:06d}-{:06d}".format( 103 | word, utt_key, word_start_frame, word_end_frame + 1 104 | ) 105 | pronunciations_dict[segment_key] = phone_sequence 106 | 107 | return pronunciations_dict 108 | 109 | 110 | def filter_segment_keys(segment_keys, n_min_tokens_per_type=0, 111 | n_max_tokens_per_type=np.inf, n_max_tokens=np.inf): 112 | 113 | random.seed(1) 114 | random.shuffle(segment_keys) 115 | labels = [i.split("_")[0] for i in segment_keys] 116 | 117 | # Find valid types 118 | valid_types = [] 119 | counts = Counter(labels) 120 | for key in counts: 121 | if counts[key] >= n_min_tokens_per_type: 122 | valid_types.append(key) 123 | 124 | # Filter 125 | filtered_keys = [] 126 | tokens_per_type = Counter() 127 | for i in range(len(labels)): 128 | label = labels[i] 129 | if (label in valid_types and tokens_per_type[label] <= 130 | n_max_tokens_per_type): 131 | filtered_keys.append(segment_keys[i]) 132 | tokens_per_type[label] += 1 133 | 134 | if n_max_tokens != np.inf: 135 | random.shuffle(filtered_keys) 136 | filtered_keys = filtered_keys[:n_max_tokens] 137 | 138 | return filtered_keys 139 | 140 | 141 | #-----------------------------------------------------------------------------# 142 | # MAIN FUNCTION # 143 | #-----------------------------------------------------------------------------# 144 | 145 | def main(): 146 | args = check_argv() 147 | feat_type = "mfcc" 148 | 149 | list_dir = path.join("lists", args.language) 150 | if not path.isdir(list_dir): 151 | os.makedirs(list_dir) 152 | feat_dir = path.join(feat_type, args.language) 153 | if not path.isdir(feat_dir): 154 | os.makedirs(feat_dir) 155 | 156 | # All ground truth word segments with pronunciations 157 | for subset in ["dev"]: #, "eval", "train"]: 158 | 159 | list_fn = path.join(list_dir, subset + ".all_gt_words.list") 160 | pronunciations_fn = path.join(list_dir, subset + ".prons") 161 | 162 | # Read forced alignments and obtain pronunciations 163 | word_fa_fn = path.join( 164 | gp_alignments_dir, args.language, subset + ".ctm" 165 | ) 166 | phone_fa_fn = path.join( 167 | # gp_alignments_dir, args.language, subset + ".phone.ctm" 168 | gp_alignments_dir, args.language, subset + ".phone.ipa.ctm" 169 | ) 170 | if not path.isfile(phone_fa_fn): 171 | print("Warning: IPA pronunciations not found") 172 | phone_fa_fn = path.join( 173 | gp_alignments_dir, args.language, subset + ".phone.ctm" 174 | ) 175 | pronunciations_dict = pronunciations_from_fa( 176 | word_fa_fn, phone_fa_fn 177 | ) 178 | 179 | # Write pronunciation list 180 | if not path.isfile(pronunciations_fn): 181 | print("Writing:", pronunciations_fn) 182 | with 
codecs.open(pronunciations_fn, "w", "utf-8") as f: 183 | for segment_key in sorted(pronunciations_dict): 184 | f.write( 185 | segment_key + " " + 186 | ",".join(pronunciations_dict[segment_key]) + "\n" 187 | ) 188 | else: 189 | print("Using existing file:", pronunciations_fn) 190 | 191 | # Write word list 192 | if not path.isfile(list_fn): 193 | print("Writing:", list_fn) 194 | with codecs.open(list_fn, "w", "utf-8") as f: 195 | for segment_key in sorted(pronunciations_dict): 196 | f.write(segment_key + "\n") 197 | else: 198 | print("Using existing file:", list_fn) 199 | 200 | # Write individual phone list 201 | phone_list_fn = path.join(list_dir, subset + ".phone.list") 202 | if not path.isfile(phone_list_fn): 203 | utils.filter_words( 204 | phone_fa_fn, phone_list_fn, min_frames=5, min_chars=0 205 | ) 206 | else: 207 | print("Using existing file:", phone_list_fn) 208 | 209 | # Filter phones 210 | print("Reading:", phone_list_fn) 211 | phone_segment_keys = [] 212 | with codecs.open(phone_list_fn, "r", "utf-8") as f: 213 | for line in f: 214 | phone_segment_keys.append(line.strip()) 215 | phone_filtered_keys = filter_segment_keys( 216 | phone_segment_keys, n_max_tokens=5000 217 | ) 218 | phone_filtered_list_fn = path.join( 219 | list_dir, subset + ".filter1_phone.list" 220 | ) 221 | print("Writing:", phone_filtered_list_fn) 222 | if not path.isfile(phone_filtered_list_fn): 223 | with codecs.open(phone_filtered_list_fn, "w", "utf-8") as f: 224 | for segment_key in sorted(phone_filtered_keys): 225 | f.write(segment_key + "\n") 226 | else: 227 | print("Using existing file:", phone_filtered_list_fn) 228 | 229 | # Extract phone segments from the MFCC NumPy archives 230 | input_npz_fn = path.join( 231 | "..", "features", feat_type, args.language, args.language.lower() + 232 | "." + subset + ".npz" 233 | ) 234 | output_npz_fn = path.join( 235 | feat_dir, args.language.lower() + "." + subset + 236 | ".filter1_phone.npz" 237 | ) 238 | if not path.isfile(output_npz_fn): 239 | utils.segments_from_npz( 240 | input_npz_fn, phone_filtered_list_fn, output_npz_fn 241 | ) 242 | else: 243 | print("Using existing file:", output_npz_fn) 244 | 245 | if args.analyse: 246 | import matplotlib.pyplot as plt 247 | import numpy as np 248 | 249 | # Most common words 250 | labels = [i.split("_")[0] for i in pronunciations_dict] 251 | counter = Counter(labels) 252 | print("No. word types:", len(counter)) 253 | print("No. word tokens:", len(labels)) 254 | print("Most common words:", counter.most_common(10)) 255 | 256 | # Histogram of word count 257 | counts = counter.values() 258 | plt.figure() 259 | plt.hist(counts, 50) 260 | plt.yscale("log") 261 | plt.ylabel("No. of types with this many tokens") 262 | plt.xlabel("No. of tokens") 263 | 264 | # # Temp 265 | # # Most common words 266 | # labels = [i.split("_")[0] for i in filtered_keys] 267 | # counter = Counter(labels) 268 | # print("No. word types:", len(counter)) 269 | # print("No. word tokens:", len(labels)) 270 | # print("Most common words:", counter.most_common(10)) 271 | 272 | # # Histogram of word count 273 | # counts = counter.values() 274 | # plt.figure() 275 | # plt.hist(counts, 50) 276 | # plt.yscale("log") 277 | # plt.ylabel("No. of types with this many tokens") 278 | # plt.xlabel("No. 
of tokens") 279 | 280 | plt.show() 281 | 282 | # Filter 1 283 | print("Applying filter 1") 284 | n_min_tokens_per_type = 10 285 | n_max_tokens_per_type = 25 286 | filtered_keys = filter_segment_keys( 287 | list(pronunciations_dict), n_min_tokens_per_type, 288 | n_max_tokens_per_type 289 | ) 290 | print("No. tokens:", len(filtered_keys)) 291 | print( 292 | "No. types:", len(set([i.split("_")[0] for i in filtered_keys])) 293 | ) 294 | filtered_list_fn = path.join(list_dir, subset + ".filter1_gt.list") 295 | print("Writing:", filtered_list_fn) 296 | if not path.isfile(filtered_list_fn): 297 | with codecs.open(filtered_list_fn, "w", "utf-8") as f: 298 | for segment_key in sorted(filtered_keys): 299 | f.write(segment_key + "\n") 300 | else: 301 | print("Using existing file:", filtered_list_fn) 302 | 303 | # Extract word segments from the MFCC NumPy archives 304 | input_npz_fn = path.join( 305 | "..", "features", feat_type, args.language, args.language.lower() + 306 | "." + subset + ".npz" 307 | ) 308 | output_npz_fn = path.join( 309 | feat_dir, args.language.lower() + "." + subset + ".filter1_gt.npz" 310 | ) 311 | if not path.isfile(output_npz_fn): 312 | utils.segments_from_npz( 313 | input_npz_fn, filtered_list_fn, output_npz_fn 314 | ) 315 | else: 316 | print("Using existing file:", output_npz_fn) 317 | 318 | # dev.filtered_gt_words.list 319 | 320 | if __name__ == "__main__": 321 | main() 322 | -------------------------------------------------------------------------------- /blackbox/analyse_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Analyse all pair-wise distances and compare to a number of other properties. 5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2019 9 | """ 10 | 11 | from os import path 12 | from scipy.spatial.distance import pdist 13 | from tqdm import tqdm 14 | import argparse 15 | import codecs 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | import random 19 | import sys 20 | 21 | sys.path.append(path.join("..", "..", "src", "speech_dtw", "utils")) 22 | 23 | from dp_align import DPEntry, DPError 24 | import dp_align 25 | import samediff 26 | 27 | 28 | #-----------------------------------------------------------------------------# 29 | # UTILITY FUNCTIONS # 30 | #-----------------------------------------------------------------------------# 31 | 32 | def check_argv(): 33 | """Check the command line arguments.""" 34 | parser = argparse.ArgumentParser( 35 | description=__doc__.strip().split("\n")[0], add_help=False 36 | ) 37 | parser.add_argument("npz_fn", type=str, help="NumPy archive of embeddings") 38 | parser.add_argument( 39 | "--pronunciation", type=str, 40 | help="if provided, the pronunciations for this GlobalPhone " 41 | "language is used", choices=["BG", "CH", "CR", "CZ", "FR", "GE", "HA", 42 | "KO", "PL", "PO", "RU", "SP", "SW", "TH", "TU", "VN"], default=None 43 | ) 44 | if len(sys.argv) == 1: 45 | parser.print_help() 46 | sys.exit(1) 47 | return parser.parse_args() 48 | 49 | 50 | def editdistance_array(labels): 51 | """ 52 | Return an array of int in the same order as the distances from 53 | `scipy.spatial.distance.pdist` indicating the edit distance between all 54 | pairs of labels. 
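For example, for labels [l1, l2, l3, l4] the returned order is (l1, l2), (l1, l3), (l1, l4), (l2, l3), (l2, l4), (l3, l4), i.e. the same condensed ordering used by scipy's pdist.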
55 | """ 56 | N = len(labels) 57 | edits = np.zeros(int(N*(N - 1)/2), dtype=int) 58 | 59 | # Calculate the edit distance for every pair of labels 60 | cur_edits_i = 0 61 | for n in tqdm(range(N - 1)): 62 | cur_label = labels[n] 63 | # distances = [] 64 | for i_offset, test_label in enumerate(labels[n + 1:]): 65 | a = dp_align.dp_align(cur_label, test_label) 66 | edits[cur_edits_i + i_offset] = a.get_levenshtein() 67 | # print( 68 | # "Distance between {} and {}: {}".format(cur_label, test_label, 69 | # a.get_levenshtein()) 70 | # ) 71 | # edits[cur_edits_i:cur_edits_i + (N - n) - 1] = distances 72 | # edits[cur_edits_i:cur_edits_i + (N - n) - 1] = np.asarray( 73 | # labels[n + 1:] 74 | # ) == cur_label 75 | cur_edits_i += N - n - 1 76 | 77 | return edits 78 | 79 | 80 | def read_pronunciations(fn): 81 | pronunciations = {} 82 | with codecs.open(fn, "r", "utf-8") as f: 83 | for line in f: 84 | utt_key, pronunciation = line.strip().split() 85 | pronunciations[utt_key] = pronunciation.split(",") 86 | return pronunciations 87 | 88 | 89 | #-----------------------------------------------------------------------------# 90 | # SPECIALISED ALIGNMENT FUNCTION # 91 | #-----------------------------------------------------------------------------# 92 | 93 | def dp_align_edit_positions(ref_list, test_list, ins_penalty=3, del_penalty=3, 94 | sub_penalty=4): 95 | """ 96 | Determines whether a edit operation occurs in the beginning, middle or end. 97 | 98 | Parameters 99 | ---------- 100 | ref_list : list 101 | test_list : list 102 | 103 | Return 104 | ------ 105 | dp_errors, edit_start, edit_middle, edit_end : DPError, (bool, bool, bool) 106 | """ 107 | 108 | # Initialise the alignment matrix 109 | dp_matrix = np.empty( 110 | [len(test_list) + 1, len(ref_list) + 1], dtype = object 111 | ) 112 | for i in range(len(test_list) + 1): 113 | for j in range(len(ref_list) + 1): 114 | dp_matrix[i][j] = DPEntry() 115 | 116 | # Initialise the origin 117 | dp_matrix[0][0].score = 0 118 | dp_matrix[0][0].align = "m" 119 | 120 | # The first row is all delections: 121 | for j in range(1, len(ref_list) + 1): 122 | dp_matrix[0][j].score = j*del_penalty 123 | dp_matrix[0][j].align = "d" 124 | 125 | # Fill dp_matrix 126 | for i in range(1, len(test_list) + 1): 127 | 128 | # First column is all insertions 129 | dp_matrix[i][0].score = i*ins_penalty 130 | dp_matrix[i][0].align = "i" 131 | 132 | for j in range(1, len(ref_list) + 1): 133 | del_score = dp_matrix[i, j - 1].score + del_penalty 134 | ins_score = dp_matrix[i - 1, j].score + ins_penalty 135 | 136 | if test_list[i - 1] == ref_list[j - 1]: 137 | 138 | # Considering a match 139 | match_score = dp_matrix[i - 1, j - 1].score 140 | 141 | # Test for a match 142 | if match_score <= del_score and match_score <= ins_score: 143 | dp_matrix[i, j].score = match_score 144 | dp_matrix[i, j].align = "m" 145 | # Test for a deletion 146 | elif del_score <= ins_score: 147 | dp_matrix[i, j].score = del_score 148 | dp_matrix[i, j].align = "d" 149 | # Test for an insertion (only option left) 150 | else: 151 | dp_matrix[i, j].score = ins_score 152 | dp_matrix[i, j].align = "i" 153 | 154 | else: 155 | 156 | # Considering a substitution 157 | sub_score = dp_matrix[i - 1, j - 1].score + sub_penalty 158 | 159 | # Test for a substitution 160 | if sub_score < del_score and sub_score <= ins_score: 161 | dp_matrix[i, j].score = sub_score 162 | dp_matrix[i, j].align = "s" 163 | # Test for a deletion 164 | elif del_score <= ins_score: 165 | dp_matrix[i, j].score = del_score 166 | dp_matrix[i, 
j].align = "d" 167 | # Test for an insertion (only option left) 168 | else: 169 | dp_matrix[i, j].score = ins_score 170 | dp_matrix[i, j].align = "i" 171 | 172 | # Perform alignment by tracking through the dp_matrix 173 | dp_errors = DPError() 174 | dp_errors.n_total = len(ref_list) 175 | i = len(test_list) 176 | j = len(ref_list) 177 | edit_start = False 178 | edit_end = False 179 | edit_middle = False 180 | while i > 0 or j > 0: 181 | if dp_matrix[i, j].align == "m": 182 | i -= 1 183 | j -= 1 184 | dp_errors.n_match += 1 185 | elif dp_matrix[i, j].align == "s": 186 | if i == len(test_list) and j == len(ref_list): 187 | edit_end = True 188 | elif i == 1 and j == 1: 189 | edit_start = True 190 | else: 191 | edit_middle = True 192 | i -= 1 193 | j -= 1 194 | dp_errors.n_sub += 1 195 | elif dp_matrix[i, j].align == "d": 196 | if i == len(test_list) and j == len(ref_list): 197 | edit_end = True 198 | elif i == 0 and j == 1: 199 | edit_start = True 200 | else: 201 | edit_middle = True 202 | j -= 1 203 | dp_errors.n_del += 1 204 | elif dp_matrix[i, j].align == "i": 205 | if i == len(test_list) and j == len(ref_list): 206 | edit_end = True 207 | elif i == 1 and j == 0: 208 | edit_start = True 209 | else: 210 | edit_middle = True 211 | i -= 1 212 | dp_errors.n_ins += 1 213 | 214 | # Return the alignment and edit positions 215 | return dp_errors, edit_start, edit_middle, edit_end 216 | 217 | 218 | #-----------------------------------------------------------------------------# 219 | # MAIN FUNCTION # 220 | #-----------------------------------------------------------------------------# 221 | 222 | def main(): 223 | args = check_argv() 224 | 225 | print("Reading:", args.npz_fn) 226 | embeddings = np.load(args.npz_fn) 227 | 228 | # # Temp 229 | # data = {} 230 | # a = list(embeddings) 231 | # random.shuffle(a) 232 | # for key in a[:100]: 233 | # data[key] = embeddings[key] 234 | # embeddings = data 235 | 236 | print("Ordering embeddings:") 237 | n_embeds = 0 238 | X = [] 239 | utt_keys = [] 240 | labels = [] 241 | speakers = [] 242 | for utt_key in tqdm(sorted(embeddings)): 243 | utt_keys.append(utt_key) 244 | X.append(embeddings[utt_key]) 245 | utt_key = utt_key.split("_") 246 | label = utt_key[0] 247 | speaker = utt_key[1] 248 | labels.append(label) 249 | speakers.append(speaker) 250 | X = np.array(X) 251 | print("No. embeddings:", X.shape[0]) 252 | print("Embedding dimensionality:", X.shape[1]) 253 | 254 | # Normalise 255 | normed = (X - X.mean(axis=0)) / X.std(axis=0) 256 | X = normed 257 | 258 | print("Calculating distances") 259 | distances = pdist(X, metric="cosine") 260 | 261 | # Plot: Matching words 262 | print("Getting word matches") 263 | word_matches = samediff.generate_matches_array(labels) 264 | print("Total no. pairs:", word_matches.shape[0]) 265 | print("No. 
same-word pairs:", sum(word_matches)) 266 | distances_pos_avg = np.mean(distances[word_matches == True]) 267 | distances_neg_avg = np.mean(distances[word_matches == False]) 268 | distances_pos_std = np.std(distances[word_matches == True]) 269 | distances_neg_std = np.std(distances[word_matches == False]) 270 | plt.figure() 271 | plt.bar( 272 | [0, 1], [distances_neg_avg, distances_pos_avg], 273 | yerr=[distances_neg_std, distances_pos_std] 274 | ) 275 | plt.xticks([0, 1], ("No", "Yes")) 276 | plt.xlabel("Matching words") 277 | plt.ylabel("Cosine distance") 278 | plt.ylim([0, 1.2]) 279 | 280 | # Plot: Same speakers 281 | print("Getting speaker matches") 282 | speaker_matches = samediff.generate_matches_array(speakers) 283 | print("No. same-speaker pairs:", sum(speaker_matches)) 284 | distances_pos_avg = np.mean( 285 | distances[np.logical_and(word_matches, speaker_matches)] 286 | ) 287 | distances_neg_avg = np.mean( 288 | distances[np.logical_and(word_matches, speaker_matches == False)] 289 | ) 290 | distances_pos_std = np.std( 291 | distances[np.logical_and(word_matches, speaker_matches)] 292 | ) 293 | distances_neg_std = np.std( 294 | distances[np.logical_and(word_matches, speaker_matches == False)] 295 | ) 296 | # distances_pos_avg = np.mean(distances[speaker_matches == True]) 297 | # distances_neg_avg = np.mean(distances[speaker_matches == False]) 298 | # distances_pos_std = np.std(distances[speaker_matches == True]) 299 | # distances_neg_std = np.std(distances[speaker_matches == False]) 300 | plt.figure() 301 | plt.bar( 302 | [0, 1], [distances_neg_avg, distances_pos_avg], 303 | yerr=[distances_neg_std, distances_pos_std] 304 | ) 305 | plt.xticks([0, 1], ("No", "Yes")) 306 | plt.xlabel("Matching speakers") 307 | plt.ylabel("Cosine distance") 308 | plt.ylim([0, 1.2]) 309 | plt.title("Distances between same-word pairs") 310 | 311 | # Plot: Edit distances 312 | if args.pronunciation is not None: 313 | 314 | # Pronunciations 315 | pron_fn = path.join("lists", args.pronunciation, "dev.prons") 316 | print("Reading:", pron_fn) 317 | pronunciations = read_pronunciations(pron_fn) 318 | pron_labels = [] 319 | for utt_key in utt_keys: 320 | pron_labels.append(pronunciations[utt_key]) 321 | 322 | # Get distances 323 | print("Getting edit distances:") 324 | # edit_distances = editdistance_array(labels) 325 | edit_distances = editdistance_array(pron_labels) 326 | 327 | # Plot distances 328 | edits = sorted(set(edit_distances)) 329 | averages = [] 330 | stds = [] 331 | for edit in edits: 332 | averages.append(np.mean(distances[edit_distances == edit])) 333 | stds.append(np.std(distances[edit_distances == edit])) 334 | plt.figure() 335 | plt.bar(edits, averages, yerr=stds) 336 | plt.ylim([0, 1.2]) 337 | plt.xlabel("Phone edit distance") 338 | plt.ylabel("Cosine distance") 339 | 340 | plt.show() 341 | 342 | 343 | if __name__ == "__main__": 344 | main() 345 | -------------------------------------------------------------------------------- /qbe/eval_qbe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Evaluate QbE performance for a given costs directory. 
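The cost dictionary (a pickled dict of dicts) maps each query segment key to a cost per search utterance; lower costs indicate closer matches.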
5 | 6 | Author: Herman Kamper 7 | Contact: kamperh@gmail.com 8 | Date: 2017, 2019 9 | """ 10 | 11 | from collections import Counter 12 | from os import path 13 | from scipy.interpolate import interp1d 14 | from scipy.optimize import brentq 15 | from tqdm import tqdm 16 | import argparse 17 | import codecs 18 | import pickle 19 | import numpy as np 20 | import sklearn.metrics as metrics 21 | import sys 22 | 23 | sys.path.append("..") 24 | 25 | from paths import gp_alignments_dir 26 | 27 | 28 | #-----------------------------------------------------------------------------# 29 | # UTILITY FUNCTIONS # 30 | #-----------------------------------------------------------------------------# 31 | 32 | def check_argv(): 33 | """Check the command line arguments.""" 34 | parser = argparse.ArgumentParser( 35 | description=__doc__.strip().split("\n")[0], add_help=False 36 | ) 37 | parser.add_argument( 38 | "language", type=str, help="GlobalPhone language", 39 | choices=["HA"] 40 | ) 41 | parser.add_argument( 42 | "cost_dict_fn", type=str, 43 | help="filename of the cost dictionary" 44 | ) 45 | if len(sys.argv) == 1: 46 | parser.print_help() 47 | sys.exit(1) 48 | return parser.parse_args() 49 | 50 | 51 | 52 | #-----------------------------------------------------------------------------# 53 | # EVALUATION FUNCTIONS # 54 | #-----------------------------------------------------------------------------# 55 | 56 | def calculate_eer(y_true, y_score): 57 | # https://yangcha.github.io/EER-ROC/ 58 | fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) 59 | eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 60 | thresh = interp1d(fpr, thresholds)(eer) 61 | return eer 62 | 63 | 64 | def calculate_auc(y_true, y_score): 65 | fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score) 66 | return metrics.auc(fpr, tpr) 67 | 68 | 69 | def eval_precision_recall_fscore(cost_dict, label_dict, threshold, 70 | analyse=False): 71 | """Evaluate precision and recall for a particular output.""" 72 | 73 | # # Get average scores 74 | # avg_keyword_scores = {} 75 | # for keyword in cost_dict: 76 | # scores = [] 77 | # for utt in cost_dict[keyword]: 78 | # scores.append(cost_dict[keyword][utt]) 79 | # avg_keyword_scores[keyword] = np.mean(scores) 80 | # print(avg_keyword_scores) 81 | 82 | # For each utterance, which keywords above threshold 83 | threshold_dict = {} 84 | for keyword in cost_dict: 85 | for utt in cost_dict[keyword]: 86 | if utt not in threshold_dict: 87 | threshold_dict[utt] = [] 88 | if cost_dict[keyword][utt] <= threshold: 89 | # if (cost_dict[keyword][utt] <= 90 | # avg_keyword_scores[keyword]*threshold): 91 | threshold_dict[utt].append(keyword) 92 | keywords = cost_dict.keys() 93 | 94 | # Calculate precision and recall 95 | n_tp = 0 96 | n_pred = 0 97 | n_true = 0 98 | word_tokens_correct = [] 99 | if analyse: 100 | print() 101 | for utt in sorted(threshold_dict): 102 | if utt not in label_dict: 103 | continue 104 | y_pred = threshold_dict[utt] 105 | y_true = [i for i in label_dict[utt].split() if i in keywords] 106 | cur_tokens_correct = set([i for i in y_true if i in y_pred]) 107 | word_tokens_correct.extend(cur_tokens_correct) 108 | n_tp += len(cur_tokens_correct) 109 | n_pred += len(y_pred) 110 | n_true += len(set(y_true)) 111 | if analyse: 112 | if len(y_pred) > 0: 113 | print("-"*79) 114 | print("Utterance:", utt) 115 | print("Predicted:", sorted(y_pred)) 116 | print("Ground truth:", y_true) 117 | if n_pred > 0: 118 | print( 119 | "Current precision: {} / {} = {:.4f}".format( n_tp, 120 | 
n_pred, float(n_tp)/n_pred*100.) 121 | ) 122 | if n_true > 0: 123 | print( 124 | "Current recall: {} / {} = {:.4f}".format( 125 | n_tp, n_true, float(n_tp)/n_true*100.) 126 | ) 127 | precision = float(n_tp)/n_pred if n_pred != 0 else 0 128 | recall = float(n_tp)/n_true 129 | f_score = ( 130 | 2*precision*recall/(precision + recall) if precision + recall != 0 else 131 | 0 132 | ) 133 | 134 | if analyse: 135 | print("-"*79) 136 | print 137 | print( 138 | "Most common correctly predicted words:", 139 | Counter(word_tokens_correct).most_common(15) 140 | ) 141 | 142 | return n_tp, n_pred, n_true, precision, recall, f_score 143 | 144 | 145 | def eval_qbe(cost_dict, label_dict, analyse=False): 146 | """ 147 | Return dictionaries of P@10, P@N and EER for each query item. 148 | 149 | The keys of each of the returned dictionaries are the unique keyword types, 150 | with the value a list of the scores for each of the queries of that keyword 151 | type. 152 | """ 153 | 154 | # Unique keywords with query keys 155 | keyword_dict = {} 156 | for query_key in cost_dict: 157 | keyword = query_key.split("_")[0] 158 | if keyword not in keyword_dict: 159 | keyword_dict[keyword] = [] 160 | keyword_dict[keyword].append(query_key) 161 | 162 | # For each keywords 163 | eer_dict = {} # `eer_dict[keyword]` is a list of EER scores for each query 164 | # of that keyword type 165 | auc_dict = {} 166 | p_at_10_dict = {} 167 | p_at_n_dict = {} 168 | if analyse: 169 | print() 170 | for keyword in tqdm(sorted(keyword_dict)): 171 | 172 | eer_dict[keyword] = [] 173 | auc_dict[keyword] = [] 174 | p_at_10_dict[keyword] = [] 175 | p_at_n_dict[keyword] = [] 176 | 177 | # For each query key 178 | for query_key in sorted(keyword_dict[keyword]): 179 | 180 | # Rank search keys 181 | utt_order = [ 182 | utt_key for utt_key in sorted(cost_dict[query_key], 183 | key=cost_dict[query_key].get) if utt_key in label_dict 184 | ] 185 | 186 | # EER 187 | y_true = [] 188 | for utt_key in utt_order: 189 | if keyword in label_dict[utt_key]: 190 | y_true.append(1) 191 | else: 192 | y_true.append(0) 193 | y_score = [cost_dict[query_key][utt_key] for utt_key in utt_order] 194 | cur_eer = calculate_eer(y_true, [-i for i in y_score]) 195 | cur_auc = calculate_auc(y_true, [-i for i in y_score]) 196 | eer_dict[keyword].append(cur_eer) 197 | auc_dict[keyword].append(cur_auc) 198 | 199 | # P@10 200 | cur_p_at_10 = float(sum(y_true[:10]))/10. 201 | p_at_10_dict[keyword].append(cur_p_at_10) 202 | 203 | # P@N 204 | cur_p_at_n = np.float64(sum(y_true[:sum(y_true)]))/sum(y_true) 205 | p_at_n_dict[keyword].append(cur_p_at_n) 206 | 207 | if analyse: 208 | print("-"*79) 209 | print("Query:", query_key) 210 | print("Current P@10: {:.4f}".format(cur_p_at_10)) 211 | print("Current P@N: {:.4f}".format(cur_p_at_n)) 212 | print("Current EER: {:.4f}".format(cur_eer)) 213 | print("Current AUC: {:.4f}".format(cur_auc)) 214 | # print("Top 10 utterances: ", utt_order[:10]) 215 | print("Top 10 utterances:") 216 | for i_utt, utt in enumerate(utt_order[:10]): 217 | print("{}: {}".format( 218 | # utt, " ".join(label_dict[utt])), end='' 219 | utt, label_dict[utt]), end='' 220 | ) 221 | if y_true[i_utt] == 0: 222 | print(" *") 223 | else: 224 | print() 225 | 226 | if analyse: 227 | print("-"*79) 228 | print() 229 | 230 | return eer_dict, auc_dict, p_at_10_dict, p_at_n_dict 231 | 232 | 233 | def get_avg_scores(score_dict): 234 | """ 235 | Return the overall average, and unweighted average, median and maximum 236 | scores over all keyword types. 
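The unweighted average of the per-keyword minimum scores is also computed and returned as a final avg_min_scores value.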
237 | 238 | Return 239 | ------ 240 | avg_all_scores, avg_avg_scores, avg_median_scores, avg_max_scores 241 | """ 242 | all_scores = [] 243 | avg_scores = [] 244 | median_scores = [] 245 | max_scores = [] 246 | min_scores = [] 247 | 248 | for keyword in score_dict: 249 | all_scores.extend(score_dict[keyword]) 250 | avg_scores.append(np.mean(score_dict[keyword])) 251 | median_scores.append(np.median(score_dict[keyword])) 252 | max_scores.append(np.max(score_dict[keyword])) 253 | min_scores.append(np.min(score_dict[keyword])) 254 | 255 | avg_all_scores = np.mean(all_scores) 256 | avg_avg_scores = np.mean(avg_scores) 257 | avg_median_scores = np.mean(median_scores) 258 | avg_max_scores = np.mean(max_scores) 259 | avg_min_scores = np.mean(min_scores) 260 | 261 | return ( 262 | avg_all_scores, avg_avg_scores, avg_median_scores, avg_max_scores, 263 | avg_min_scores 264 | ) 265 | 266 | def read_forced_alignment(globalphone_fa_fn): 267 | """ 268 | Return a dictionary of transcriptions obtained from a GlobalPhone forced 269 | alignment file. 270 | """ 271 | transcription_dict = {} 272 | with codecs.open(globalphone_fa_fn, "r", "utf-8") as f: 273 | for line in f: 274 | line = line.strip().split(" ") 275 | utterance_key = line[0] 276 | label = line[4].lower() 277 | if utterance_key not in transcription_dict: 278 | transcription_dict[utterance_key] = label 279 | # transcription_dict[utterance_key] = [] 280 | else: 281 | transcription_dict[utterance_key] += " " + label 282 | # transcription_dict[utterance_key].append(label) 283 | return transcription_dict 284 | 285 | 286 | #-----------------------------------------------------------------------------# 287 | # MAIN FUNCTION # 288 | #-----------------------------------------------------------------------------# 289 | 290 | def main(): 291 | args = check_argv() 292 | 293 | fn = path.join(args.cost_dict_fn) 294 | print("Reading:", fn) 295 | with open(fn, "rb") as f: 296 | cost_dict = pickle.load(f) 297 | print( 298 | "Keywords: " + ", ".join(sorted(set([i.split("_")[0] for i in 299 | cost_dict.keys()]))) 300 | ) 301 | 302 | globalphone_fa_fn = path.join(gp_alignments_dir, args.language, "eval.ctm") 303 | print("Reading:", globalphone_fa_fn) 304 | transcription_dict = read_forced_alignment(globalphone_fa_fn) 305 | # print(transcription_dict) 306 | 307 | print("Evaluating:") 308 | eer_dict, auc_dict, p_at_10_dict, p_at_n_dict = eval_qbe( 309 | cost_dict, transcription_dict 310 | ) 311 | 312 | eer_overall, eer_avg, eer_median, eer_max, eer_min = get_avg_scores( 313 | eer_dict 314 | ) 315 | auc_overall, auc_avg, auc_median, auc_max, auc_min = get_avg_scores( 316 | auc_dict 317 | ) 318 | p_at_10_overall, p_at_10_avg, p_at_10_median, p_at_10_max, p_at_10_min = ( 319 | get_avg_scores(p_at_10_dict) 320 | ) 321 | p_at_n_overall, p_at_n_avg, p_at_n_median, p_at_n_max, p_at_n_min = ( 322 | get_avg_scores(p_at_n_dict) 323 | ) 324 | 325 | print() 326 | print("-"*79) 327 | print( 328 | "EER: {:.4f}, avg: {:.4f}, median: {:.4f}, max: {:.4f}, " 329 | "min: {:.4f}".format(eer_overall, eer_avg, eer_median, eer_max, 330 | eer_min) 331 | ) 332 | print( 333 | "AUC: {:.4f}, avg: {:.4f}, median: {:.4f}, max: {:.4f}, " 334 | "min: {:.4f}".format(auc_overall, auc_avg, auc_median, auc_max, 335 | auc_min) 336 | ) 337 | print( 338 | "P@10: {:.4f}, avg: {:.4f}, median: {:.4f}, max: {:.4f}, " 339 | "min: {:.4f}".format(p_at_10_overall, p_at_10_avg, p_at_10_median, 340 | p_at_10_max, p_at_10_min) 341 | ) 342 | print( 343 | "P@N: {:.4f}, avg: {:.4f}, median: {:.4f}, max: {:.4f}, " 344 | 
"min: {:.4f}".format(p_at_n_overall, p_at_n_avg, p_at_n_median, 345 | p_at_n_max, p_at_n_min) 346 | ) 347 | print("-"*79) 348 | 349 | 350 | if __name__ == "__main__": 351 | main() 352 | --------------------------------------------------------------------------------