├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── _train_pro.sh ├── centerpush_nonullspace_phoneme_classification.sh ├── centerpush_nullspace_phoneme_classification.sh ├── cpc ├── README.md ├── __init__.py ├── cpc_default_config.py ├── criterion │ ├── __init__.py │ ├── clustering │ │ ├── __init__.py │ │ ├── clustering.py │ │ ├── clustering_quantization.py │ │ └── clustering_script.py │ ├── criterion.py │ ├── custom_layers.py │ ├── seq_alignment.py │ └── soft_align.py ├── dataset.py ├── eval │ ├── ABX.py │ ├── ABX │ │ ├── __init__.py │ │ ├── abx_group_computation.py │ │ ├── abx_iterators.py │ │ ├── dtw.pyx │ │ ├── test_data │ │ │ ├── 2107.npy │ │ │ ├── 23.npy │ │ │ ├── 407.npy │ │ │ ├── 42.npy │ │ │ ├── dummy_item_file.item │ │ │ └── dummy_item_within.item │ │ └── unit_tests.py │ ├── __init__.py │ ├── build_zeroSpeech_features.py │ ├── common_voices_eval.py │ ├── linear_separability.py │ └── utils │ │ └── adjust_sample_rate.py ├── feature_loader.py ├── model.py ├── stats │ ├── __init__.py │ ├── empty_stat.py │ ├── repr_diff_stat.py │ ├── stat_utils.py │ └── stats_collector.py ├── test_data │ ├── phone_labels.txt │ ├── seq_list.txt │ └── test_db │ │ ├── 2911 │ │ └── 12359 │ │ │ └── 2911-12359-0007.flac │ │ ├── 4051 │ │ └── 11218 │ │ │ └── 4051-11218-0044.flac │ │ ├── 4397 │ │ └── 15668 │ │ │ ├── 4397-15668-0003.flac │ │ │ └── 4397-15668-0007.flac │ │ ├── 5393 │ │ └── 19218 │ │ │ └── 5393-19218-0024.flac │ │ ├── 5678 │ │ ├── 43301 │ │ │ └── 5678-43301-0021.flac │ │ └── 43303 │ │ │ ├── 5678-43303-0024.flac │ │ │ └── 5678-43303-0032.flac │ │ └── 6476 │ │ └── 57446 │ │ └── 6476-57446-0019.flac ├── train.py ├── transformers.py ├── unit_tests.py └── utils │ ├── __init__.py │ ├── capture_loader.py │ ├── misc.py │ └── unit_tests.py ├── cpc_ctc_visualization.ipynb ├── environment.yml ├── experiments ├── train_pro_1gpu.sh ├── train_pro_2gpu.sh └── train_pro_cpcctc_bases.sh ├── finetune_nullspace.sh ├── hubconf.py ├── jch_experiments ├── lineval_ls100.sh ├── run_clustering.sh ├── scripts ├── build_1hot_features.py ├── build_BERT_features.py ├── build_CPC_features.py ├── build_LSTM_features.py ├── compute_proba_BERT.py ├── compute_proba_LSTM.py ├── quantize_audio.py └── utils │ ├── lm_scoring.py │ └── utils_functions.py ├── setup.py └── train_ls100.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | .idea 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at . All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to CPC_audio 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. 
Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## Coding Style 30 | * 2 spaces for indentation rather than tabs 31 | * 80 character line length 32 | * ... 33 | 34 | ## License 35 | By contributing to CPC_audio, you agree that your contributions will be licensed 36 | under the LICENSE file in the root directory of this source tree. 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /_train_pro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for the Prometheus slurm cluster 4 | 5 | set -x 6 | 7 | RVERB="" # =-v 8 | 9 | REMOTE_USER=plgjch 10 | REMOTE_HOST=pro.cyfronet.pl 11 | 12 | # location of the main repository (contains data/) 13 | CPC_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 14 | REMOTE_CPC_DIR=/net/people/plgjch/scratch/CPC_audio 15 | REMOTE_MINICONDA_DIR=/net/archive/groups/plggneurony/os/miniconda3 16 | REMOTE_LIBRISPEECH_DIR=/net/people/plgjch/lscratch/plgjch/LibriSpeech-wav 17 | REMOTE_LIBRISPEECH100_SPLITS=/net/archive/groups/plggneurony/data/librispeech/LibriSpeech100_labels_split 18 | 19 | # top-level directory for experiments 20 | REMOTE_EXPERIMENT_RUNDIR=/net/scratch/people/plgjch/cpc/ 21 | 22 | # adjust the main loop 23 | # (it can go over .yaml files, over hyperparameters, etc. 24 | for PARAMS in \ 25 | "--CPCCTCNumMatched 12 --nPredicts 8 --CPCCTCSkipEnd 0" \ 26 | "--CPCCTCNumMatched 12 --nPredicts 8 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 0" \ 27 | "--CPCCTCNumMatched 12 --nPredicts 8 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 3" \ 28 | ; do 29 | 30 | # low-level directory for experiments 31 | EXP_TAG=remote_pro 32 | PRINT_PARAMS = $(echo $PARAMS | tr -d ' ' | sed -e 's/-\+/_/g') 33 | NAME=cpcctc${PRINT_PARAMS} 34 | DIR=$EXP_TAG/$NAME 35 | EXP_DIR=$REMOTE_EXPERIMENT_RUNDIR/$DIR 36 | 37 | echo $EXP_DIR 38 | 39 | continue 40 | 41 | ssh -q $REMOTE_USER@$REMOTE_HOST mkdir -p $EXP_DIR 42 | 43 | TMP_DIR=`mktemp -d` 44 | mkdir $TMP_DIR/code 45 | # symlink the data from the main dir 46 | 47 | cat > $TMP_DIR/exp_train.sh <&1 | tee -ai $EXP_DIR/lineval_${CP}/out.txt 104 | EOF 105 | 106 | # Transmit the startup script 107 | rsync $RVERB -lrpt -e "ssh -q" $TMP_DIR/ $REMOTE_USER@$REMOTE_HOST:$EXP_DIR/ 108 | 109 | # Transmit the rest 110 | rsync --exclude '.*' \ 111 | --exclude data \ 112 | --exclude pretrained_models \ 113 | --exclude '__pycache__' \ 114 | --exclude '*runs*' \ 115 | --exclude '*.pyc' \ 116 | --exclude '*.ipynb' \ 117 | --filter=':- .gitignore' \ 118 | $RVERB -lrpt -e "ssh -q" $CPC_DIR/ $REMOTE_USER@$REMOTE_HOST:$EXP_DIR/code/ 119 | 120 | ssh -q $REMOTE_USER@$REMOTE_HOST sbatch \ 121 | `#--gres="" --time=00:10:00 -p plgrid-testing` \ 122 | $EXP_DIR/exp_train.sh 123 | 124 | rm -Rf $TMP_DIR 125 | 126 | done 127 | 128 | echo "Queue status" 129 | ssh -q $REMOTE_USER@$REMOTE_HOST squeue 130 | -------------------------------------------------------------------------------- /centerpush_nonullspace_phoneme_classification.sh: -------------------------------------------------------------------------------- 1 | 2 | for deg in 0 0.2 0.3 0.4 0.5 0.6 0.7 3 | do 4 | echo $deg 5 | mkdir ${centerpushDir}/phoneme_classif_nonull_${deg}/ 6 | python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ \ 7 | $zd/LibriSpeech/labels_split/train_split_100.txt \ 8 | $zd/LibriSpeech/labels_split/test_split_100.txt \ 9 | $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt \ 10 | --centerpushFile $zd/checkpoints/CPC-big-kmeans50/clustering_kmeans50/clustering_CPC_big_kmeans50.pt \ 11 | --centerpushDeg $deg \ 12 | --pathCheckpoint ${centerpushDir}/phoneme_classif_nonull_${deg}/ \ 13 | --mode phonemes --max_size_loaded 40000000 --n_process_loader 2 \ 14 | --model cpc --pathPhone $zd/LibriSpeech/alignments2/converted_aligned_phones.txt \ 15 | --gru_level 2 --batchSizeGPU 32 
| tee ${centerpushDir}/phoneme_classif_nonull_${deg}/log.txt
16 | done
--------------------------------------------------------------------------------
/centerpush_nullspace_phoneme_classification.sh:
--------------------------------------------------------------------------------
1 |
2 | for deg in 0 0.2 0.3 0.4 0.5 0.6 0.7
3 | do
4 | echo $deg
5 | mkdir ${centerpushDir}/phoneme_classif_null_${deg}/
6 | python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ \
7 | $zd/LibriSpeech/labels_split/train_split_100.txt \
8 | $zd/LibriSpeech/labels_split/test_split_100.txt \
9 | $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt \
10 | --centerpushFile $cpcClustDir/checkpoints/clustering_CPC_big_kmeans50_nullspace_64/clustering_CPC_big_kmeans50_nullspace_64.pt \
11 | --centerpushDeg $deg \
12 | --pathCheckpoint ${centerpushDir}/phoneme_classif_null_${deg}/ \
13 | --mode phonemes_nullspace --max_size_loaded 40000000 --n_process_loader 2 \
14 | --model cpc --pathPhone $zd/LibriSpeech/alignments2/converted_aligned_phones.txt \
15 | --path_speakers_factorized $nullspaceDir/linear_separability/cpc/gru_level2/cpc_official_speakers_factorized_64/checkpoint_9.pt \
16 | --dim_inter 64 --gru_level 2 --batchSizeGPU 32 | tee ${centerpushDir}/phoneme_classif_null_${deg}/log.txt
17 | done
18 |
19 |
--------------------------------------------------------------------------------
/cpc/README.md:
--------------------------------------------------------------------------------
1 | # Repository's architecture
2 |
3 | train.py : main script
4 |
5 | dataset.py : definition of the LibriSpeech dataset format
6 |
7 | model.py : basic encoders and AR models
8 |
9 | feature_loader.py : tools to load and save a CPC model
10 |
11 | transformers.py : an implementation of transformers
12 |
13 | unit_tests.py : unit tests
14 |
15 | criterion/ : definition of the training criteria. Three criteria are currently available: CPC (unsupervised), speaker classification and phone classification.
16 |
17 | eval/ : evaluation scripts.
18 |
19 | utils/ : system utilities and misc.
20 |
21 |
22 | ## Stats module (initial) description
23 |
24 | The `stats` directory contains utilities for computing statistics. `stats/repr_diff_stat.py` is an example, and `stats/stats_collector.py` aggregates the stats given as arguments to `train.py`; each stat therefore needs to be registered in `stats/stat_utils.py` in the same way as `reprDiffStat` (`stats/repr_diff_stat.py`) is.
25 |
26 | To compute stats during a `train.py` run, use `--captureSetStats`, which takes a value of the form `stat1Name:arg1,arg2,arg3_stat2Name:arg1,arg2` where the args are stat-specific (example: `reprDiff:cosine,ctx_repr,0.05,../reprDiffHistograms`).
27 |
28 | When specified like that (with `--captureSetStats`), stats are computed on the capture dataset, alongside data capturing, every specified number of epochs. It is possible to compute only stats and not capture data, but the capture dataset still needs to be configured as described below under "CPC-CTC data capturing description". Example of how to specify the capture dataset: `--pathCaptureDS /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt --captureEachEpochs 2`.
29 |
30 |
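As an illustration of the `--captureSetStats` value format above (this is not the project's actual parser — that lives in `stats/stats_collector.py` — and the helper name below is invented for this sketch): a spec string splits on `_` into per-stat chunks, and each chunk splits on `:` and `,` into a stat name plus its arguments.

```
def split_stat_spec(spec):
    # "_" separates stats, ":" separates a stat name from its args,
    # "," separates the args themselves.
    stats = []
    for chunk in spec.split("_"):
        name, _, arg_str = chunk.partition(":")
        stats.append((name, arg_str.split(",") if arg_str else []))
    return stats

print(split_stat_spec("reprDiff:cosine,ctx_repr,0.05,../reprDiffHistograms"))
# [('reprDiff', ['cosine', 'ctx_repr', '0.05', '../reprDiffHistograms'])]
```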
31 | ## Linear separability automation description:
32 |
33 | This can be combined with the data capturing described in the section below.
34 |
35 | Several arguments were added to train.py, in the group_supervised_data and group_supervised_metric argument groups; their help strings in train.py also describe them.
36 | - --supervised_classif_metric is the flag that enables the additional linear separability task. Additionally, one or both of --speaker_sep and --path_phone_data should be specified to indicate which linear separabilities to perform: --speaker_sep for speaker classification and --path_phone_data for phoneme classification (this should be the path to the .txt file with phone alignments in their format, mentioned in the main readme of the repo)
37 | - the linear separability task can be run in two modes: either once on the trained checkpoint (--only_classif_metric) or every --linsep_classif_each_epochs epochs during the main CPC training. To automatically perform linsep once per training, e.g. --linsep_classif_each_epochs 180 can be specified (currently linsep is not done at epoch 0)
38 | - the path where to store the logs from the linear separability task needs to be specified with --linsep_logs_dir; additionally, the logging frequency in epochs can be specified with --linsep_task_logging_step, and the logs will be saved under \<--linsep_logs_dir\>/\/phone or \<--linsep_logs_dir\>/\/speaker
39 | - the path where to save the classification models (the state from the best epoch of each separate classification training performed after X epochs of CPC training) can be specified with --linsep_checkpoint_dir, and those will be saved under \/\/phone or \/\/speaker
40 | - the number of epochs to run each linear separability task for can be specified with --linsep_n_epoch
41 | - additional linear separability task parameters can be specified with:
42 | - parameters of the Adam optimizer: --linsep_lr, --linsep_beta1, --linsep_beta2, --linsep_epsilon
43 | - --phone_get_encoded to use the CNN encodings for classification instead of the produced contexts (this only applies to phoneme classification with the regular loss; the other classifiers don't support it, so it doesn't affect them; combining it with the CTC classification loss (below) is not supported and will raise an assertion error)
44 | - --CTCphones to use a CTC-based loss for classification instead of the 'regular' loss, which assumes representations/contexts are aligned with the audio data
45 | - --linsep_net_layers to use a bigger fully connected network during classification training (default: 1 - then there is just one matrix without activations; each layer has classification_class_number neurons, except for the CTC-based loss, which has one additional neuron in the last layer for the blank symbol)
46 | - --linsepBatchSizeGPU can be specified to choose the batch size for the linear separability task; this is separate from the batch size for CPC training
47 |
48 | Example run combined with data capturing (some real training + capture):
49 |
50 | ```
51 | python train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 \
52 | --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt \
53 | --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt \
54 | --supervised_classif_metric \
55 | --speaker_sep --path_phone_data /pio/scratch/1/i283340/MGR/zs/phones/converted_aligned_phones.txt \
56 | --linsepBatchSizeGPU 32 --linsep_n_epoch 12 \
57 | --linsep_logs_dir /pio/scratch/1/i283340/MGR/zs/linsep/logs2-001 \
58 | --linsep_checkpoint_dir /pio/scratch/1/i283340/MGR/zs/linsep/checkp2-001 \
59 | --linsep_classif_each_epochs 10 \
60 | --pathCaptureDS /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt \
61 | --captureDStotNr 100 --captureEachEpochs 10 \
62 | --pathCaptureSave /pio/scratch/1/i283340/MGR/zs/capture/try2-001 \
63 | --captureConvRepr --captureCtxRepr
--captureSpeakerAlign --capturePhoneAlign --capturePred --captureCPCCTCalign --captureCPCCTClogScores \
64 | --pathCheckpoint /pio/scratch/1/i283340/MGR/zs/checkpoints/cpcctc_tests2-001 \
65 | --file_extension .flac --n_process_loader 1 --max_size_loaded 40000000 \
66 | --batchSizeGPU 16 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 12 \
67 | --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2
68 | ```
69 |
70 |
71 | ## CPC-CTC data capturing description:
72 |
73 | Some new arguments were added to group_save and group_db in train.py for capturing; the options are also described in their 'help' strings:
74 | - Data capturing is possible in two modes: capture once for an already trained model (use --onlyCapture and --pathCheckpoint), or capture every N epochs during training (don't use --onlyCapture; only specify --captureEachEpochs and the options from the two bullets below)
75 | - The data is captured for a separately specified dataset, called the capture dataset here. It can e.g. simply be the same as the validation dataset. It is specified with --pathCaptureDS, the path to a .txt file listing the sequences in this dataset. Additionally, --captureDSfreq OR --captureDStotNr can be used to sample only part of the sequences specified in the file - a percentage of them with the former, and a total number with the latter (for example, --captureDStotNr 8 captures just 8 audio files of the val dataset)
76 | - --pathCaptureSave tells where to save the data. Data for each epoch (for a single epoch if --onlyCapture) is saved under \/\/\/ with file names {what_is_captured}_batch{batchBegin}-{batchEnd}.pt, one file per captured thing per batch (example: ctx_repr_batch0-15.pt under ./captureRoot/0/ctx_repr/). What to capture is chosen with the --captureConvRepr, --captureCtxRepr, --captureSpeakerAlign, --capturePhoneAlign, --capturePred, --captureCPCCTCalign and --captureCPCCTClogScores args (those are: encoder representations, LSTM-produced contexts, speaker alignments for the audio, phoneme alignments for the audio, CPC predictions, CPC-CTC alignments and CPC-CTC log-scores). Note that capturing speaker and phoneme alignments is necessary for their visualization, as it is later impossible to tell which audio file a particular batch was taken from (audio files are glued together, chunked, and randomly permuted). There is also --captureEverything, added for convenience, which captures everything that is valid for the given run config, but it is always safer to specify exactly what to capture.
For capturing phoneme alignments --path_phone_data needs to be specified (this is the path to a .txt file with phoneme alignments in their format, they provide it somewhere in repo’s main readme) 77 | 78 | IN CASE YOU RUN DATA CAPTURE FOR AN ALREADY TRAINED MODEL, PASS SAME ARGUMENTS FOR THE MODEL TO LOAD CORRECTLY 79 | 80 | Example run that saves data each 2 epochs for 8 audio files of val dataset (with some very small dummy train=val datasets I made): 81 | ``` 82 | python train.py --pathDB /pio/scratch/1/i283340/MGR/zs/ds2 83 | --pathTrain /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt 84 | --pathVal /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt 85 | --pathCaptureDS /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt 86 | --captureDStotNr 8 --captureEachEpochs 2 87 | --pathCaptureSave /pio/scratch/1/i283340/MGR/zs/capture/try1 88 | --path_phone_data /pio/scratch/1/i283340/MGR/zs/phones/converted_aligned_phones.txt 89 | --captureConvRepr --captureCtxRepr --captureSpeakerAlign --capturePhoneAlign --capturePred --captureCPCCTCalign --captureCPCCTClogScores 90 | --pathCheckpoint /pio/scratch/1/i283340/MGR/zs/checkpoints/cpcctc_tests2 91 | --file_extension .flac --n_process_loader 2 --max_size_loaded 40000000 92 | --batchSizeGPU 16 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 12 93 | --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2 94 | ``` 95 | 96 | Example with just capturing: 97 | ``` 98 | python train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --onlyCapture \ 99 | --pathCaptureDS /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt \ 100 | --captureDStotNr 100 \ 101 | --pathCaptureSave /pio/gluster/i283340/cpccapture/ls100_cpcctc_match12_pred8/ \ 102 | --captureConvRepr --captureCtxRepr --captureSpeakerAlign --capturePhoneAlign --capturePred --captureCPCCTCalign --captureCPCCTClogScores \ 103 | --path_phone_data /pio/scratch/1/i283340/MGR/zs/phones/converted_aligned_phones.txt \ 104 | --pathCheckpoint /pio/gluster/i283340/modelcpy/ls100_cpcctc_match12_pred8 \ 105 | --file_extension .flac \ 106 | --normMode layerNorm --dropout --rnnMode transformer --n_process_loader 1 --max_size_loaded 4000000000 --nLevelsGRU 2 \ 107 | --batchSizeGPU 32 --limitNegsInBatch 8 --schedulerRamp 10 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 12 108 | ``` -------------------------------------------------------------------------------- /cpc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /cpc/cpc_default_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
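# Overview: this module defines the default CPC configuration.
# set_default_cpc_config() registers the architecture and training
# hyper-parameters (encoder/AR dimensions, CPC-CTC options, optimizer,
# scheduler and sampling settings) on an argparse parser, and
# get_default_cpc_config() parses an empty argument list to return those
# defaults as an argparse.Namespace.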
5 | import argparse 6 | 7 | 8 | def get_default_cpc_config(): 9 | parser = set_default_cpc_config(argparse.ArgumentParser()) 10 | return parser.parse_args([]) 11 | 12 | 13 | def set_default_cpc_config(parser): 14 | # Run parameters 15 | 16 | group = parser.add_argument_group('Architecture configuration', 17 | description="The arguments defining the " 18 | "model's architecture.") 19 | group.add_argument('--hiddenEncoder', type=int, default=256, 20 | help='Hidden dimension of the encoder network.') 21 | group.add_argument('--hiddenGar', type=int, default=256, 22 | help='Hidden dimension of the auto-regressive network') 23 | group.add_argument('--nPredicts', type=int, default=12, 24 | help='Number of steps to predict.') 25 | 26 | group.add_argument('--CPCCTC', action='store_true') 27 | group.add_argument('--CPCCTCNumMatched', type=int, default=16) 28 | group.add_argument('--CPCCTCSkipBeg', type=int, default=0) 29 | group.add_argument('--CPCCTCSkipEnd', type=int, default=0) 30 | group.add_argument('--CPCCTCSelfLoop', action='store_true') 31 | group.add_argument('--CPCCTCLearnBlank', action='store_true') 32 | group.add_argument('--CPCCTCNoNegsMatchWin', action='store_true') 33 | group.add_argument('--CPCCTCMasq', default="") 34 | group.add_argument('--CPCCTCLossTemp', type=float, default=1.0) 35 | group.add_argument('--CPCCTCNormalizeEncs', action='store_true') 36 | group.add_argument('--CPCCTCNormalizePreds', action='store_true') 37 | group.add_argument('--limitNegsInBatch', type=int, default=0, 38 | help='Limit the number of different seqs from whithc neg samples are taken.') 39 | 40 | 41 | group.add_argument('--negativeSamplingExt', type=int, default=128, 42 | help='Number of negative samples to take.') 43 | group.add_argument('--learningRate', type=float, default=2e-4) 44 | group.add_argument('--schedulerStep', type=int, default=-1, 45 | help='Step of the learning rate scheduler: at each ' 46 | 'step the learning rate is divided by 2. Default: ' 47 | 'no scheduler.') 48 | group.add_argument('--schedulerRamp', type=int, default=None, 49 | help='Enable a warm up phase for the learning rate: ' 50 | 'adds a linear ramp of the given size.') 51 | group.add_argument('--beta1', type=float, default=0.9, 52 | help='Value of beta1 for the Adam optimizer') 53 | group.add_argument('--beta2', type=float, default=0.999, 54 | help='Value of beta2 for the Adam optimizer') 55 | group.add_argument('--epsilon', type=float, default=1e-08, 56 | help='Value of epsilon for the Adam optimizer') 57 | group.add_argument('--sizeWindow', type=int, default=20480, 58 | help='Number of frames to consider at each batch.') 59 | group.add_argument('--nEpoch', type=int, default=200, 60 | help='Number of epoch to run') 61 | group.add_argument('--samplingType', type=str, default='samespeaker', 62 | choices=['samespeaker', 'uniform', 63 | 'samesequence', 'sequential'], 64 | help='How to sample the negative examples in the ' 65 | 'CPC loss.') 66 | group.add_argument('--nLevelsPhone', type=int, default=1, 67 | help='(Supervised mode only). 
Number of layers in ' 68 | 'the phone classification network.') 69 | group.add_argument('--cpc_mode', type=str, default=None, 70 | choices=['reverse', 'none'], 71 | help='Some variations on CPC.') 72 | group.add_argument('--encoder_type', type=str, 73 | choices=['cpc', 'mfcc', 'lfb'], 74 | default='cpc', 75 | help='Replace the encoder network by mfcc features ' 76 | 'or learned filter banks') 77 | group.add_argument('--normMode', type=str, default='layerNorm', 78 | choices=['instanceNorm', 'ID', 'layerNorm', 79 | 'batchNorm'], 80 | help="Type of normalization to use in the encoder " 81 | "network (default is layerNorm).") 82 | group.add_argument('--onEncoder', action='store_true', 83 | help="(Supervised mode only) Perform the " 84 | "classification on the encoder's output.") 85 | group.add_argument('--random_seed', type=int, default=None, 86 | help="Set a specific random seed.") 87 | group.add_argument('--speakerEmbedding', type=int, default=0, 88 | help="(Depreciated) Feed the prediction network with " 89 | "speaker embeddings along with the usual sequence.") 90 | group.add_argument('--arMode', default='LSTM', 91 | choices=['GRU', 'LSTM', 'RNN', 'no_ar', 'transformer'], 92 | help="Architecture to use for the auto-regressive " 93 | "network (default is lstm).") 94 | group.add_argument('--nLevelsGRU', type=int, default=1, 95 | help='Number of layers in the autoregressive network.') 96 | group.add_argument('--rnnMode', type=str, default='transformer', 97 | choices=['transformer', 'RNN', 'LSTM', 'linear', 98 | 'ffd', 'conv4', 'conv8', 'conv12'], 99 | help="Architecture to use for the prediction network") 100 | group.add_argument('--dropout', action='store_true', 101 | help="Add a dropout layer at the output of the " 102 | "prediction network.") 103 | group.add_argument('--abspos', action='store_true', 104 | help='If the prediction network is a transformer, ' 105 | 'active to use absolute coordinates.') 106 | 107 | return parser 108 | -------------------------------------------------------------------------------- /cpc/criterion/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | from .criterion import CPCUnsupersivedCriterion, SpeakerCriterion, \ 6 | PhoneCriterion, NoneCriterion, CTCPhoneCriterion, SpeakerDoubleCriterion 7 | -------------------------------------------------------------------------------- /cpc/criterion/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from .clustering import kMeanCluster, kMeanGPU -------------------------------------------------------------------------------- /cpc/criterion/clustering/clustering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import progressbar 6 | import torch 7 | import torch.nn as nn 8 | import cpc.feature_loader as fl 9 | from .. 
import CTCPhoneCriterion 10 | from os.path import join, exists 11 | from os import remove 12 | from time import time 13 | 14 | 15 | class kMeanCluster(nn.Module): 16 | 17 | def __init__(self, Ck, norm_vec_len=False): 18 | 19 | super(kMeanCluster, self).__init__() 20 | self.register_buffer('Ck', Ck) 21 | self.k = Ck.size(1) 22 | self.norm_vec_len = norm_vec_len 23 | print("-----> kMeanCluster init") 24 | 25 | def forward(self, features): 26 | B, S, D = features.size() 27 | if self.norm_vec_len: 28 | featuresLengths = torch.sqrt((features*features).sum(2)) 29 | features = features / featuresLengths.view(*(featuresLengths.shape), 1) 30 | Ck = self.Ck 31 | 32 | if self.norm_vec_len: 33 | CkLengths = torch.sqrt((Ck*Ck).sum(2)) 34 | Ck = Ck / CkLengths.view(*(CkLengths.shape), 1) 35 | clen = torch.sqrt((Ck*Ck).sum(2)) 36 | features = features.contiguous().view(B*S, 1, -1) 37 | return ((features - Ck)**2).sum(dim=2).view(-1, S, self.k) 38 | 39 | 40 | class kMeanClusterStep(torch.nn.Module): 41 | 42 | def __init__(self, k, D, norm_vec_len=False): 43 | 44 | super(kMeanClusterStep, self).__init__() 45 | self.k = k 46 | self.register_buffer('Ck', torch.zeros(1, k, D)) 47 | self.norm_vec_len = norm_vec_len 48 | 49 | def forward(self, locF): 50 | 51 | if self.norm_vec_len: 52 | locFLengths = torch.sqrt((locF*locF).sum(2)) 53 | locF = locF / locFLengths.view(*(locFLengths.shape), 1) 54 | 55 | index = ((locF - self.Ck)**2).mean(dim=2).min(dim=1)[1] 56 | Ck1 = torch.cat([locF[index == p].sum(dim=0, keepdim=True) 57 | for p in range(self.k)], dim=1) 58 | nItems = torch.cat([(index == p).sum(dim=0, keepdim=True) 59 | for p in range(self.k)], dim=0).view(1, -1) 60 | 61 | return Ck1, nItems 62 | 63 | 64 | def kMeanGPU(dataLoader, featureMaker, k, n_group=1, 65 | MAX_ITER=100, EPSILON=1e-4, 66 | perIterSize=-1, start_clusters=None, 67 | save=False, load=False, save_dir=None, 68 | save_last=5, norm_vec_len=False): 69 | 70 | print(f"Start Kmean clustering with {k} clusters and {n_group} groups...") 71 | 72 | if save or load: 73 | assert save_dir is not None 74 | 75 | if start_clusters is None: 76 | if load and exists(join(save_dir, "checkpoint_last.pt")): 77 | print("Loading from last checkpoint") 78 | state_dict = torch.load(join(save_dir, "checkpoint_last.pt")) 79 | Ck = state_dict["state_dict"]["Ck"] 80 | D = Ck.size(2) 81 | else: 82 | Ck = [] 83 | with torch.no_grad(): 84 | for index, data in enumerate(dataLoader): 85 | cFeature = featureMaker(data) 86 | cFeature = cFeature.contiguous().view(-1, cFeature.size(2)//n_group) 87 | Ck.append(cFeature) 88 | if index > k: 89 | break 90 | Ck = torch.cat(Ck, dim=0) 91 | N, D = Ck.size() 92 | indexes = torch.randperm(N)[:k] 93 | Ck = Ck[indexes].view(k, D) #(1, k, D) 94 | # centers will be normalized from the very beginning and kept like that, later only norm points (AND re-normalize centers after each epoch-iter) 95 | if norm_vec_len: 96 | CkLengths = torch.sqrt((Ck*Ck).sum(1)) 97 | Ck = Ck / CkLengths.view(-1, 1) 98 | Ck = Ck.view(1, k, D) 99 | else: 100 | Ck = start_clusters 101 | D = Ck.size(2) 102 | 103 | if perIterSize < 0: 104 | perIterSize = len(dataLoader) 105 | 106 | clusterStep = kMeanClusterStep(k, D, norm_vec_len=norm_vec_len).cuda() 107 | clusterStep = torch.nn.DataParallel(clusterStep) 108 | clusterStep.module.Ck.copy_(Ck) 109 | 110 | bar = progressbar.ProgressBar(maxval=MAX_ITER) 111 | bar.start() 112 | iter, stored = 0, 0 113 | if load and start_clusters is None and exists(join(save_dir, "checkpoint_last.pt")): 114 | iter = state_dict["iteration"] 115 
| lastDiff = state_dict["lastDiff"] 116 | print(f"Continuing training from iteration {iter}. lastDiff: {lastDiff}") 117 | with torch.no_grad(): 118 | while iter < MAX_ITER: 119 | start_time = time() 120 | Ck1 = torch.zeros(Ck.size()).cuda() 121 | nItemsClusters = torch.zeros(Ck.size(1), 122 | dtype=torch.long).cuda() 123 | for index, data in enumerate(dataLoader): 124 | cFeature = featureMaker(data).contiguous().view(-1, 1, D) 125 | locC, locN = clusterStep(cFeature) 126 | Ck1 += locC.sum(dim=0, keepdim=True) 127 | nItemsClusters += locN.sum(dim=0) 128 | ### If the training set is too big and we want to redude the number of item per iteration 129 | # stored += 1 130 | # if stored >= perIterSize: 131 | # bar.update(iter) 132 | # iter += 1 133 | # stored = 0 134 | # if iter >= MAX_ITER: 135 | # break 136 | 137 | iter += 1 138 | bar.update(iter) 139 | 140 | nItemsClusters = nItemsClusters.float().view(1, -1, 1) + 1e-8 141 | Ck1 /= nItemsClusters 142 | 143 | if norm_vec_len: # need to re-normalize, as mean of things of length 1 has length <= 1 144 | Ck1Lengths = torch.sqrt((Ck1*Ck1).sum(2)) 145 | print("clustNorm", Ck1.shape, Ck1Lengths.shape, Ck1Lengths.view(*(Ck1Lengths.shape), 1).shape) 146 | Ck1 = Ck1 / Ck1Lengths.view(*(Ck1Lengths.shape), 1) 147 | 148 | lastDiff = (clusterStep.module.Ck - Ck1).norm(dim=2).max().item() 149 | nItems = int(nItemsClusters.sum().cpu().detach().item()) 150 | info=f"ITER {iter} done in {time()-start_time:.2f} seconds. nItems: {nItems}. Difference with last checkpoint: {lastDiff}" 151 | print(info) 152 | with open(join(save_dir, "training_logs.txt"), "a") as f: 153 | f.write(info+"\n") 154 | if save: 155 | info=f"Saving last checkpoint to {join(save_dir, 'checkpoint_last.pt')}" 156 | print(info) 157 | with open(join(save_dir, "training_logs.txt"), "a") as f: 158 | f.write(info+"\n") 159 | out_state_dict = {} 160 | 161 | clusterModule = kMeanCluster(Ck1, norm_vec_len=norm_vec_len) 162 | out_state_dict["state_dict"] = clusterModule.state_dict() 163 | out_state_dict["n_clusters"] = Ck1.size(1) 164 | out_state_dict['dim'] = Ck1.size(2) 165 | out_state_dict["iteration"] = iter 166 | out_state_dict["lastDiff"] = lastDiff 167 | torch.save(out_state_dict, join(save_dir, "checkpoint_last.pt")) 168 | torch.save(out_state_dict, join(save_dir, f"checkpoint_{iter}.pt")) 169 | if exists(join(save_dir, f"checkpoint_{iter-save_last}.pt")): 170 | remove(join(save_dir, f"checkpoint_{iter-save_last}.pt")) 171 | if lastDiff < EPSILON: 172 | print( 173 | f"Clustering ended in {iter} iterations out of {MAX_ITER}") 174 | break 175 | clusterStep.module.Ck.copy_(Ck1) 176 | 177 | bar.finish() 178 | 179 | print(f"Clustering ended in {MAX_ITER} iterations out of {MAX_ITER}") 180 | print(f"Last diff {lastDiff}") 181 | if start_clusters is not None: 182 | nEmptyClusters = (nItemsClusters < 1).sum().item() 183 | print(f"{nEmptyClusters} empty clusters out of {k}") 184 | return clusterStep.module.Ck 185 | -------------------------------------------------------------------------------- /cpc/criterion/clustering/clustering_quantization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from random import shuffle 8 | from time import time 9 | import torch 10 | from cpc.dataset import findAllSeqs 11 | from cpc.feature_loader import buildFeature, FeatureModule, loadModel, buildFeature_batch 12 | from cpc.criterion.clustering import kMeanCluster 13 
| #from cpc.criterion.research.clustering import kMeanCluster 14 | 15 | 16 | def readArgs(pathArgs): 17 | print(f"Loading args from {pathArgs}") 18 | with open(pathArgs, 'r') as file: 19 | args = argparse.Namespace(**json.load(file)) 20 | 21 | return args 22 | 23 | 24 | def loadClusterModule(pathCheckpoint, norm_vec_len=False): 25 | print(f"Loading ClusterModule at {pathCheckpoint}") 26 | state_dict = torch.load(pathCheckpoint) 27 | if "state_dict" in state_dict: #kmeans 28 | clusterModule = kMeanCluster(torch.zeros(1, state_dict["n_clusters"], state_dict["dim"]), norm_vec_len) 29 | clusterModule.load_state_dict(state_dict["state_dict"]) 30 | else: #dpmeans 31 | clusterModule = kMeanCluster(state_dict["mu"]) 32 | clusterModule = clusterModule.cuda() 33 | return clusterModule 34 | 35 | def parseArgs(argv): 36 | # Run parameters 37 | parser = argparse.ArgumentParser(description='Quantize audio files using CPC Clustering Module.') 38 | parser.add_argument('pathCheckpoint', type=str, 39 | help='Path to the clustering checkpoint.') 40 | parser.add_argument('pathDB', type=str, 41 | help='Path to the dataset that we want to quantize.') 42 | parser.add_argument('pathOutput', type=str, 43 | help='Path to the output directory.') 44 | parser.add_argument('--pathSeq', type=str, 45 | help='Path to the sequences (file names) to be included used.') 46 | parser.add_argument('--split', type=str, default=None, 47 | help="If you want to divide the dataset in small splits, specify it " 48 | "with idxSplit-numSplits (idxSplit > 0), eg. --split 1-20.") 49 | parser.add_argument('--file_extension', type=str, default=".flac", 50 | help="Extension of the audio files in the dataset (default: .flac).") 51 | parser.add_argument('--max_size_seq', type=int, default=10240, 52 | help='Maximal number of frames to consider ' 53 | 'when computing a batch of features (defaut: 10240).') 54 | parser.add_argument('--batch_size', type=int, default=8, 55 | help='Batch size used to compute features ' 56 | 'when computing each file (defaut: 8).') 57 | parser.add_argument('--strict', type=bool, default=True, 58 | help='If activated, each batch of feature ' 59 | 'will contain exactly max_size_seq frames (defaut: True).') 60 | parser.add_argument('--debug', action='store_true', 61 | help="Load only a very small amount of files for " 62 | "debugging purposes.") 63 | parser.add_argument('--nobatch', action='store_true', 64 | help="Don't use batch implementation of when building features." 65 | "NOTE: This can have better quantized units as we can set " 66 | "model.gAR.keepHidden = True (line 162), but the quantization" 67 | "will be a bit longer.") 68 | parser.add_argument('--recursionLevel', type=int, default=1, 69 | help='Speaker level in pathDB (defaut: 1). 
This is only helpful' 70 | 'when --separate-speaker is activated.') 71 | parser.add_argument('--separate-speaker', action='store_true', 72 | help="Separate each speaker with a different output file.") 73 | 74 | 75 | parser.add_argument('--norm_vec_len', action='store_true', 76 | help="Normalize vector lengths.") 77 | 78 | return parser.parse_args(argv) 79 | 80 | def main(argv): 81 | # Args parser 82 | args = parseArgs(argv) 83 | 84 | print("=============================================================") 85 | print(f"Quantizing data from {args.pathDB}") 86 | print("=============================================================") 87 | 88 | # Check if directory exists 89 | if not os.path.exists(args.pathOutput): 90 | print("") 91 | print(f"Creating the output directory at {args.pathOutput}") 92 | Path(args.pathOutput).mkdir(parents=True, exist_ok=True) 93 | 94 | # Get splits 95 | if args.split: 96 | assert len(args.split.split("-"))==2 and int(args.split.split("-")[1]) >= int(args.split.split("-")[0]) >= 1, \ 97 | "SPLIT must be under the form idxSplit-numSplits (numSplits >= idxSplit >= 1), eg. --split 1-20" 98 | idx_split, num_splits = args.split.split("-") 99 | idx_split = int(idx_split) 100 | num_splits = int(num_splits) 101 | 102 | # Find all sequences 103 | print("") 104 | print(f"Looking for all {args.file_extension} files in {args.pathDB} with speakerLevel {args.recursionLevel}") 105 | seqNames, speakers = findAllSeqs(args.pathDB, 106 | speaker_level=args.recursionLevel, 107 | extension=args.file_extension, 108 | loadCache=True) 109 | 110 | if args.pathSeq: 111 | with open(args.pathSeq, 'r') as f: 112 | seqs = set([x.strip() for x in f]) 113 | 114 | filtered = [] 115 | for s in seqNames: 116 | if s[1].split('/')[-1].split('.')[0] in seqs: 117 | filtered.append(s) 118 | seqNames = filtered 119 | 120 | print(f"Done! Found {len(seqNames)} files and {len(speakers)} speakers!") 121 | if args.separate_speaker: 122 | seqNames_by_speaker = {} 123 | for seq in seqNames: 124 | speaker = seq[1].split("/")[args.recursionLevel-1] 125 | if speaker not in seqNames_by_speaker: 126 | seqNames_by_speaker[speaker] = [] 127 | seqNames_by_speaker[speaker].append(seq) 128 | 129 | # Check if output file exists 130 | if not args.split: 131 | nameOutput = "quantized_outputs.txt" 132 | else: 133 | nameOutput = f"quantized_outputs_split_{idx_split}-{num_splits}.txt" 134 | if args.separate_speaker is False: 135 | outputFile = os.path.join(args.pathOutput, nameOutput) 136 | assert not os.path.exists(outputFile), \ 137 | f"Output file {outputFile} already exists !!!" 
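# Note on the split logic below: --split i-n selects the i-th of n contiguous
# chunks of seqNames (start index = len(seqNames) // n * (i-1)); the last
# split runs to the end of the list so no remainder files are dropped.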
138 | 139 | # Get splits 140 | if args.split: 141 | startIdx = len(seqNames) // num_splits * (idx_split-1) 142 | if idx_split == num_splits: 143 | endIdx = len(seqNames) 144 | else: 145 | endIdx = min(len(seqNames) // num_splits * idx_split, len(seqNames)) 146 | seqNames = seqNames[startIdx:endIdx] 147 | print("") 148 | print(f"Quantizing split {idx_split} out of {num_splits} splits, with {len(seqNames)} files (idx in range({startIdx}, {endIdx})).") 149 | 150 | # Debug mode 151 | if args.debug: 152 | nsamples=20 153 | print("") 154 | print(f"Debug mode activated, only load {nsamples} samples!") 155 | # shuffle(seqNames) 156 | seqNames = seqNames[:nsamples] 157 | 158 | # Load Clustering args 159 | assert args.pathCheckpoint[-3:] == ".pt" 160 | if os.path.exists(args.pathCheckpoint[:-3] + "_args.json"): 161 | pathConfig = args.pathCheckpoint[:-3] + "_args.json" 162 | elif os.path.exists(os.path.join(os.path.dirname(args.pathCheckpoint), "checkpoint_args.json")): 163 | pathConfig = os.path.join(os.path.dirname(args.pathCheckpoint), "checkpoint_args.json") 164 | else: 165 | assert False, \ 166 | f"Args file not found in the directory {os.path.dirname(args.pathCheckpoint)}" 167 | clustering_args = readArgs(pathConfig) 168 | print("") 169 | print(f"Clutering args:\n{json.dumps(vars(clustering_args), indent=4, sort_keys=True)}") 170 | print('-' * 50) 171 | 172 | # Load CluterModule 173 | clusterModule = loadClusterModule(args.pathCheckpoint, norm_vec_len=args.norm_vec_len) 174 | clusterModule.cuda() 175 | 176 | # Load FeatureMaker 177 | print("") 178 | print("Loading CPC FeatureMaker") 179 | if 'level_gru' in vars(clustering_args) and clustering_args.level_gru is not None: 180 | updateConfig = argparse.Namespace(nLevelsGRU=clustering_args.level_gru) 181 | else: 182 | updateConfig = None 183 | model = loadModel([clustering_args.pathCheckpoint], updateConfig=updateConfig)[0] 184 | ## If we don't apply batch implementation, we can set LSTM model to keep hidden units 185 | ## making the quality of the quantized units better 186 | if args.nobatch: 187 | model.gAR.keepHidden = True 188 | featureMaker = FeatureModule(model, clustering_args.encoder_layer) 189 | if clustering_args.dimReduction is not None: 190 | dimRed = loadDimReduction(clustering_args.dimReduction, clustering_args.centroidLimits) 191 | featureMaker = torch.nn.Sequential(featureMaker, dimRed) 192 | if not clustering_args.train_mode: 193 | featureMaker.eval() 194 | featureMaker.cuda() 195 | def feature_function(x): 196 | if args.nobatch is False: 197 | res0 = buildFeature_batch(featureMaker, x, 198 | seqNorm=False, 199 | strict=args.strict, 200 | maxSizeSeq=args.max_size_seq, 201 | batch_size=args.batch_size) 202 | if args.norm_vec_len: 203 | # [!] we actually used CPC_audio/scripts/quantize_audio.py for that in the end 204 | res0Lengths = torch.sqrt((res0*res0).sum(2)) 205 | res0 = res0 / res0Lengths.view(*(res0Lengths.shape), 1) 206 | return res0 207 | else: 208 | res0 = buildFeature(featureMaker, x, 209 | seqNorm=False, 210 | strict=args.strict) 211 | if args.norm_vec_len: 212 | # [!] 
we actually used CPC_audio/scripts/quantize_audio.py for that in the end 213 | res0Lengths = torch.sqrt((res0*res0).sum(2)) 214 | res0 = res0 / res0Lengths.view(*(res0Lengths.shape), 1) 215 | return res0 216 | print("CPC FeatureMaker loaded!") 217 | 218 | # Quantization of files 219 | print("") 220 | print(f"Quantizing audio files...") 221 | seqQuantLines = [] 222 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 223 | bar.start() 224 | start_time = time() 225 | for index, vals in enumerate(seqNames): 226 | bar.update(index) 227 | 228 | file_path = vals[1] 229 | file_path = os.path.join(args.pathDB, file_path) 230 | 231 | # Get features & quantizing 232 | cFeatures = feature_function(file_path).cuda() 233 | 234 | nGroups = cFeatures.size(-1)//clusterModule.Ck.size(-1) 235 | 236 | cFeatures = cFeatures.view(1, -1, clusterModule.Ck.size(-1)) 237 | 238 | if len(vals) > 2 and int(vals[-1]) > 9400000: # Librilight, to avoid OOM 239 | clusterModule = clusterModule.cpu() 240 | cFeatures = cFeatures.cpu() 241 | qFeatures = torch.argmin(clusterModule(cFeatures), dim=-1) 242 | clusterModule = clusterModule.cuda() 243 | else: 244 | qFeatures = torch.argmin(clusterModule(cFeatures), dim=-1) 245 | qFeatures = qFeatures[0].detach().cpu().numpy() 246 | 247 | # Transform to quantized line 248 | quantLine = ",".join(["-".join([str(i) for i in item]) for item in qFeatures.reshape(-1, nGroups)]) 249 | seqQuantLines.append(quantLine) 250 | 251 | bar.finish() 252 | print(f"...done {len(seqQuantLines)} files in {time()-start_time} seconds.") 253 | 254 | # Saving outputs 255 | print("") 256 | print(f"Saving outputs to {outputFile}") 257 | outLines = [] 258 | for vals, quantln in zip(seqNames, seqQuantLines): 259 | file_path = vals[1] 260 | file_name = os.path.splitext(os.path.basename(file_path))[0] 261 | outLines.append("\t".join([file_name, quantln])) 262 | with open(outputFile, "w") as f: 263 | f.write("\n".join(outLines)) 264 | 265 | if __name__ == "__main__": 266 | args = sys.argv[1:] 267 | main(args) 268 | 269 | -------------------------------------------------------------------------------- /cpc/criterion/clustering/clustering_script.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
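# Overview: this script fits a k-means (optionally DP-means) clustering module
# on features extracted with a pre-trained CPC checkpoint and saves the
# resulting cluster centres as a checkpoint, together with an _args.json file
# holding the clustering configuration.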
5 | import torch 6 | import numpy as np 7 | import time 8 | import argparse 9 | import sys 10 | import os 11 | import json 12 | from random import shuffle 13 | from cpc.criterion.clustering import kMeanCluster, kMeanGPU 14 | from pathlib import Path 15 | 16 | 17 | def getQuantile(sortedData, percent): 18 | return sortedData[int(percent * len(sortedData))] 19 | 20 | 21 | def parseArgs(argv): 22 | # Run parameters 23 | parser = argparse.ArgumentParser(description='Clustering module using kmeans or dpmeans.') 24 | parser.add_argument('pathCheckpoint', type=str, 25 | help="Path to the checkpoint of CPC module.") 26 | parser.add_argument('pathOutput', type=str, 27 | help="Path to the output clustering checkpoint.") 28 | parser.add_argument( 29 | '--pathDB', type=str, 30 | default="/datasets01/LibriSpeech/022219/train-clean-100/") 31 | parser.add_argument('-k', '--nClusters', type=int, default=50, 32 | help="Number of clusters for kmeans algorithm (default: 50).") 33 | parser.add_argument('-g', '--nGroups', type=int, default=1, 34 | help="Number of groups for kmeans algorithm (default: 1).") 35 | parser.add_argument('-n', '--MAX_ITER', type=int, default=100, 36 | help="Number of iterations (default: 150).") 37 | parser.add_argument('--recursionLevel', type=int, default=2, 38 | help="The speaker recursionLevel in the training dataset (default: 2).") 39 | parser.add_argument('--extension', type=str, default='.flac', 40 | help="The audio file extension (default: .flac).") 41 | parser.add_argument('--seqList', type=str, default=None, 42 | help="Specific the training sequence list (default: None).") 43 | parser.add_argument('--sizeWindow', type=int, default=10240, 44 | help="The size of the window when loading audio data (default: 10240).") 45 | parser.add_argument('--debug', action='store_true', 46 | help='Debug mode, only use a small number of training data.') 47 | parser.add_argument('--encoder_layer', action='store_true', 48 | help='Whether to use the output of the encoder for the clustering.') 49 | parser.add_argument('--level_gru', type=int, default=None, 50 | help='Specify the LSTM hidden level to take the representation (default: None).') 51 | parser.add_argument('--batchSizeGPU', type=int, default=50, 52 | help='Batch size of each GPU (default: 50).') 53 | parser.add_argument('--DPMean', action='store_true', 54 | help='Activate DPMeans training instead of Kmeans.') 55 | parser.add_argument('-l', '--DPLambda', type=float, default=11, 56 | help='Lambda parameter of DPMeans algo (default: 11).') 57 | parser.add_argument('--perIterSize', type=int, default=-1, 58 | help='(Depreciated) Number of items per iteration (default: -1).') 59 | parser.add_argument('--train_mode', action='store_true', 60 | help='Activate training CPC module too.') 61 | parser.add_argument('--dimReduction', type=str, default=None, 62 | help='Dimentionality reduction (default: None)') 63 | parser.add_argument('--centroidLimits', type=int, nargs=2, default=None, 64 | help='centroidLimits when using dimentionality reduction (default: None)') 65 | parser.add_argument('--getDistanceEstimation', action='store_true', 66 | help='Get distance estimation') 67 | parser.add_argument('--save', action='store_true', 68 | help='Save the intermediate checkpoints. 
The checkpoints will' 69 | 'be saved in the same directory as the output.') 70 | parser.add_argument('--load', action='store_true', 71 | help='Load the last checkpoint from the same directory as the output.') 72 | parser.add_argument('--save-last', type=int, default=5, 73 | help='Number of last checkpoints to be saved (default: 5).') 74 | 75 | parser.add_argument('--n_process_loader', type=int, default=8, 76 | help='Number of processes to call to load the ' 77 | 'dataset') 78 | parser.add_argument('--max_size_loaded', type=int, default=4000000000, 79 | help='Maximal amount of data (in byte) a dataset ' 80 | 'can hold in memory at any given time') 81 | 82 | parser.add_argument('--nullspace', action='store_true', 83 | help="Additionally load nullspace") 84 | 85 | parser.add_argument('--norm_vec_len', action='store_true', 86 | help="Normalize vector lengths.") 87 | 88 | return parser.parse_args(argv) 89 | 90 | # some example with nullspace and normalization making dists cosine: 91 | # python cpc/criterion/clustering/clustering_script.py --pathDB /pio/data/zerospeech2021/LibriSpeech/dev-clean \ 92 | # --recursionLevel 1 --nClusters 50 --MAX_ITER 10 --level_gru 2 --save --load --batchSizeGPU 200 --max_size_loaded 40000000 \ 93 | # --n_process_loader 2 --nullspace --norm_vec_len ../nspChp/64ok/checkpoint_9.pt ../nspChp/tryNew64-11/try11chp.pt 94 | 95 | 96 | if __name__ == "__main__": 97 | torch.cuda.empty_cache() 98 | 99 | import os 100 | from cpc.feature_loader import loadModel, FeatureModule 101 | from cpc.dataset import findAllSeqs, filterSeqs, AudioBatchData 102 | 103 | args = parseArgs(sys.argv[1:]) 104 | # Export absolute paths for later use 105 | args.pathCheckpoint = os.path.abspath(args.pathCheckpoint) 106 | args.pathOutput = os.path.abspath(args.pathOutput) 107 | args.pathDB = os.path.abspath(args.pathDB) 108 | 109 | if not args.load: 110 | assert os.path.exists(args.pathOutput) is False, \ 111 | f"The output file {args.pathOutput} already exists, please check the option --load !" 112 | assert os.path.exists(os.path.join(os.path.dirname(args.pathOutput), "checkpoint_last.pt")) is False, \ 113 | f"Found last_checkpoint.pt in the output directory, please check the option --load !" 
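# The remainder of the script: discover the audio sequences, build the
# AudioBatchData loader, load the CPC checkpoint as a FeatureModule,
# run kMeanGPU for up to MAX_ITER iterations, and save the resulting
# kMeanCluster state_dict to pathOutput along with an _args.json config.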
114 | 115 | print(args) 116 | seqNames, speakers = findAllSeqs(args.pathDB, 117 | speaker_level=args.recursionLevel, 118 | extension=args.extension, 119 | loadCache=True) 120 | 121 | if args.seqList is not None: 122 | seqNames = filterSeqs(args.seqList, seqNames) 123 | if args.debug: 124 | nsamples=1000 125 | print(f"Debug mode activated, get only {nsamples} samples!") 126 | shuffle(seqNames) 127 | seqNames = seqNames[:nsamples] 128 | if args.getDistanceEstimation: 129 | shuffle(seqNames) 130 | seqNames = seqNames[:5000] 131 | 132 | print("") 133 | print(f'Loading audio data at {args.pathDB}') 134 | start_time = time.time() 135 | dataset = AudioBatchData(args.pathDB, 136 | args.sizeWindow, 137 | seqNames, 138 | None, 139 | len(speakers), 140 | nProcessLoader=args.n_process_loader, 141 | MAX_SIZE_LOADED=args.max_size_loaded) 142 | print(f"Dataset loaded in {time.time()-start_time} seconds !") 143 | print("") 144 | 145 | nGPUs = torch.cuda.device_count() 146 | batchSize = args.batchSizeGPU * nGPUs 147 | trainLoader = dataset.getDataLoader(batchSize, "uniform", 148 | False, numWorkers=0) 149 | print(f"Length of dataLoader: {len(trainLoader)}") 150 | print("") 151 | 152 | 153 | if args.level_gru is None: 154 | updateConfig = None 155 | else: 156 | updateConfig = argparse.Namespace(nLevelsGRU=args.level_gru) 157 | 158 | model = loadModel([args.pathCheckpoint], updateConfig=updateConfig, load_nullspace=args.nullspace)[0] 159 | #model = loadModel([args.pathCheckpoint])[0]#, updateConfig=updateConfig)[0] 160 | 161 | featureMaker = FeatureModule(model, args.encoder_layer) 162 | print("Checkpoint loaded!") 163 | print("") 164 | 165 | if not args.train_mode: 166 | featureMaker.eval() 167 | featureMaker.cuda() 168 | 169 | # Check if dir exists 170 | if not os.path.exists(os.path.dirname(args.pathOutput)) and os.path.dirname(args.pathOutput): 171 | Path(os.path.dirname(args.pathOutput)).mkdir(parents=True, exist_ok=True) 172 | 173 | pathConfig = f"{os.path.splitext(args.pathOutput)[0]}_args.json" 174 | with open(pathConfig, 'w') as file: 175 | json.dump(vars(args), file, indent=2) 176 | 177 | out_state_dict = {} 178 | print("Starting the clustering...") 179 | start_time = time.time() 180 | clusters = kMeanGPU(trainLoader, featureMaker, args.nClusters, args.nGroups, 181 | perIterSize=args.perIterSize, 182 | MAX_ITER=args.MAX_ITER, 183 | save=args.save, load=args.load, 184 | save_dir=os.path.dirname(args.pathOutput), 185 | save_last=args.save_last, 186 | norm_vec_len=args.norm_vec_len).cpu() 187 | 188 | 189 | print(f'Ran clustering ' 190 | f'in {time.time() - start_time:.2f} seconds') 191 | 192 | clusterModule = kMeanCluster(clusters, norm_vec_len=args.norm_vec_len) 193 | 194 | out_state_dict["state_dict"] = clusterModule.state_dict() 195 | out_state_dict["encoder_layer"] = args.encoder_layer 196 | out_state_dict["n_clusters"] = args.nClusters 197 | out_state_dict['dim'] = clusters.size(2) 198 | torch.save(out_state_dict, args.pathOutput) 199 | with open(pathConfig, 'w') as file: 200 | json.dump(vars(args), file, indent=2) 201 | -------------------------------------------------------------------------------- /cpc/criterion/custom_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 6 | import math 7 | 8 | import torch.nn as nn 9 | 10 | from numpy import prod 11 | 12 | 13 | class NormalizationLayer(nn.Module): 14 | 15 | def __init__(self): 16 | super(NormalizationLayer, self).__init__() 17 | 18 | def forward(self, x, epsilon=1e-8): 19 | return x * (((x**2).mean(dim=1, keepdim=True) + epsilon).rsqrt()) 20 | 21 | 22 | def Upscale2d(x, factor=2): 23 | assert isinstance(factor, int) and factor >= 1 24 | if factor == 1: 25 | return x 26 | s = x.size() 27 | x = x.view(-1, s[1], s[2], 1, s[3], 1) 28 | x = x.expand(-1, s[1], s[2], factor, s[3], factor) 29 | x = x.contiguous().view(-1, s[1], s[2] * factor, s[3] * factor) 30 | return x 31 | 32 | 33 | def getLayerNormalizationFactor(x): 34 | r""" 35 | Get He's constant for the given layer 36 | https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf 37 | """ 38 | size = x.weight.size() 39 | fan_in = prod(size[1:]) 40 | 41 | return math.sqrt(2.0 / fan_in) 42 | 43 | 44 | class ConstrainedLayer(nn.Module): 45 | r""" 46 | A handy refactor that allows the user to: 47 | - initialize one layer's bias to zero 48 | - apply He's initialization at runtime 49 | """ 50 | 51 | def __init__(self, 52 | module, 53 | equalized=True, 54 | lrMul=1.0, 55 | initBiasToZero=True): 56 | r""" 57 | equalized (bool): if true, the layer's weight should evolve within 58 | the range (-1, 1) 59 | initBiasToZero (bool): if true, bias will be initialized to zero 60 | """ 61 | 62 | super(ConstrainedLayer, self).__init__() 63 | 64 | self.module = module 65 | self.equalized = equalized 66 | 67 | if initBiasToZero and module.bias is not None: 68 | self.module.bias.data.fill_(0) 69 | if self.equalized: 70 | self.module.weight.data.normal_(0, 1) 71 | self.weight = getLayerNormalizationFactor(self.module) * lrMul 72 | 73 | def forward(self, x): 74 | 75 | x = self.module(x) 76 | if self.equalized: 77 | x *= self.weight 78 | return x 79 | 80 | 81 | class EqualizedConv1d(ConstrainedLayer): 82 | 83 | def __init__(self, 84 | nChannelsPrevious, 85 | nChannels, 86 | kernelSize, 87 | padding=0, 88 | bias=True, 89 | stride=1, 90 | **kwargs): 91 | r""" 92 | A nn.Conv1d module with specific constraints 93 | Args: 94 | nChannelsPrevious (int): number of channels in the previous layer 95 | nChannels (int): number of channels of the current layer 96 | kernelSize (int): size of the convolutional kernel 97 | padding (int): convolution's padding 98 | bias (bool): with bias ? 99 | """ 100 | 101 | ConstrainedLayer.__init__(self, 102 | nn.Conv1d(nChannelsPrevious, nChannels, 103 | kernelSize, padding=padding, 104 | bias=bias, stride=stride), 105 | **kwargs) 106 | 107 | 108 | class EqualizedConv2d(ConstrainedLayer): 109 | 110 | def __init__(self, 111 | nChannelsPrevious, 112 | nChannels, 113 | kernelSize, 114 | padding=0, 115 | bias=True, 116 | **kwargs): 117 | r""" 118 | A nn.Conv2d module with specific constraints 119 | Args: 120 | nChannelsPrevious (int): number of channels in the previous layer 121 | nChannels (int): number of channels of the current layer 122 | kernelSize (int): size of the convolutional kernel 123 | padding (int): convolution's padding 124 | bias (bool): with bias ? 
125 | """ 126 | 127 | ConstrainedLayer.__init__(self, 128 | nn.Conv2d(nChannelsPrevious, nChannels, 129 | kernelSize, padding=padding, 130 | bias=bias), 131 | **kwargs) 132 | 133 | 134 | class EqualizedLinear(ConstrainedLayer): 135 | 136 | def __init__(self, 137 | nChannelsPrevious, 138 | nChannels, 139 | bias=True, 140 | **kwargs): 141 | r""" 142 | A nn.Linear module with specific constraints 143 | Args: 144 | nChannelsPrevious (int): number of channels in the previous layer 145 | nChannels (int): number of channels of the current layer 146 | bias (bool): with bias ? 147 | """ 148 | 149 | ConstrainedLayer.__init__(self, 150 | nn.Linear(nChannelsPrevious, nChannels, 151 | bias=bias), **kwargs) 152 | -------------------------------------------------------------------------------- /cpc/criterion/seq_alignment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import progressbar 6 | import torch 7 | from multiprocessing import Lock, Manager, Process 8 | from copy import deepcopy 9 | 10 | 11 | def beam_search(score_preds, nKeep, blankLabel): 12 | 13 | T, P = score_preds.shape 14 | beams = set(['']) 15 | pb_t_1 = {"": 1} 16 | pnb_t_1 = {"": 0} 17 | 18 | def getLastNumber(b): 19 | return int(b.split(',')[-1]) 20 | 21 | for t in range(T): 22 | 23 | nextBeams = set() 24 | pb_t = {} 25 | pnb_t = {} 26 | for i_beam, b in enumerate(beams): 27 | if b not in pb_t: 28 | pb_t[b] = 0 29 | pnb_t[b] = 0 30 | 31 | if len(b) > 0: 32 | pnb_t[b] += pnb_t_1[b] * score_preds[t, getLastNumber(b)] 33 | pb_t[b] = (pnb_t_1[b] + pb_t_1[b]) * score_preds[t, blankLabel] 34 | nextBeams.add(b) 35 | 36 | for c in range(P): 37 | if c == blankLabel: 38 | continue 39 | 40 | b_ = b + "," + str(c) 41 | if b_ not in pb_t: 42 | pb_t[b_] = 0 43 | pnb_t[b_] = 0 44 | 45 | if b != "" and getLastNumber(b) == c: 46 | pnb_t[b_] += pb_t_1[b] * score_preds[t, c] 47 | else: 48 | pnb_t[b_] += (pb_t_1[b] + pnb_t_1[b]) * score_preds[t, c] 49 | nextBeams.add(b_) 50 | 51 | allPreds = [(pb_t[b] + pnb_t[b], b) for b in nextBeams] 52 | allPreds.sort(reverse=True) 53 | 54 | beams = [x[1] for x in allPreds[:nKeep]] 55 | pb_t_1 = deepcopy(pb_t) 56 | pnb_t_1 = deepcopy(pnb_t) 57 | 58 | output = [] 59 | for score, x in allPreds[:nKeep]: 60 | output.append((score, [int(y) for y in x.split(',') if len(y) > 0])) 61 | return output 62 | 63 | 64 | def collapseLabelChain(inputLabels): 65 | 66 | # Shape N,T 67 | N, T = inputLabels.size() 68 | outSizes = torch.zeros(N, device=inputLabels.device, dtype=torch.int64) 69 | output = [] 70 | for l in range(N): 71 | status = inputLabels[l, :-1] - inputLabels[l, 1:] 72 | status = torch.cat([torch.ones(1, device=status.device, 73 | dtype=status.dtype), 74 | status], dim=0) 75 | outSizes[l] = (status != 0).sum() 76 | output.append(inputLabels[l][status != 0]) 77 | maxSize = int(outSizes.max().item()) 78 | paddedOutput = torch.zeros(N, maxSize, 79 | device=inputLabels.device, 80 | dtype=torch.int64) 81 | 82 | for l in range(N): 83 | S = int(outSizes[l]) 84 | paddedOutput[l, :S] = output[l] 85 | 86 | return paddedOutput, outSizes 87 | 88 | 89 | def NeedlemanWunschAlignScore(seq1, seq2, d, m, r, normalize=True): 90 | 91 | N1, N2 = len(seq1), len(seq2) 92 | 93 | # Fill up the errors 94 | tmpRes_ = [[None for x in range(N2 + 1)] for y in range(N1 + 1)] 95 | for i in range(N1 + 1): 96 | 
tmpRes_[i][0] = i * d 97 | for j in range(N2 + 1): 98 | tmpRes_[0][j] = j * d 99 | 100 | for i in range(N1): 101 | for j in range(N2): 102 | 103 | match = r if seq1[i] == seq2[j] else m 104 | v1 = tmpRes_[i][j] + match 105 | v2 = tmpRes_[i + 1][j] + d 106 | v3 = tmpRes_[i][j + 1] + d 107 | tmpRes_[i + 1][j + 1] = max(v1, max(v2, v3)) 108 | 109 | i = j = 0 110 | res = -tmpRes_[N1][N2] 111 | if normalize: 112 | res /= float(N1) 113 | return res 114 | 115 | 116 | def get_seq_PER(seqLabels, detectedLabels): 117 | return NeedlemanWunschAlignScore(seqLabels, detectedLabels, -1, -1, 0, 118 | normalize=True) 119 | 120 | 121 | def getPER(dataLoader, featureMaker, blankLabel): 122 | 123 | bar = progressbar.ProgressBar(len(dataLoader)) 124 | bar.start() 125 | 126 | out = 0 127 | n_items = 0 128 | n_keep_beam_search = 100 129 | for index, data in enumerate(dataLoader): 130 | 131 | bar.update(index) 132 | with torch.no_grad(): 133 | output = featureMaker(data).cpu().numpy() 134 | labels = data[1] 135 | labels, targetSize = collapseLabelChain(labels) 136 | lock = Lock() 137 | 138 | def per(rank, outScore): 139 | S = int(targetSize[rank]) 140 | seqLabels = labels[rank, :S] 141 | preds = beam_search(output[rank], 142 | n_keep_beam_search, blankLabel)[0][1] 143 | value = get_seq_PER(seqLabels, preds) 144 | with lock: 145 | outScore.value += value 146 | 147 | manager = Manager() 148 | outScore = manager.Value('f', 0.) 149 | 150 | N, S, D = output.shape 151 | processes = [] 152 | for rank in range(N): 153 | p = Process( 154 | target=per, args=(rank, outScore)) 155 | p.start() 156 | processes.append(p) 157 | for p in processes: 158 | p.join() 159 | 160 | out += outScore.value 161 | n_items += N 162 | 163 | bar.finish() 164 | return (out / n_items) 165 | -------------------------------------------------------------------------------- /cpc/eval/ABX.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | import argparse 6 | import sys 7 | import torch 8 | import json 9 | from pathlib import Path 10 | import ABX.abx_group_computation as abx_g 11 | import ABX.abx_iterators as abx_it 12 | from cpc.dataset import findAllSeqs 13 | from cpc.feature_loader import buildFeature, FeatureModule, loadModel 14 | 15 | 16 | def reduce_sparse_data(quotient, divisor): 17 | return quotient / (1e-08 * (divisor == 0) + divisor) 18 | 19 | 20 | def ABX(feature_function, 21 | path_item_file, 22 | seq_list, 23 | distance_mode, 24 | step_feature, 25 | modes, 26 | seq_norm=True, 27 | cuda=False, 28 | max_x_across=5, 29 | max_size_group=30): 30 | 31 | # ABX dataset 32 | ABXDataset = abx_it.ABXFeatureLoader(path_item_file, seq_list, 33 | feature_function, step_feature, True) 34 | 35 | if cuda: 36 | ABXDataset.cuda() 37 | 38 | # Distance function 39 | distance_function = abx_g.get_distance_function_from_name(distance_mode) 40 | 41 | # Output 42 | scores = {} 43 | 44 | # ABX within 45 | if 'within' in modes: 46 | print("Computing ABX within speakers...") 47 | ABXIterator = ABXDataset.get_iterator('within', max_size_group) 48 | group_confusion = abx_g.get_abx_scores_dtw_on_group(ABXIterator, 49 | distance_function, 50 | ABXIterator.symmetric) 51 | n_data = group_confusion._values().size(0) 52 | index_ = torch.sparse.LongTensor(group_confusion._indices(), 53 | torch.ones((n_data), 54 | dtype=torch.float), 55 | group_confusion.size()) 56 | divisor_context = torch.sparse.sum(index_, dim=3).to_dense() 57 | group_confusion = torch.sparse.sum(group_confusion, dim=3).to_dense() 58 | group_confusion = reduce_sparse_data(group_confusion, divisor_context) 59 | S, p1, p2 = group_confusion.size() 60 | 61 | index_speaker = divisor_context > 0 62 | divisor_speaker = index_speaker.sum(dim=0) 63 | phone_confusion = reduce_sparse_data(group_confusion.sum(dim=0), 64 | divisor_speaker) 65 | 66 | scores['within'] = (phone_confusion.sum() / 67 | (divisor_speaker > 0).sum()).item() 68 | print(f"...done. ABX within : {scores['within']}") 69 | 70 | # ABX across 71 | if 'across' in modes: 72 | print("Computing ABX across speakers...") 73 | ABXIterator = ABXDataset.get_iterator('across', max_size_group) 74 | ABXIterator.max_x = max_x_across 75 | group_confusion = abx_g.get_abx_scores_dtw_on_group(ABXIterator, 76 | distance_function, 77 | ABXIterator.symmetric) 78 | n_data = group_confusion._values().size(0) 79 | index_ = torch.sparse.LongTensor(group_confusion._indices(), 80 | torch.ones((n_data), 81 | dtype=torch.float), 82 | group_confusion.size()) 83 | divisor_context = torch.sparse.sum(index_, dim=[3, 4]).to_dense() 84 | group_confusion = torch.sparse.sum( 85 | group_confusion, dim=[3, 4]).to_dense() 86 | group_confusion = reduce_sparse_data(group_confusion, divisor_context) 87 | S, p1, p2 = group_confusion.size() 88 | 89 | index_speaker = divisor_context > 0 90 | divisor_speaker = index_speaker.sum(dim=0) 91 | phone_confusion = reduce_sparse_data(group_confusion.sum(dim=0), 92 | divisor_speaker) 93 | scores['across'] = (phone_confusion.sum() / 94 | (divisor_speaker > 0).sum()).item() 95 | print(f"...done. 
ABX across : {scores['across']}") 96 | 97 | return scores 98 | 99 | 100 | def update_base_parser(parser): 101 | parser.add_argument('--debug', action='store_true') 102 | parser.add_argument('--feature_size', type=float, default=0.01, 103 | help="Size (in s) of one feature") 104 | parser.add_argument('--cuda', action='store_true', 105 | help="Use the GPU to compute distances") 106 | parser.add_argument('--mode', type=str, default='all', 107 | choices=['all', 'within', 'across'], 108 | help="Type of ABX score to compute") 109 | parser.add_argument("--max_size_group", type=int, default=10, 110 | help="Max size of a group while computing the " 111 | "ABX score") 112 | parser.add_argument("--max_x_across", type=int, default=5, 113 | help="When computing the ABX across score, maximum " 114 | "number of speakers X to sample per couple A,B") 115 | parser.add_argument("--out", type=str, default=None, 116 | help="Path where the results should be saved") 117 | 118 | 119 | def parse_args(argv): 120 | 121 | base_parser = argparse.ArgumentParser(description='ABX metric') 122 | 123 | subparsers = base_parser.add_subparsers(dest='load') 124 | parser_checkpoint = subparsers.add_parser('from_checkpoint') 125 | update_base_parser(parser_checkpoint) 126 | parser_checkpoint.add_argument('path_checkpoint', type=str, 127 | help="Path to the model's checkpoint") 128 | parser_checkpoint.add_argument('path_item_file', type=str, 129 | help="Path to the ABX .item file containing " 130 | "the triplet labels") 131 | parser_checkpoint.add_argument('path_dataset', type=str, 132 | help="Path to the dataset") 133 | parser_checkpoint.add_argument('--seq_norm', action='store_true', 134 | help='If activated, normalize each batch ' 135 | 'of feature across the time channel before ' 136 | 'computing ABX.') 137 | parser_checkpoint.add_argument('--max_size_seq', default=64000, type=int, 138 | help='Maximal number of frames to consider ' 139 | 'when computing a batch of features.') 140 | parser_checkpoint.add_argument('--strict', action='store_true', 141 | help='If activated, each batch of feature ' 142 | 'will contain exactly max_size_seq frames.') 143 | parser_checkpoint.add_argument('--file_extension', type=str, 144 | default='.wav', 145 | help='Extension of each audio file in the ' 146 | 'dataset.') 147 | parser_checkpoint.add_argument('--get_encoded', action='store_true', 148 | help='If activated, compute the ABX score ' 149 | 'using the output of the encoder network.') 150 | 151 | parser_db = subparsers.add_parser('from_pre_computed') 152 | update_base_parser(parser_db) 153 | parser_db.add_argument('path_features', type=str, 154 | help="Path to pre-computed torch features (.pt)") 155 | parser_db.add_argument('--file_extension', type=str, 156 | default='.pt', help='Extension of each feature ' 157 | 'in the dataset') 158 | 159 | # multi-gpu / multi-node 160 | return base_parser.parse_args(argv) 161 | 162 | 163 | def main(argv): 164 | 165 | args = parse_args(argv) 166 | 167 | if args.load == 'from_checkpoint': 168 | # Checkpoint 169 | model = loadModel([args.path_checkpoint])[0] 170 | model.gAR.keepHidden = True 171 | # Feature maker 172 | feature_maker = FeatureModule(model, args.get_encoded).cuda().eval() 173 | 174 | def feature_function(x): return buildFeature(feature_maker, x, 175 | seqNorm=args.seq_norm, 176 | strict=args.strict, 177 | maxSizeSeq=args.max_size_seq) 178 | elif args.load == 'from_pre_computed': 179 | def feature_function(x): return torch.load(x, 'cpu') 180 | 181 | # Modes 182 | if args.mode == 'all': 183 | modes 
= ["within", "across"] 184 | else: 185 | modes = [args.mode] 186 | 187 | distance_mode = 'cosine' 188 | 189 | step_feature = 1 / args.feature_size 190 | 191 | # Get the list of sequences 192 | seq_list, _ = findAllSeqs(args.path_dataset, extension=args.file_extension) 193 | seq_list = [(str(Path(x).stem), str(Path(args.path_dataset) / x)) 194 | for (_, x) in seq_list] 195 | 196 | if args.debug: 197 | seq_list = seq_list[:1000] 198 | 199 | scores = ABX(feature_function, args.path_item_file, 200 | seq_list, distance_mode, 201 | step_feature, modes, 202 | cuda=args.cuda, 203 | seq_norm=args.seq_norm, 204 | max_x_across=args.max_x_across, 205 | max_size_group=args.max_size_group) 206 | 207 | out_dir = Path(args.path_checkpoint).parent if args.out is None \ 208 | else Path(args.out) 209 | out_dir.mkdir(exist_ok=True) 210 | 211 | path_score = out_dir / 'ABX_scores.json' 212 | with open(path_score, 'w') as file: 213 | json.dump(scores, file, indent=2) 214 | 215 | path_args = out_dir / 'ABX_args.json' 216 | with open(path_args, 'w') as file: 217 | json.dump(vars(args), file, indent=2) 218 | 219 | 220 | if __name__ == "__main__": 221 | args = sys.argv[1:] 222 | main(args) 223 | -------------------------------------------------------------------------------- /cpc/eval/ABX/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /cpc/eval/ABX/abx_group_computation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch 6 | import math 7 | from . 
import dtw 8 | import progressbar 9 | 10 | 11 | def get_distance_function_from_name(name_str): 12 | if name_str == 'euclidian': 13 | return get_euclidian_distance_batch 14 | if name_str == 'cosine': 15 | return get_cosine_distance_batch 16 | raise ValueError(f"Invalid distance mode") 17 | 18 | 19 | def check_dtw_group_validity(a, b, x): 20 | assert(len(a.size()) == len(b.size())) 21 | assert(len(a.size()) == len(x.size())) 22 | assert(a.size(2) == x.size(2)) 23 | assert(a.size(2) == b.size(2)) 24 | 25 | 26 | def get_cosine_distance_batch(a1, a2, epsilon=1e-8): 27 | r""" a1 and a2 must be normalized""" 28 | N1, S1, D = a1.size() # Batch x Seq x Channel 29 | N2, S2, D = a2.size() # Batch x Seq x Channel 30 | 31 | prod = (a1.view(N1, 1, S1, 1, D)) * (a2.view(1, N2, 1, S2, D)) 32 | # Sum accross the channel dimension 33 | prod = torch.clamp(prod.sum(dim=4), -1, 1).acos() / math.pi 34 | 35 | return prod 36 | 37 | 38 | def get_euclidian_distance_batch(a1, a2): 39 | N1, S1, D = a1.size() 40 | N2, S2, D = a2.size() 41 | diff = a1.view(N1, 1, S1, 1, D) - a2.view(1, N2, 1, S2, D) 42 | return torch.sqrt((diff**2).sum(dim=4)) 43 | 44 | 45 | def get_distance_group_dtw(a1, a2, size1, size2, 46 | ignore_diag=False, symmetric=False, 47 | distance_function=get_cosine_distance_batch): 48 | 49 | N1, S1, D = a1.size() 50 | N2, S2, D = a2.size() 51 | if size1.size(0) != N1: 52 | print(a1.size(), size1.size()) 53 | print(a2.size(), size2.size()) 54 | assert(size1.size(0) == N1) 55 | assert(size2.size(0) == N2) 56 | 57 | distance_mat = distance_function(a1, a2).detach().cpu().numpy() 58 | return dtw.dtw_batch(a1, a2, size1, size2, 59 | distance_mat, 60 | ignore_diag, symmetric) 61 | 62 | 63 | def get_theta_group_dtw(a, b, x, sa, sb, sx, distance_function, symmetric): 64 | 65 | check_dtw_group_validity(a, b, x) 66 | 67 | dxb = get_distance_group_dtw( 68 | x, b, sx, sb, distance_function=distance_function) 69 | dxa = get_distance_group_dtw(x, a, sx, sa, ignore_diag=symmetric, 70 | symmetric=symmetric, 71 | distance_function=distance_function) 72 | 73 | Nx, Na = dxa.size() 74 | Nx, Nb = dxb.size() 75 | 76 | if symmetric: 77 | n_pos = Na * (Na - 1) 78 | max_val = dxb.max().item() 79 | for i in range(Na): 80 | dxa[i, i] = max_val + 1 81 | else: 82 | n_pos = Na * Nx 83 | 84 | dxb = dxb.view(Nx, 1, Nb).expand(Nx, Na, Nb) 85 | dxa = dxa.view(Nx, Na, 1).expand(Nx, Na, Nb) 86 | 87 | sc = (dxa < dxb).sum() + 0.5 * (dxa == dxb).sum() 88 | sc /= (n_pos * Nb) 89 | 90 | return sc.item() 91 | 92 | 93 | def loc_dtw(data, distance_function, symmetric): 94 | coords, group_a, group_b, group_x = data 95 | group_a_data, group_a_size = group_a 96 | group_b_data, group_b_size = group_b 97 | group_x_data, group_x_size = group_x 98 | theta = get_theta_group_dtw(group_a_data, 99 | group_b_data, 100 | group_x_data, 101 | group_a_size, 102 | group_b_size, 103 | group_x_size, 104 | distance_function, 105 | symmetric) 106 | 107 | return (coords, 1 - theta) 108 | 109 | 110 | def get_abx_scores_dtw_on_group(group_iterator, 111 | distance_function, 112 | symmetric): 113 | 114 | data_list = [] 115 | coords_list = [] 116 | bar = progressbar.ProgressBar(maxval=len(group_iterator)) 117 | bar.start() 118 | 119 | with torch.no_grad(): 120 | for index, group in enumerate(group_iterator): 121 | bar.update(index) 122 | coords, abx = loc_dtw(group, distance_function, symmetric) 123 | data_list.append(abx) 124 | coords_list.append(coords) 125 | bar.finish() 126 | 127 | return torch.sparse.FloatTensor(torch.LongTensor(coords_list).t(), 128 | 
torch.FloatTensor(data_list), 129 | group_iterator.get_board_size()) 130 | -------------------------------------------------------------------------------- /cpc/eval/ABX/dtw.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch 6 | import numpy as np 7 | cimport numpy as np 8 | cimport cython 9 | from cpython cimport bool 10 | ctypedef np.float32_t CTYPE_t # cost type 11 | ctypedef np.intp_t IND_t # array index type 12 | CTYPE = np.float32 # cost type 13 | 14 | 15 | 16 | def dtw_batch(x,y, sx, sy, dist_mat, ignore_diag=False, symetric=False): 17 | 18 | Nx = dist_mat.shape[0] 19 | Ny = dist_mat.shape[1] 20 | 21 | out = torch.zeros((Nx, Ny)) 22 | 23 | for i in range(Nx): 24 | start_index = i if symetric else 0 25 | i_sx = sx[i] 26 | for j in range(start_index, Ny): 27 | 28 | j_sy = sy[j] 29 | if ignore_diag and i == j: 30 | continue 31 | distance = _dtw(i_sx, j_sy, dist_mat[i, j, :i_sx, :j_sy], True) 32 | out[i][j] = distance 33 | if symetric and i != j: 34 | out[j][i] = out[i][j] 35 | 36 | return out 37 | 38 | 39 | 40 | cpdef _dtw(IND_t N, IND_t M, CTYPE_t[:,:] dist_array, bool normalized): 41 | cdef IND_t i, j 42 | cdef CTYPE_t[:,:] cost = np.empty((N, M), dtype=CTYPE) 43 | cdef CTYPE_t final_cost, c_diag, c_left, c_up 44 | # initialization 45 | cost[0,0] = dist_array[0,0] 46 | for i in range(1,N): 47 | cost[i,0] = dist_array[i,0] + cost[i-1,0] 48 | for j in range(1,M): 49 | cost[0,j] = dist_array[0,j] + cost[0,j-1] 50 | # the dynamic programming loop 51 | for i in range(1,N): 52 | for j in range(1,M): 53 | cost[i, j] = dist_array[i, j] + min(cost[i-1,j], cost[i - 1, j - 1], cost[i, j - 1]) 54 | 55 | final_cost = cost[N-1, M-1] 56 | if normalized: 57 | path_len = 1 58 | i = N-1 59 | j = M-1 60 | while i > 0 and j > 0: 61 | c_up = cost[i - 1, j] 62 | c_left = cost[i, j-1] 63 | c_diag = cost[i-1, j-1] 64 | if c_diag <= c_left and c_diag <= c_up: 65 | i -= 1 66 | j -= 1 67 | elif c_left <= c_up: 68 | j -= 1 69 | else: 70 | i -= 1 71 | path_len += 1 72 | if i == 0: 73 | path_len += j 74 | if j == 0: 75 | path_len += i 76 | final_cost /= path_len 77 | return final_cost 78 | -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/2107.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/eval/ABX/test_data/2107.npy -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/23.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/eval/ABX/test_data/23.npy -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/407.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/eval/ABX/test_data/407.npy -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/42.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/eval/ABX/test_data/42.npy -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/dummy_item_file.item: -------------------------------------------------------------------------------- 1 | #file onset offset #phone prev-phone next-phone speaker 2 | 2107 0.3225 0.5225 n ae d 8193 3 | 2107 0.4225 0.5925 d n l 2222 4 | 42 0.4525 0.6525 d n l 2222 5 | 42 0.5225 0.7325 ih l n 8193 6 | 42 0.5925 0.8725 n ih s 8193 7 | 23 0.6525 1.1025 s n ax 8193 8 | 23 0.7325 1.1925 s n ax 2222 9 | 407 0.8725 1.2425 s ax dh 2222 10 | 2107 1.1025 1.2925 dh s ax 12 11 | -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/dummy_item_within.item: -------------------------------------------------------------------------------- 1 | #file onset offset #phone prev-phone next-phone speaker 2 | 2107 0. 0.2 n p d 8193 3 | 2107 0.3225 0.5225 n ae d 8193 4 | 2107 0.6 0.75 n ae d 8193 5 | 2107 0.4225 0.5925 d n l 2222 6 | 42 0.4525 0.6525 d n l 2222 7 | 42 0.1301 0.2501 q n l 2222 8 | 42 0.5225 0.7325 d n l 8193 9 | 42 0.0025 0.3561 d p l 2222 10 | 42 0.5925 0.8725 d p l 8193 11 | -------------------------------------------------------------------------------- /cpc/eval/ABX/unit_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import unittest 6 | import torch 7 | from nose.tools import eq_, ok_ 8 | from . import abx_group_computation 9 | from . import abx_iterators 10 | from pathlib import Path 11 | import numpy as np 12 | import math 13 | 14 | 15 | class TestDistancesDTW(unittest.TestCase): 16 | 17 | def testDTWFunction(self): 18 | X = torch.tensor([[[0, 1], [0, 0], [1, 1], [42, 42]], 19 | [[0, 2], [0, 1], [1, 1], [-1, 0]], 20 | [[0, 0], [0, 1], [0, 0], [21, 211]]], 21 | dtype=torch.float) 22 | 23 | X_size = torch.tensor([3, 4, 2]) 24 | 25 | Y = torch.tensor([[[0, 1], [1, 2], [0, 0]]], dtype=torch.float) 26 | Y_size = torch.tensor([3]) 27 | 28 | distance_mode = abx_group_computation.get_euclidian_distance_batch 29 | dist = abx_group_computation.get_distance_group_dtw(X, Y, 30 | X_size, Y_size, 31 | distance_function=distance_mode) 32 | eq_(dist.size(), (3, 1)) 33 | expected_dist = [[(math.sqrt(2)) / 2], [3 / 4], 34 | [(2 + math.sqrt(2)) / 3]] 35 | for i in range(3): 36 | ok_(abs(expected_dist[i][0] - dist[i].item()) < 1e-4) 37 | 38 | def testThetaDTWFunctionSymetric(self): 39 | A = torch.tensor([[[0, 1], [0, 0], [1, 1], [42, 42]], 40 | [[0, 2], [0, 1], [1, 1], [-1, 0]], 41 | [[0, 0], [0, 1], [0, 0], [21, 211]]], 42 | dtype=torch.float) 43 | A_size = torch.tensor([3, 4, 2]) 44 | B = torch.tensor([[[0, 1], [1, 2], [0, 0]]], dtype=torch.float) 45 | B_size = torch.tensor([3]) 46 | 47 | distance_mode = abx_group_computation.get_euclidian_distance_batch 48 | symetric = True 49 | theta = abx_group_computation.get_theta_group_dtw(A, B, A, A_size, 50 | B_size, A_size, 51 | distance_mode, 52 | symetric) 53 | eq_(theta, 0.5) 54 | 55 | 56 | class testSingularityNormalization(unittest.TestCase): 57 | 58 | def testCosineNormalized(self): 59 | x = torch.tensor([[[1., 0., 0., 0.], [0., 0., 0., 0.]], 60 | [[0., 0., -1., 0.], [0.5, -0.5, 0.5, -0.5]]]) 61 | y = torch.tensor( 62 | [[[-0.5, -0.5, -0.5, 0.5], 
[0., 0., 0., 0.], [0., 1., 0., 0.]]]) 63 | norm_x = abx_iterators.normalize_with_singularity(x) 64 | norm_y = abx_iterators.normalize_with_singularity(y) 65 | dist = abx_group_computation.get_cosine_distance_batch(norm_x, norm_y) 66 | 67 | eq_(dist.size(), (2, 1, 2, 3)) 68 | ok_(abs(dist[0, 0, 0, 0] - 0.6667) < 1e-4) 69 | ok_(abs(dist[0, 0, 0, 1] - 1.) < 1e-4) 70 | ok_(abs(dist[0, 0, 0, 2] - 0.5) < 1e-4) 71 | 72 | ok_(abs(dist[0, 0, 1, 0] - 1) < 1e-4) 73 | ok_(abs(dist[0, 0, 1, 1]) < 1e-4) 74 | ok_(abs(dist[0, 0, 1, 2] - 1) < 1e-4) 75 | 76 | ok_(abs(dist[1, 0, 0, 0] - 0.3333) < 1e-4) 77 | ok_(abs(dist[1, 0, 0, 1] - 1.) < 1e-4) 78 | ok_(abs(dist[1, 0, 0, 2] - 0.5) < 1e-4) 79 | 80 | ok_(abs(dist[1, 0, 1, 0]-0.6667) < 1e-4) 81 | ok_(abs(dist[1, 0, 1, 1] - 1.) < 1e-4) 82 | ok_(abs(dist[1, 0, 1, 2] - 0.6667) < 1e-4) 83 | 84 | 85 | class testGroupMaker(unittest.TestCase): 86 | 87 | def test1DGroupMaker(self): 88 | 89 | data = [[0], [1], [2], [3], [4], [2], [2], [2]] 90 | order = [0] 91 | out_index, out_data = abx_iterators.get_features_group(data, order) 92 | 93 | expected_index = [0, 1, 2, 5, 6, 7, 3, 4] 94 | eq_(out_index, expected_index) 95 | 96 | expected_output = [(0, 1), (1, 2), (2, 6), (6, 7), (7, 8)] 97 | eq_(out_data, expected_output) 98 | 99 | def test2DGroupMaker(self): 100 | 101 | data = [[0, 1], [1, 2], [2, 3], [3, 3], 102 | [4, 0], [2, 2], [4, 2], [2, 2], [0, 3]] 103 | 104 | order = [1, 0] 105 | out_index, out_data = abx_iterators.get_features_group(data, order) 106 | expected_index = [4, 0, 1, 5, 7, 6, 8, 2, 3] 107 | eq_(out_index, expected_index) 108 | expected_output = [[(0, 1)], 109 | [(1, 2)], 110 | [(2, 3), (3, 5), (5, 6)], 111 | [(6, 7), (7, 8), (8, 9)]] 112 | eq_(out_data, expected_output) 113 | 114 | def test3DGroupMaker(self): 115 | 116 | data = [[0, 0, 0, 1], 117 | [41, 1, 0, 2], 118 | [-23, 0, 3, 1], 119 | [220, 1, -2, 3], 120 | [40, 2, 1, 0], 121 | [200, 0, 0, 1]] 122 | 123 | order = [1, 3, 2] 124 | out_index, out_data = abx_iterators.get_features_group(data, order) 125 | expected_index = [0, 5, 2, 1, 3, 4] 126 | eq_(out_index, expected_index) 127 | 128 | expected_output = [[[(0, 2), (2, 3)]], [ 129 | [(3, 4)], [(4, 5)]], [[(5, 6)]]] 130 | eq_(out_data, expected_output) 131 | 132 | 133 | class testItemLoader(unittest.TestCase): 134 | 135 | def setUp(self): 136 | self.test_data_dir = Path(__file__).parent / 'test_data' 137 | 138 | def testLoadItemFile(self): 139 | path_item_file = self.test_data_dir / "dummy_item_file.item" 140 | out, context_match, phone_match, speaker_match = \ 141 | abx_iterators.load_item_file(path_item_file) 142 | 143 | eq_(len(out), 4) 144 | eq_(len(phone_match), 5) 145 | eq_(len(speaker_match), 3) 146 | 147 | expected_phones = {'n': 0, 'd': 1, 'ih': 2, 148 | 's': 3, 'dh': 4} 149 | eq_(phone_match, expected_phones) 150 | 151 | expected_speakers = {'8193': 0, '2222': 1, '12': 2} 152 | eq_(speaker_match, expected_speakers) 153 | 154 | expected_context = {'ae+d': 0, 'n+l': 1, 'l+n': 2, 'ih+s': 3, 155 | 'n+ax': 4, 'ax+dh': 5, 's+ax': 6} 156 | eq_(context_match, expected_context) 157 | 158 | expected_output = {'2107': [[0.3225, 0.5225, 0, 0, 0], 159 | [0.4225, 0.5925, 1, 1, 1], 160 | [1.1025, 1.2925, 6, 4, 2]], 161 | '42': [[0.4525, 0.6525, 1, 1, 1], 162 | [0.5225, 0.7325, 2, 2, 0], 163 | [0.5925, 0.8725, 3, 0, 0]], 164 | '23': [[0.6525, 1.1025, 4, 3, 0], 165 | [0.7325, 1.1925, 4, 3, 1]], 166 | '407': [[0.8725, 1.2425, 5, 3, 1]]} 167 | 168 | eq_(expected_output, out) 169 | 170 | def testLoadWithinItemFile(self): 171 | path_item_file = 
self.test_data_dir / "dummy_item_within.item" 172 | out, context_match, phone_match, speaker_match = \ 173 | abx_iterators.load_item_file(path_item_file) 174 | 175 | expected_output = {'2107': [[0., 0.2, 0, 0, 0], 176 | [0.3225, 0.5225, 1, 0, 0], 177 | [0.6, 0.75, 1, 0, 0], 178 | [0.4225, 0.5925, 2, 1, 1]], 179 | '42': [[0.4525, 0.6525, 2, 1, 1], 180 | [0.1301, 0.2501, 2, 2, 1], 181 | [0.5225, 0.7325, 2, 1, 0], 182 | [0.0025, 0.3561, 3, 1, 1], 183 | [0.5925, 0.8725, 3, 1, 0]]} 184 | eq_(expected_output, out) 185 | 186 | 187 | class testABXFeatureLoader(unittest.TestCase): 188 | 189 | def setUp(self): 190 | self.stepFeature = 10 191 | self.test_data_dir = Path(__file__).parent / 'test_data' 192 | 193 | def dummy_feature_maker(path_file, *args): 194 | data = torch.tensor(np.load(path_file)) 195 | assert(len(data.size()) == 1) 196 | return data.view(1, -1, 1) 197 | 198 | def testBaseLoader(self): 199 | seqList = [('2107', self.test_data_dir / '2107.npy'), 200 | ('42', self.test_data_dir / '42.npy'), 201 | ('23', self.test_data_dir / '23.npy'), 202 | ('407', self.test_data_dir / '407.npy')] 203 | 204 | dataset = abx_iterators.ABXFeatureLoader(self.test_data_dir / "dummy_item_file.item", 205 | seqList, 206 | testABXFeatureLoader.dummy_feature_maker, 207 | self.stepFeature, 208 | False) 209 | print(dataset.features) 210 | eq_(dataset.feature_dim, 1) 211 | eq_(len(dataset), 9) 212 | eq_(len(dataset.data.size()), 2) 213 | eq_(len(dataset.data), 16) 214 | data, size, coords = dataset[0] 215 | eq_(size, 1) 216 | eq_(coords, (0, 0, 0)) 217 | eq_(data.tolist(), [[3]]) 218 | 219 | data, size, coords = dataset[3] 220 | eq_(size, 1) 221 | eq_(coords, (1, 1, 1)) 222 | eq_(data.tolist(), [[5]]) 223 | 224 | def testWithinIterator(self): 225 | seqList = [('2107', self.test_data_dir / '2107.npy'), 226 | ('42', self.test_data_dir / '42.npy')] 227 | dataset = abx_iterators.ABXFeatureLoader(self.test_data_dir / "dummy_item_within.item", 228 | seqList, 229 | testABXFeatureLoader.dummy_feature_maker, 230 | self.stepFeature, 231 | False) 232 | iterator = dataset.get_iterator('within', 40) 233 | eq_(iterator.index_csp, [0, 1, 2, 6, 3, 4, 5, 8, 7]) 234 | eq_(iterator.groups_csp, [[[(0, 1)]], [[(1, 3)]], [ 235 | [(3, 4)], [(4, 6), (6, 7)]], [[(7, 8)], [(8, 9)]]]) 236 | eq_(len(iterator), 1) 237 | 238 | it = iter(iterator) 239 | c1, a_01, b_01, x_01 = next(it) 240 | eq_(c1, (1, 1, 2, 2)) 241 | a_1, s_a = a_01 242 | eq_(s_a.tolist(), [1, 1]) 243 | eq_(a_1.tolist(), [[[4.]], [[5.]]]) 244 | eq_(x_01[0].tolist(), a_1.tolist()) 245 | eq_(x_01[1].tolist(), s_a.tolist()) 246 | eq_(b_01[0].tolist(), [[[1.]]]) 247 | eq_(b_01[1].item(), 1) 248 | 249 | eq_(next(it, False), False) 250 | eq_(iterator.get_board_size(), (2, 3, 3, 4)) 251 | -------------------------------------------------------------------------------- /cpc/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /cpc/eval/build_zeroSpeech_features.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | import os 6 | import json 7 | import torch 8 | import progressbar 9 | import argparse 10 | import numpy as np 11 | 12 | from cpc.dataset import findAllSeqs 13 | from cpc.feature_loader import buildFeature, FeatureModule, \ 14 | ModelPhoneCombined, loadSupervisedCriterion, loadModel 15 | 16 | 17 | def getArgs(pathCheckpoints): 18 | pathArgs = os.path.join(os.path.dirname(pathCheckpoints), 19 | "checkpoint_args.json") 20 | with open(pathArgs, 'rb') as file: 21 | return json.load(file) 22 | 23 | 24 | def buildAllFeature(featureMaker, pathDB, pathOut, 25 | seqList, stepSize=0.01, strict=False, 26 | maxSizeSeq=64000, format='fea', 27 | seqNorm=False): 28 | 29 | totSeqs = len(seqList) 30 | startStep = stepSize / 2 31 | bar = progressbar.ProgressBar(maxval=totSeqs) 32 | bar.start() 33 | for nseq, seqPath in enumerate(seqList): 34 | bar.update(nseq) 35 | feature = buildFeature(featureMaker, 36 | os.path.join(pathDB, seqPath), 37 | strict=strict or seqNorm, 38 | maxSizeSeq=maxSizeSeq, 39 | seqNorm=seqNorm) 40 | 41 | _, nSteps, hiddenSize = feature.size() 42 | outName = os.path.basename(os.path.splitext(seqPath)[0]) + f'.{format}' 43 | fname = os.path.join(pathOut, outName) 44 | 45 | if format == 'npz': 46 | time = [startStep + step * stepSize for step in range(nSteps)] 47 | values = feature.squeeze(0).float().cpu().numpy() 48 | totTime = np.array([stepSize * nSteps], dtype=np.float32) 49 | with open(fname, 'wb') as f: 50 | np.savez(f, time=time, features=values, totTime=totTime) 51 | elif format == 'npy': 52 | time = [startStep + step * stepSize for step in range(nSteps)] 53 | values = feature.squeeze(0).float().cpu().numpy() 54 | with open(fname, 'wb') as f: 55 | np.save(f, values) 56 | elif format == 'af': 57 | import arrayfire as af 58 | time = [startStep + step * stepSize for step in range(nSteps)] 59 | values = feature.squeeze(0).float().cpu().numpy() 60 | totTime = np.array([stepSize * nSteps], dtype=np.float32) 61 | af.save_array("time", af.Array(time, dtype=af.Dtype.f32), fname) 62 | af.save_array("totTime", af.interop.from_ndarray(totTime), 63 | fname, append=True) 64 | af.save_array("features", af.interop.from_ndarray(values), 65 | fname, append=True) 66 | else: 67 | with open(fname, 'w') as f: 68 | _, nSteps, hiddenSize = feature.size() 69 | for step in range(nSteps): 70 | line = [startStep + step * stepSize] + \ 71 | feature[0, step, :].tolist() 72 | line = [str(x) for x in line] 73 | linestr = ' '.join(line) + '\n' 74 | f.write(linestr) 75 | bar.finish() 76 | 77 | 78 | if __name__ == "__main__": 79 | 80 | parser = argparse.ArgumentParser('Build features for zerospeech \ 81 | Track1 evaluation') 82 | parser.add_argument('pathDB', help='Path to the reference dataset') 83 | parser.add_argument('pathOut', help='Path to the output features') 84 | parser.add_argument('pathCheckpoint', help='Checkpoint to load') 85 | parser.add_argument('--extension', type=str, default='.wav') 86 | parser.add_argument('--addCriterion', action='store_true') 87 | parser.add_argument('--oneHot', action='store_true') 88 | parser.add_argument('--maxSizeSeq', default=64000, type=int) 89 | parser.add_argument('--train_mode', action='store_true') 90 | parser.add_argument('--format', default='fea', type=str, 91 | choices=['npz', 'fea', 'npy', 'af']) 92 | parser.add_argument('--strict', action='store_true') 93 | parser.add_argument('--dimReduction', type=str, default=None) 94 | parser.add_argument('--centroidLimits', type=int, nargs=2, default=None) 95 | parser.add_argument('--getEncoded', action='store_true') 96 | 
parser.add_argument('--clusters', type=str, default=None) 97 | parser.add_argument('--seqNorm', action='store_true') 98 | 99 | args = parser.parse_args() 100 | 101 | if not os.path.isdir(args.pathOut): 102 | os.mkdir(args.pathOut) 103 | 104 | with open(os.path.join(os.path.dirname(args.pathOut), 105 | f"{os.path.basename(args.pathOut)}.json"), 'w') \ 106 | as file: 107 | json.dump(vars(args), file, indent=2) 108 | 109 | outData = [x[1] for x in 110 | findAllSeqs(args.pathDB, extension=args.extension, 111 | loadCache=False)[0]] 112 | 113 | featureMaker = loadModel([args.pathCheckpoint])[0] 114 | stepSize = featureMaker.gEncoder.DOWNSAMPLING / 16000 115 | print(f"stepSize : {stepSize}") 116 | featureMaker = FeatureModule(featureMaker, args.getEncoded) 117 | featureMaker.collapse = False 118 | 119 | if args.addCriterion: 120 | criterion, nPhones = loadSupervisedCriterion(args.pathCheckpoint) 121 | featureMaker = ModelPhoneCombined(featureMaker, criterion, 122 | nPhones, args.oneHot) 123 | featureMaker = featureMaker.cuda(device=0) 124 | 125 | if not args.train_mode: 126 | featureMaker.eval() 127 | 128 | buildAllFeature(featureMaker, args.pathDB, args.pathOut, outData, 129 | stepSize=stepSize, strict=args.strict, 130 | maxSizeSeq=args.maxSizeSeq, 131 | format=args.format, 132 | seqNorm=args.seqNorm) 133 | -------------------------------------------------------------------------------- /cpc/eval/utils/adjust_sample_rate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import argparse 6 | import torchaudio 7 | import progressbar 8 | import os 9 | import sys 10 | from pathlib import Path 11 | 12 | 13 | def adjust_sample_rate(path_db, file_list, path_db_out, 14 | target_sr): 15 | bar = progressbar.ProgressBar(maxval=len(file_list)) 16 | bar.start() 17 | 18 | for index, item in enumerate(file_list): 19 | path_in = os.path.join(path_db, item) 20 | path_out = os.path.join(path_db_out, item) 21 | 22 | bar.update(index) 23 | data, sr = torchaudio.load(path_in) 24 | transform = torchaudio.transforms.Resample(orig_freq=sr, 25 | new_freq=target_sr, 26 | resampling_method='sinc_interpolation') 27 | data = transform(data) 28 | torchaudio.save(path_out, data, target_sr, 29 | precision=16, channels_first=True) 30 | 31 | bar.finish() 32 | 33 | 34 | def get_names_list(path_tsv_file): 35 | 36 | with open(path_tsv_file, 'r') as file: 37 | data = file.readlines() 38 | 39 | return [x.split()[0] for x in data] 40 | 41 | 42 | def parse_args(argv): 43 | 44 | parser = argparse.ArgumentParser(description='Adjust the sample rate of ' 45 | 'a given group of audio files') 46 | 47 | parser.add_argument('path_db', type=str, 48 | help='Path to the directory containing the audio ' 49 | 'files') 50 | parser.add_argument("path_phone_files", type=str, 51 | help='Path to the .txt file containing the list of ' 52 | 'the files with a phone transcription') 53 | parser.add_argument("path_out", type=str, 54 | help='Path to the output directory') 55 | parser.add_argument("--out_sample_rate", type=int, default=16000, 56 | help="Sample rate of the output audio files " 57 | "(default is 16000)") 58 | parser.add_argument('--file_extension', type=str, default='.mp3') 59 | 60 | return parser.parse_args(argv) 61 | 62 | 63 | def main(argv): 64 | 65 | args = parse_args(argv) 66 | 67 | file_list_db = [f for f in 
os.listdir(args.path_db) 68 | if Path(f).suffix == args.file_extension] 69 | 70 | print(f"Found {len(file_list_db)} in the dataset") 71 | file_list_phone = get_names_list(args.path_phone_files) 72 | print(f"Found {len(file_list_phone)} with a phone transcription") 73 | 74 | file_list_db.sort() 75 | file_list_phone.sort() 76 | out_list = [] 77 | index_phone = 0 78 | for file_name in file_list_db: 79 | while Path(file_name).stem > file_list_phone[index_phone]: 80 | index_phone += 1 81 | if index_phone >= len(file_list_phone): 82 | break 83 | if Path(file_name).stem == file_list_phone[index_phone]: 84 | out_list.append(file_name) 85 | 86 | print(f"Converting {len(out_list)} files") 87 | 88 | Path(args.path_out).mkdir(parents=True, exist_ok=True) 89 | adjust_sample_rate(args.path_db, out_list, 90 | args.path_out, args.out_sample_rate) 91 | 92 | 93 | if __name__ == '__main__': 94 | main(sys.argv[1:]) 95 | -------------------------------------------------------------------------------- /cpc/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torchaudio 8 | 9 | import torch 10 | 11 | ########################################### 12 | # Networks 13 | ########################################### 14 | 15 | 16 | class IDModule(nn.Module): 17 | 18 | def __init__(self, *args, **kwargs): 19 | super(IDModule, self).__init__() 20 | 21 | def forward(self, x): 22 | return x 23 | 24 | 25 | class ChannelNorm(nn.Module): 26 | 27 | def __init__(self, 28 | numFeatures, 29 | epsilon=1e-05, 30 | affine=True): 31 | 32 | super(ChannelNorm, self).__init__() 33 | if affine: 34 | self.weight = nn.parameter.Parameter(torch.Tensor(1, 35 | numFeatures, 1)) 36 | self.bias = nn.parameter.Parameter(torch.Tensor(1, numFeatures, 1)) 37 | else: 38 | self.weight = None 39 | self.bias = None 40 | self.epsilon = epsilon 41 | self.p = 0 42 | self.affine = affine 43 | self.reset_parameters() 44 | 45 | def reset_parameters(self): 46 | if self.affine: 47 | torch.nn.init.ones_(self.weight) 48 | torch.nn.init.zeros_(self.bias) 49 | 50 | def forward(self, x): 51 | 52 | cumMean = x.mean(dim=1, keepdim=True) 53 | cumVar = x.var(dim=1, keepdim=True) 54 | x = (x - cumMean)*torch.rsqrt(cumVar + self.epsilon) 55 | 56 | if self.weight is not None: 57 | x = x * self.weight + self.bias 58 | return x 59 | 60 | 61 | class CPCEncoder(nn.Module): 62 | 63 | def __init__(self, 64 | sizeHidden=512, 65 | normMode="layerNorm"): 66 | 67 | super(CPCEncoder, self).__init__() 68 | 69 | validModes = ["batchNorm", "instanceNorm", "ID", "layerNorm"] 70 | if normMode not in validModes: 71 | raise ValueError(f"Norm mode must be in {validModes}") 72 | 73 | if normMode == "instanceNorm": 74 | def normLayer(x): return nn.InstanceNorm1d(x, affine=True) 75 | elif normMode == "ID": 76 | normLayer = IDModule 77 | elif normMode == "layerNorm": 78 | normLayer = ChannelNorm 79 | else: 80 | normLayer = nn.BatchNorm1d 81 | 82 | self.dimEncoded = sizeHidden 83 | self.conv0 = nn.Conv1d(1, sizeHidden, 10, stride=5, padding=3) 84 | self.batchNorm0 = normLayer(sizeHidden) 85 | self.conv1 = nn.Conv1d(sizeHidden, sizeHidden, 8, stride=4, padding=2) 86 | self.batchNorm1 = normLayer(sizeHidden) 87 | self.conv2 = nn.Conv1d(sizeHidden, sizeHidden, 4, 88 | stride=2, padding=1) 89 | 
self.batchNorm2 = normLayer(sizeHidden) 90 | self.conv3 = nn.Conv1d(sizeHidden, sizeHidden, 4, stride=2, padding=1) 91 | self.batchNorm3 = normLayer(sizeHidden) 92 | self.conv4 = nn.Conv1d(sizeHidden, sizeHidden, 4, stride=2, padding=1) 93 | self.batchNorm4 = normLayer(sizeHidden) 94 | self.DOWNSAMPLING = 160 95 | 96 | def getDimOutput(self): 97 | return self.conv4.out_channels 98 | 99 | def forward(self, x): 100 | x = F.relu(self.batchNorm0(self.conv0(x))) 101 | x = F.relu(self.batchNorm1(self.conv1(x))) 102 | x = F.relu(self.batchNorm2(self.conv2(x))) 103 | x = F.relu(self.batchNorm3(self.conv3(x))) 104 | x = F.relu(self.batchNorm4(self.conv4(x))) 105 | return x 106 | 107 | 108 | class MFCCEncoder(nn.Module): 109 | 110 | def __init__(self, 111 | dimEncoded): 112 | 113 | super(MFCCEncoder, self).__init__() 114 | melkwargs = {"n_mels": max(128, dimEncoded), "n_fft": 321} 115 | self.dimEncoded = dimEncoded 116 | self.MFCC = torchaudio.transforms.MFCC(n_mfcc=dimEncoded, 117 | melkwargs=melkwargs) 118 | 119 | def forward(self, x): 120 | x = x.view(x.size(0), -1) 121 | x = self.MFCC(x) 122 | return x.permute(0, 2, 1) 123 | 124 | 125 | class LFBEnconder(nn.Module): 126 | 127 | def __init__(self, dimEncoded, normalize=True): 128 | 129 | super(LFBEnconder, self).__init__() 130 | self.dimEncoded = dimEncoded 131 | self.conv = nn.Conv1d(1, 2 * dimEncoded, 132 | 400, stride=1) 133 | self.register_buffer('han', torch.hann_window(400).view(1, 1, 400)) 134 | self.instancenorm = nn.InstanceNorm1d(dimEncoded, momentum=1) \ 135 | if normalize else None 136 | 137 | def forward(self, x): 138 | 139 | N, C, L = x.size() 140 | x = self.conv(x) 141 | x = x.view(N, self.dimEncoded, 2, -1) 142 | x = x[:, :, 0, :]**2 + x[:, :, 1, :]**2 143 | x = x.view(N * self.dimEncoded, 1, -1) 144 | x = torch.nn.functional.conv1d(x, self.han, bias=None, 145 | stride=160, padding=350) 146 | x = x.view(N, self.dimEncoded, -1) 147 | x = torch.log(1 + torch.abs(x)) 148 | 149 | # Normalization 150 | if self.instancenorm is not None: 151 | x = self.instancenorm(x) 152 | return x 153 | 154 | 155 | class CPCAR(nn.Module): 156 | 157 | def __init__(self, 158 | dimEncoded, 159 | dimOutput, 160 | keepHidden, 161 | nLevelsGRU, 162 | mode="GRU", 163 | reverse=False): 164 | 165 | super(CPCAR, self).__init__() 166 | self.RESIDUAL_STD = 0.1 167 | 168 | if mode == "LSTM": 169 | self.baseNet = nn.LSTM(dimEncoded, dimOutput, 170 | num_layers=nLevelsGRU, batch_first=True) 171 | elif mode == "RNN": 172 | self.baseNet = nn.RNN(dimEncoded, dimOutput, 173 | num_layers=nLevelsGRU, batch_first=True) 174 | else: 175 | self.baseNet = nn.GRU(dimEncoded, dimOutput, 176 | num_layers=nLevelsGRU, batch_first=True) 177 | 178 | self.hidden = None 179 | self.keepHidden = keepHidden 180 | self.reverse = reverse 181 | 182 | def getDimOutput(self): 183 | return self.baseNet.hidden_size 184 | 185 | def forward(self, x): 186 | 187 | if self.reverse: 188 | x = torch.flip(x, [1]) 189 | try: 190 | self.baseNet.flatten_parameters() 191 | except RuntimeError: 192 | pass 193 | x, h = self.baseNet(x, self.hidden) 194 | if self.keepHidden: 195 | if isinstance(h, tuple): 196 | self.hidden = tuple(x.detach() for x in h) 197 | else: 198 | self.hidden = h.detach() 199 | 200 | # For better modularity, a sequence's order should be preserved 201 | # by each module 202 | if self.reverse: 203 | x = torch.flip(x, [1]) 204 | return x 205 | 206 | 207 | class NoAr(nn.Module): 208 | 209 | def __init__(self, *args): 210 | super(NoAr, self).__init__() 211 | 212 | def forward(self, x): 213 | 
return x 214 | 215 | 216 | class BiDIRARTangled(nn.Module): 217 | r""" 218 | Research: bidirectionnal model for BERT training. 219 | """ 220 | def __init__(self, 221 | dimEncoded, 222 | dimOutput, 223 | nLevelsGRU): 224 | 225 | super(BiDIRARTangled, self).__init__() 226 | assert(dimOutput % 2 == 0) 227 | 228 | self.ARNet = nn.GRU(dimEncoded, dimOutput // 2, 229 | num_layers=nLevelsGRU, batch_first=True, 230 | bidirectional=True) 231 | 232 | def getDimOutput(self): 233 | return self.ARNet.hidden_size * 2 234 | 235 | def forward(self, x): 236 | 237 | self.ARNet.flatten_parameters() 238 | xf, _ = self.ARNet(x) 239 | return xf 240 | 241 | 242 | class BiDIRAR(nn.Module): 243 | r""" 244 | Research: bidirectionnal model for BERT training. 245 | """ 246 | def __init__(self, 247 | dimEncoded, 248 | dimOutput, 249 | nLevelsGRU): 250 | 251 | super(BiDIRAR, self).__init__() 252 | assert(dimOutput % 2 == 0) 253 | 254 | self.netForward = nn.GRU(dimEncoded, dimOutput // 2, 255 | num_layers=nLevelsGRU, batch_first=True) 256 | self.netBackward = nn.GRU(dimEncoded, dimOutput // 2, 257 | num_layers=nLevelsGRU, batch_first=True) 258 | 259 | def getDimOutput(self): 260 | return self.netForward.hidden_size * 2 261 | 262 | def forward(self, x): 263 | 264 | self.netForward.flatten_parameters() 265 | self.netBackward.flatten_parameters() 266 | xf, _ = self.netForward(x) 267 | xb, _ = self.netBackward(torch.flip(x, [1])) 268 | return torch.cat([xf, torch.flip(xb, [1])], dim=2) 269 | 270 | 271 | ########################################### 272 | # Model 273 | ########################################### 274 | 275 | 276 | class CPCModel(nn.Module): 277 | 278 | def __init__(self, 279 | encoder, 280 | AR): 281 | 282 | super(CPCModel, self).__init__() 283 | self.gEncoder = encoder 284 | self.gAR = AR 285 | 286 | def forward(self, batchData, label): 287 | encodedData = self.gEncoder(batchData).permute(0, 2, 1) 288 | cFeature = self.gAR(encodedData) 289 | return cFeature, encodedData, label 290 | 291 | class CPCModelNullspace(nn.Module): 292 | 293 | def __init__(self, 294 | cpc, 295 | nullspace): 296 | 297 | super(CPCModelNullspace, self).__init__() 298 | self.cpc = cpc 299 | self.nullspace = nn.Linear(nullspace.shape[0], nullspace.shape[1], bias=False) 300 | self.nullspace.weight = nn.Parameter(nullspace.T) 301 | self.gEncoder = self.cpc.gEncoder 302 | 303 | 304 | def forward(self, batchData, label): 305 | cFeature, encodedData, label = self.cpc(batchData, label) 306 | cFeature = self.nullspace(cFeature) 307 | encodedData = self.nullspace(encodedData) 308 | return cFeature, encodedData, label 309 | 310 | 311 | class ConcatenatedModel(nn.Module): 312 | 313 | def __init__(self, model_list): 314 | 315 | super(ConcatenatedModel, self).__init__() 316 | self.models = torch.nn.ModuleList(model_list) 317 | 318 | def forward(self, batchData, label): 319 | 320 | outFeatures = [] 321 | outEncoded = [] 322 | for model in self.models: 323 | cFeature, encodedData, label = model(batchData, label) 324 | outFeatures.append(cFeature) 325 | outEncoded.append(encodedData) 326 | return torch.cat(outFeatures, dim=2), \ 327 | torch.cat(outEncoded, dim=2), label 328 | -------------------------------------------------------------------------------- /cpc/stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/stats/__init__.py 
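--------------------------------------------------------------------------------
Usage sketch for cpc/model.py above: CPCModel.forward chains the convolutional encoder and the autoregressive network and passes the label through unchanged. A minimal sketch, not part of the repository, assuming 20480 samples of mono 16 kHz audio (128 encoded frames at DOWNSAMPLING = 160) and an arbitrary context dimension of 256:

import torch
from cpc.model import CPCEncoder, CPCAR, CPCModel

encoder = CPCEncoder(sizeHidden=512, normMode="layerNorm")
ar = CPCAR(dimEncoded=512, dimOutput=256, keepHidden=False, nLevelsGRU=2)
model = CPCModel(encoder, ar)

audio = torch.randn(8, 1, 20480)                  # (batch, channel, samples)
cFeature, encodedData, label = model(audio, None)
print(cFeature.shape)     # torch.Size([8, 128, 256]) - context features from the GRU
print(encodedData.shape)  # torch.Size([8, 128, 512]) - frame-level encoder features

CPCModelNullspace and ConcatenatedModel above expose the same (cFeature, encodedData, label) interface, so the sketch applies to them unchanged.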
-------------------------------------------------------------------------------- /cpc/stats/empty_stat.py: -------------------------------------------------------------------------------- 1 | 2 | # this is a "template" for stat class, plus each stat must extend this 3 | class Stat: 4 | 5 | def computeForBatch(self, batch): 6 | raise Exception("computeForBatch not implemented") 7 | 8 | def mergeStatResults(self, prev, current): 9 | raise Exception("mergeStatResults not implemented") 10 | 11 | def logStat(self, statValue, epochNr): 12 | raise Exception("logStat not implemented") 13 | # should return it's values in dict format, can also log somewhere 14 | # e.g. subclass can take 'where to log' as additional its-state arg 15 | 16 | def getStatName(self): 17 | raise Exception("getStatName not implemented") 18 | # has to differ if want to compute both at a time; 19 | # can be e.g. framewise_ctx_euclid_diff, framewise_ctx_cosine_diff dep. on Stat settings/state -------------------------------------------------------------------------------- /cpc/stats/repr_diff_stat.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import cpc.stats.empty_stat as statTempl 4 | import torch 5 | import math 6 | import os 7 | from copy import deepcopy 8 | import matplotlib.pyplot as plt 9 | 10 | def euclideanDist(vecs1, vecs2): 11 | return torch.sqrt(torch.square(vecs1).sum(1) + torch.square(vecs2).sum(1) - (2*vecs1*vecs2).sum(1)) 12 | 13 | def euclideanDistSq(vecs1, vecs2): 14 | return torch.square(vecs1).sum(1) + torch.square(vecs2).sum(1) - (2*vecs1*vecs2).sum(1) 15 | 16 | def cosineDist(vecs1, vecs2): 17 | cosSim = (vecs1*vecs2).sum(1) / (torch.sqrt(torch.square(vecs1).sum(1)) * torch.sqrt(torch.square(vecs2).sum(1))) 18 | return -cosSim + 1. 
19 | 20 | def cosineCorr(vecs1, vecs2): 21 | cosSim = (vecs1*vecs2).sum(1) / (torch.sqrt(torch.square(vecs1).sum(1)) * torch.sqrt(torch.square(vecs2).sum(1))) 22 | return torch.abs(cosSim) 23 | 24 | class ReprDiffStat(statTempl.Stat): 25 | 26 | def __init__(self, metric, reprType, stepSize, histDir): 27 | super().__init__() 28 | assert metric in ('cosine', 'euclid', 'euclidsq', 'coscorr') 29 | assert reprType in ('conv_repr', 'ctx_repr') 30 | self.metric = metric 31 | self.reprType = reprType 32 | self.stepSize = stepSize 33 | self.histDir = histDir 34 | if not os.path.exists(self.histDir): 35 | os.makedirs(self.histDir) 36 | 37 | @staticmethod 38 | def convertArgsFromStrings(metric, reprType, stepSize, histDir): 39 | return (metric, reprType, float(stepSize), histDir) 40 | 41 | def computeForBatch(self, batch): 42 | reprData = batch[self.reprType] 43 | reprData1 = reprData[:,1:].contiguous().view(-1, reprData.shape[2]) 44 | reprData2 = reprData[:,:-1].contiguous().view(-1, reprData.shape[2]) 45 | if self.metric == 'euclid': 46 | distances = euclideanDist(reprData1, reprData2) 47 | elif self.metric == 'euclidsq': 48 | distances = euclideanDistSq(reprData1, reprData2) 49 | elif self.metric == 'cosine': 50 | distances = cosineDist(reprData1, reprData2) 51 | elif self.metric == 'coscorr': 52 | distances = cosineCorr(reprData1, reprData2) 53 | distances = torch.div(distances, self.stepSize) #, rounding_mode='floor') 54 | occurences = {} 55 | l = 0 56 | for d in distances: 57 | if math.isnan(d): 58 | continue 59 | l += 1 60 | df = math.floor(d) * self.stepSize 61 | if df in occurences: 62 | occurences[df] = occurences[df] + 1 63 | else: 64 | occurences[df] = 1 65 | return { 66 | 'hist': occurences, 67 | 'sum': l 68 | } 69 | 70 | def mergeStatResults(self, prev, current): 71 | merged = {} 72 | merged['sum'] = prev['sum'] + current['sum'] 73 | currentHist = current['hist'] 74 | mergedHist = deepcopy(prev['hist']) 75 | for step in currentHist: 76 | if step in mergedHist: 77 | mergedHist[step] = mergedHist[step] + currentHist[step] 78 | else: 79 | mergedHist[step] = currentHist[step] 80 | merged['hist'] = mergedHist 81 | return merged 82 | 83 | def logStat(self, statValue, epochNr): 84 | histValues = statValue['hist'] 85 | histKeys = sorted(list(histValues.keys())) 86 | histHeights = [histValues[k] for k in histKeys] 87 | plt.figure() 88 | plt.bar(histKeys, histHeights, width=self.stepSize) 89 | plt.savefig(os.path.join(self.histDir, self.getStatName() + "_" + str(epochNr) + ".png")) 90 | return { 91 | 'mean': sum([a*b for a,b in zip (histKeys, histHeights)]) / sum(histHeights) 92 | } 93 | 94 | def getStatName(self): 95 | return "reprDiff_" + self.reprType + "_" + self.metric + "_by" + str(self.stepSize) -------------------------------------------------------------------------------- /cpc/stats/stat_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import cpc.stats.stats_collector as sc 3 | import cpc.stats.repr_diff_stat as repr_diff 4 | 5 | # --valSetStats stat1:a,b,c_stat2 6 | # --captureSetStats stat1:_stat2:p1,p2_stat3:p1 7 | 8 | def getStatFromSpec(spec): 9 | specSplit = spec.split(":") 10 | statName, statArgs = specSplit[0], specSplit[1] 11 | statArgs = statArgs.split(",") 12 | assert statName in ("reprDiff",) 13 | if statName == "reprDiff": 14 | statArgs = repr_diff.ReprDiffStat.convertArgsFromStrings(*statArgs) 15 | return repr_diff.ReprDiffStat(*statArgs) 16 | 17 | def constructStatCollectorFromSpecs(specs): 18 | specList = specs.split('_')
19 | collector = sc.StatsCollector() 20 | for spec in specList: 21 | collector.registerStat(getStatFromSpec(spec)) 22 | return collector 23 | -------------------------------------------------------------------------------- /cpc/stats/stats_collector.py: -------------------------------------------------------------------------------- 1 | 2 | import cpc.stats.empty_stat as statTempl 3 | 4 | class StatsCollector: 5 | 6 | def __init__(self): 7 | self.stats = [] 8 | self.statValues = [] 9 | self.zeroed = True 10 | self.statNames = set() 11 | 12 | def registerStat(self, stat): 13 | assert issubclass(type(stat), statTempl.Stat) 14 | assert stat.getStatName not in self.statNames 15 | self.statNames.add(stat.getStatName) 16 | self.stats.append(stat) 17 | 18 | def zeroStats(self): 19 | self.zeroed = True 20 | 21 | def batchUpdate(self, batch): 22 | if self.zeroed: 23 | self.statValues = [stat.computeForBatch(batch) for stat in self.stats] 24 | self.zeroed = False 25 | else: 26 | oldValues = self.statValues 27 | newValues = [stat.computeForBatch(batch) for stat in self.stats] 28 | self.statValues = [stat.mergeStatResults(prev, current) for stat, (prev, current) 29 | in zip (self.stats, zip(oldValues, newValues))] 30 | 31 | def dataLoaderUpdate(self, dataLoader): 32 | for batch in dataLoader: 33 | self.batchUpdate(batch) 34 | 35 | def logStats(self, epochNr): 36 | statLogs = {} 37 | for stat, statValue in zip(self.stats, self.statValues): 38 | statLogs.update({ stat.getStatName() + "_" + k: v for k, v in stat.logStat(statValue, epochNr).items()}) 39 | return statLogs 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /cpc/test_data/seq_list.txt: -------------------------------------------------------------------------------- 1 | 6476-57446-0019 2 | 5678-43303-0032 3 | 5678-43303-0024 4 | 5678-43301-0021 5 | 5393-19218-0024 6 | 4397-15668-0007 7 | 4397-15668-0003 8 | -------------------------------------------------------------------------------- /cpc/test_data/test_db/2911/12359/2911-12359-0007.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/2911/12359/2911-12359-0007.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/4051/11218/4051-11218-0044.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/4051/11218/4051-11218-0044.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/4397/15668/4397-15668-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/4397/15668/4397-15668-0003.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/4397/15668/4397-15668-0007.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/4397/15668/4397-15668-0007.flac -------------------------------------------------------------------------------- 
/cpc/test_data/test_db/5393/19218/5393-19218-0024.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/5393/19218/5393-19218-0024.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/5678/43301/5678-43301-0021.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/5678/43301/5678-43301-0021.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/5678/43303/5678-43303-0024.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/5678/43303/5678-43303-0024.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/5678/43303/5678-43303-0032.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/5678/43303/5678-43303-0032.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/6476/57446/6476-57446-0019.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/6476/57446/6476-57446-0019.flac -------------------------------------------------------------------------------- /cpc/transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch 6 | import torch.nn as nn 7 | import math 8 | 9 | 10 | class ScaledDotProductAttention(nn.Module): 11 | def __init__(self, 12 | sizeSeq, # Size of the input sequence 13 | dk, # Dimension of the input sequence 14 | dropout, # Dropout parameter 15 | relpos=False): # Do we retrieve positional information ? 16 | super(ScaledDotProductAttention, self).__init__() 17 | 18 | self.drop = nn.Dropout(dropout) 19 | self.softmax = nn.Softmax(dim=2) 20 | self.relpos = relpos 21 | self.sizeSeq = sizeSeq 22 | 23 | if relpos: 24 | self.Krelpos = nn.Parameter(torch.Tensor(dk, sizeSeq)) 25 | self.initmat_(self.Krelpos) 26 | self.register_buffer('z', torch.zeros(1, sizeSeq, 1)) 27 | 28 | # A mask is set so that a node never queries data in the future 29 | mask = torch.tril(torch.ones(sizeSeq, sizeSeq), diagonal=0) 30 | mask = 1 - mask 31 | mask[mask == 1] = -float('inf') 32 | self.register_buffer('mask', mask.unsqueeze(0)) 33 | 34 | def initmat_(self, mat, dim=0): 35 | stdv = 1. 
/ math.sqrt(mat.size(dim)) 36 | mat.data.uniform_(-stdv, stdv) 37 | 38 | def forward(self, Q, K, V): 39 | # Input dim : N x sizeSeq x dk 40 | QK = torch.bmm(Q, K.transpose(-2, -1)) 41 | 42 | if self.relpos: 43 | bsz = Q.size(0) 44 | QP = Q.matmul(self.Krelpos) 45 | # This trick with z fills QP's diagonal with zeros 46 | QP = torch.cat((self.z.expand(bsz, -1, -1), QP), 2) 47 | QK += QP.view(bsz, self.sizeSeq + 1, self.sizeSeq)[:, 1:, :] 48 | A = self.softmax(QK / math.sqrt(K.size(-1)) + self.mask) 49 | return torch.bmm(self.drop(A), V) 50 | 51 | 52 | class MultiHeadAttention(nn.Module): 53 | def __init__(self, 54 | sizeSeq, # Size of a sequence 55 | dropout, # Dropout parameter 56 | dmodel, # Model's dimension 57 | nheads, # Number of heads in the model 58 | abspos): # Is positional information encoded in the input ? 59 | super(MultiHeadAttention, self).__init__() 60 | self.Wo = nn.Linear(dmodel, dmodel, bias=False) 61 | self.Wk = nn.Linear(dmodel, dmodel, bias=False) 62 | self.Wq = nn.Linear(dmodel, dmodel, bias=False) 63 | self.Wv = nn.Linear(dmodel, dmodel, bias=False) 64 | self.nheads = nheads 65 | self.dk = dmodel // nheads 66 | self.Att = ScaledDotProductAttention(sizeSeq, self.dk, 67 | dropout, not abspos) 68 | 69 | def trans_(self, x): 70 | bsz, bptt, h, dk = x.size(0), x.size(1), self.nheads, self.dk 71 | return x.view(bsz, bptt, h, dk).transpose(1, 2).contiguous().view(bsz * h, bptt, dk) 72 | 73 | def reverse_trans_(self, x): 74 | bsz, bptt, h, dk = x.size( 75 | 0) // self.nheads, x.size(1), self.nheads, self.dk 76 | return x.view(bsz, h, bptt, dk).transpose(1, 2).contiguous().view(bsz, bptt, h * dk) 77 | 78 | def forward(self, Q, K, V): 79 | q = self.trans_(self.Wq(Q)) 80 | k = self.trans_(self.Wk(K)) 81 | v = self.trans_(self.Wv(V)) 82 | y = self.reverse_trans_(self.Att(q, k, v)) 83 | return self.Wo(y) 84 | 85 | 86 | class FFNetwork(nn.Module): 87 | def __init__(self, din, dout, dff, dropout): 88 | super(FFNetwork, self).__init__() 89 | self.lin1 = nn.Linear(din, dff, bias=True) 90 | self.lin2 = nn.Linear(dff, dout, bias=True) 91 | self.relu = nn.ReLU() 92 | self.drop = nn.Dropout(dropout) 93 | 94 | def forward(self, x): 95 | return self.lin2(self.drop(self.relu(self.lin1(x)))) 96 | 97 | 98 | class TransformerLayer(nn.Module): 99 | def __init__(self, sizeSeq=32, dmodel=512, dff=2048, 100 | dropout=0.1, nheads=8, 101 | abspos=False): 102 | super(TransformerLayer, self).__init__() 103 | self.multihead = MultiHeadAttention(sizeSeq, dropout, 104 | dmodel, nheads, abspos) 105 | self.ln_multihead = nn.LayerNorm(dmodel) 106 | self.ffnetwork = FFNetwork(dmodel, dmodel, dff, dropout) 107 | self.ln_ffnetwork = nn.LayerNorm(dmodel) 108 | 109 | def forward(self, x): 110 | y = self.ln_multihead(x + self.multihead(Q=x, K=x, V=x)) 111 | return self.ln_ffnetwork(y + self.ffnetwork(y)) 112 | 113 | 114 | class StaticPositionEmbedding(nn.Module): 115 | def __init__(self, seqlen, dmodel): 116 | super(StaticPositionEmbedding, self).__init__() 117 | pos = torch.arange(0., seqlen).unsqueeze(1).repeat(1, dmodel) 118 | dim = torch.arange(0., dmodel).unsqueeze(0).repeat(seqlen, 1) 119 | div = torch.exp(- math.log(10000) * (2*(dim//2)/dmodel)) 120 | pos *= div 121 | pos[:, 0::2] = torch.sin(pos[:, 0::2]) 122 | pos[:, 1::2] = torch.cos(pos[:, 1::2]) 123 | self.register_buffer('pe', pos.unsqueeze(0)) 124 | 125 | def forward(self, x): 126 | return x + self.pe[:, :x.size(1), :] 127 | 128 | 129 | def buildTransformerAR(dimEncoded, # Output dimension of the encoder 130 | nLayers, # Number of transformer layers 
131 | sizeSeq, # Expected size of the input sequence 132 | abspos): 133 | layerSequence = [] 134 | if abspos: 135 | layerSequence += [StaticPositionEmbedding(sizeSeq, dimEncoded)] 136 | layerSequence += [TransformerLayer(sizeSeq=sizeSeq, 137 | dmodel=dimEncoded, abspos=abspos) 138 | for i in range(nLayers)] 139 | return nn.Sequential(*layerSequence) 140 | -------------------------------------------------------------------------------- /cpc/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /cpc/utils/capture_loader.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import re 4 | import torch 5 | 6 | # reads from torch files 7 | class CaptureLoader: 8 | 9 | def __init__(self, rootDir, onlyReadThose=None): 10 | self.rootDir = rootDir 11 | self.onlyReadThose = onlyReadThose 12 | 13 | self.prepare() 14 | 15 | def prepare(self): 16 | self.batchData = {} 17 | for p,sd,f in sorted(os.walk(self.rootDir)): 18 | for name in sorted(f): 19 | #print(p,sd,f,name) 20 | capturedThing = '_'.join(name.split('_')[:-1]) 21 | if self.onlyReadThose and capturedThing not in self.onlyReadThose: 22 | continue 23 | batchDescr = name.split('_')[-1].split('.')[0] 24 | batchNums = list(map(int, re.findall(r'\d+', batchDescr))) 25 | batchBegin, batchEnd = batchNums[0], batchNums[1] 26 | if (batchBegin, batchEnd) in self.batchData: 27 | self.batchData[(batchBegin, batchEnd)][capturedThing] = os.path.join(p, name) 28 | else: 29 | self.batchData[(batchBegin, batchEnd)] = {capturedThing: os.path.join(p, name)} 30 | #tensor = torch.load(os.path.join(p, name)) 31 | self.batchesNamesInOrder = sorted(self.batchData.keys()) 32 | 33 | def __len__(self): 34 | return len(self.batchesNamesInOrder) 35 | 36 | def __getitem__(self, idx): 37 | paths = self.batchData[self.batchesNamesInOrder[idx]] 38 | return {whatCaptured: torch.load(tensorPath) for whatCaptured, tensorPath in paths.items()} 39 | 40 | def __iter__(self): 41 | for i in range(len(self)): 42 | yield self[i] 43 | 44 | 45 | if __name__ == '__main__': 46 | 47 | cl = CaptureLoader("/pio/scratch/1/i283340/MGR/zs/capture/try20/8") 48 | 49 | for data in cl: 50 | print(data.keys(), [t.shape for t in data.values()]) 51 | 52 | cl2 = CaptureLoader("/pio/scratch/1/i283340/MGR/zs/capture/try20/8", ('ctx', 'cpcctc_align', 'phone_align')) 53 | 54 | for data in cl2: 55 | print(data.keys(), [t.shape for t in data.values()]) -------------------------------------------------------------------------------- /cpc/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
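# The helpers below compute squared Euclidean distances through the expansion
# ||v - c||^2 = ||v||^2 + ||c||^2 - 2 * (v . c), broadcast over a B x N batch of vectors
# and k centroids; pushToClosestForBatch then moves each vector a fraction `deg` of the way
# towards its nearest centroid (the "center push" operation used by the centerpush scripts).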
5 | import json 6 | import numpy as np 7 | import random 8 | import torch 9 | import sys 10 | import psutil 11 | from copy import deepcopy 12 | from bisect import bisect_left 13 | 14 | def seDistancesToCentroids(vecs, centroids, doNorm=False): 15 | 16 | if len(vecs.shape) == 2: 17 | vecs = vecs.view(1, *(vecs.shape)) 18 | 19 | B = vecs.shape[0] 20 | N = vecs.shape[1] 21 | k = centroids.shape[0] 22 | 23 | # vecs: B x L x Dim 24 | # centroids: k x Dim 25 | 26 | if doNorm: 27 | vecLengths = torch.sqrt((vecs*vecs).sum(-1)) 28 | vecs = vecs / vecLengths.view(B, N, 1) 29 | centrLengths = torch.sqrt((centroids*centroids).sum(-1)) 30 | centroids = centroids / centrLengths.view(k, 1) 31 | 32 | return torch.square(centroids).sum(1).view(1, 1, -1) + torch.square(vecs).sum(-1).view(B, N, 1) \ 33 | - 2*(vecs.view(B, N, 1, -1) * centroids.view(1, 1, k, -1)).sum(-1) #torch.matmul(vecs, centroids.T) 34 | 35 | 36 | def pushToClosestForBatch(points, centers, deg=0.5, doNorm=False, doNormForPush=False): 37 | 38 | B = points.shape[0] 39 | N = points.shape[1] 40 | k = centers.shape[0] 41 | 42 | if doNormForPush: 43 | pointsLengths = torch.sqrt((points*points).sum(-1)) 44 | points = points / pointsLengths.view(B, N, 1) 45 | centrLengths = torch.sqrt((centers*centers).sum(-1)) 46 | centers = centers / centrLengths.view(k, 1) 47 | 48 | distsSq = seDistancesToCentroids(points, centers, doNorm=doNorm) 49 | dists = torch.sqrt(distsSq) 50 | 51 | closest = dists.argmin(-1) 52 | diffs = centers[closest].view(B, N, -1) - points 53 | res = deg * diffs + points 54 | 55 | return res 56 | 57 | 58 | def untensor(d): 59 | if isinstance(d, list): 60 | return [untensor(v) for v in d] 61 | if isinstance(d, dict): 62 | return dict((k, untensor(v)) for k, v in d.items()) 63 | if hasattr(d, 'tolist'): 64 | return d.tolist() 65 | return d 66 | 67 | 68 | def save_logs(data, pathLogs): 69 | with open(pathLogs, 'w') as file: 70 | json.dump(data, file, indent=2) 71 | 72 | 73 | def update_logs(logs, logStep, prevlogs=None): 74 | out = {} 75 | for key in logs: 76 | out[key] = deepcopy(logs[key]) 77 | 78 | if prevlogs is not None: 79 | out[key] -= prevlogs[key] 80 | out[key] /= logStep 81 | return out 82 | 83 | 84 | def show_logs(text, logs): 85 | print("") 86 | print('-'*50) 87 | print(text) 88 | 89 | for key in logs: 90 | 91 | if key == "iter": 92 | continue 93 | 94 | nPredicts = logs[key].shape[0] 95 | 96 | strSteps = ['Step'] + [str(s) for s in range(1, nPredicts + 1)] 97 | formatCommand = ' '.join(['{:>16}' for x in range(nPredicts + 1)]) 98 | print(formatCommand.format(*strSteps)) 99 | 100 | strLog = [key] + ["{:10.6f}".format(s) for s in logs[key]] 101 | print(formatCommand.format(*strLog)) 102 | 103 | print('-'*50) 104 | 105 | 106 | def set_seed(seed): 107 | random.seed(seed) 108 | torch.manual_seed(seed) 109 | np.random.seed(seed) 110 | if torch.cuda.is_available(): 111 | torch.cuda.manual_seed_all(seed) 112 | 113 | 114 | def cpu_stats(): 115 | print(sys.version) 116 | print(psutil.cpu_percent()) 117 | print(psutil.virtual_memory()) 118 | 119 | 120 | def ramp_scheduling_function(n_epoch_ramp, epoch): 121 | if epoch >= n_epoch_ramp: 122 | return 1 123 | else: 124 | return (epoch + 1) / n_epoch_ramp 125 | 126 | 127 | class SchedulerCombiner: 128 | r""" 129 | An object which applies a list of learning rate schedulers sequentially. 
130 | """ 131 | 132 | def __init__(self, scheduler_list, activation_step, curr_step=0): 133 | r""" 134 | Args: 135 | - scheduler_list (list): a list of learning rate schedulers 136 | - activation_step (list): a list of int. activation_step[i] 137 | indicates at which step scheduler_list[i] should be activated 138 | - curr_step (int): the starting step. Must be lower than 139 | activation_step[0] 140 | """ 141 | 142 | if len(scheduler_list) != len(activation_step): 143 | raise ValueError("The number of scheduler must be the same as " 144 | "the number of activation step") 145 | if activation_step[0] > curr_step: 146 | raise ValueError("The first activation step cannot be higher than " 147 | "the current step.") 148 | self.scheduler_list = scheduler_list 149 | self.activation_step = deepcopy(activation_step) 150 | self.curr_step = curr_step 151 | 152 | def step(self): 153 | self.curr_step += 1 154 | index = bisect_left(self.activation_step, self.curr_step) - 1 155 | for i in reversed(range(index, len(self.scheduler_list))): 156 | self.scheduler_list[i].step() 157 | 158 | def __str__(self): 159 | out = "SchedulerCombiner \n" 160 | out += "(\n" 161 | for index, scheduler in enumerate(self.scheduler_list): 162 | out += f"({index}) {scheduler.__str__()} \n" 163 | out += ")\n" 164 | return out 165 | -------------------------------------------------------------------------------- /cpc/utils/unit_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import unittest 6 | import torch 7 | import os 8 | from nose.tools import eq_, ok_ 9 | 10 | from .misc import SchedulerCombiner, ramp_scheduling_function 11 | 12 | 13 | class TestCombineSchedulers(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.baseLR = 1 17 | self.module = torch.nn.Linear(1, 1) 18 | self.optimizer = torch.optim.SGD( 19 | list(self.module.parameters()), lr=self.baseLR) 20 | 21 | def testCombineRamp(self): 22 | scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, 23 | lr_lambda=lambda epoch: ramp_scheduling_function( 24 | 3, epoch)) 25 | self.optimizer.step() 26 | eq_(self.optimizer.param_groups[0]['lr'], self.baseLR / 3) 27 | scheduler.step() 28 | eq_(self.optimizer.param_groups[0]['lr'], 2 * self.baseLR / 3) 29 | scheduler.step() 30 | eq_(self.optimizer.param_groups[0]['lr'], 1) 31 | 32 | for i in range(12): 33 | scheduler.step() 34 | eq_(self.optimizer.param_groups[0]['lr'], 1) 35 | 36 | def testCombineRampStep(self): 37 | scheduler_step = torch.optim.lr_scheduler.StepLR( 38 | self.optimizer, 6, gamma=0.5) 39 | scheduler_ramp = torch.optim.lr_scheduler.LambdaLR(self.optimizer, 40 | lr_lambda=lambda epoch: ramp_scheduling_function( 41 | 3, epoch)) 42 | 43 | scheduler = SchedulerCombiner([scheduler_ramp, scheduler_step], [0, 3]) 44 | self.optimizer.step() 45 | # Epoch 0 46 | eq_(self.optimizer.param_groups[0]['lr'], self.baseLR / 3) 47 | scheduler.step() 48 | # Epoch 1 49 | eq_(self.optimizer.param_groups[0]['lr'], 2 * self.baseLR / 3) 50 | scheduler.step() 51 | # Epoch 2 52 | eq_(self.optimizer.param_groups[0]['lr'], 1) 53 | scheduler.step() 54 | 55 | # Epoch 3, 4, 5 56 | for i in range(3): 57 | eq_(self.optimizer.param_groups[0]['lr'], 1) 58 | scheduler.step() 59 | 60 | # Epoch 6 61 | eq_(self.optimizer.param_groups[0]['lr'], 0.5) 62 | 
-------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: cpc37 2 | channels: 3 | - pytorch 4 | - anaconda 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - pytorch 9 | - torchvision 10 | - cudatoolkit=9.2 11 | - psutil 12 | - pip 13 | - openblas-devel 14 | - tqdm 15 | - nose 16 | - cython 17 | - pysoundfile 18 | - pip: 19 | - progressbar2 20 | - matplotlib 21 | - torchaudio 22 | -------------------------------------------------------------------------------- /experiments/train_pro_1gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for the Prometheus slurm cluster 4 | 5 | set -x 6 | 7 | RVERB="" # =-v 8 | 9 | REMOTE_USER=plgjch 10 | REMOTE_HOST=pro.cyfronet.pl 11 | 12 | # location of the main repository (contains data/) 13 | CPC_DIR=/pio/scratch/2/jch/wav2vec/CPC_audio #"$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 14 | REMOTE_CPC_DIR=/net/people/plgjch/scratch/CPC_audio 15 | REMOTE_MINICONDA_DIR=/net/archive/groups/plggneurony/os/miniconda3 16 | 17 | # top-level directory for experiments 18 | REMOTE_EXPERIMENT_RUNDIR=/net/scratch/people/plgjch/cpc/ 19 | 20 | # adjust the main loop 21 | # (it can go over .yaml files, over hyperparameters, etc. 22 | for DUMMY in aa \ 23 | ; do 24 | 25 | # low-level directory for experiments 26 | EXP_TAG=remote_pro 27 | NAME=baseline_1gpu 28 | DIR=$EXP_TAG/$NAME 29 | EXP_DIR=$REMOTE_EXPERIMENT_RUNDIR/$DIR 30 | 31 | ssh -q $REMOTE_USER@$REMOTE_HOST mkdir -p $EXP_DIR 32 | 33 | TMP_DIR=`mktemp -d` 34 | mkdir $TMP_DIR/code 35 | # symlink the data from the main dir 36 | 37 | cat > $TMP_DIR/exp_train.sh < $TMP_DIR/exp_train.sh < $TMP_DIR/exp_train.sh <&1 | tee -ai $EXP_DIR/lineval_\${CP}/out.txt 131 | EOF 132 | 133 | # Transmit the startup script 134 | rsync $RVERB -lrpt -e "ssh -q" $TMP_DIR/ $REMOTE_USER@$REMOTE_HOST:$EXP_DIR/ 135 | 136 | # Transmit the rest 137 | rsync --exclude '.*' \ 138 | --exclude data \ 139 | --exclude pretrained_models \ 140 | --exclude '__pycache__' \ 141 | --exclude '*runs*' \ 142 | --exclude '*.pyc' \ 143 | --exclude '*.ipynb' \ 144 | --filter=':- .gitignore' \ 145 | $RVERB -lrpt -e "ssh -q" $CPC_DIR/ $REMOTE_USER@$REMOTE_HOST:$EXP_DIR/code/ 146 | 147 | ssh -q $REMOTE_USER@$REMOTE_HOST sbatch \ 148 | `#--gres="" --time=00:10:00 -p plgrid-testing` \ 149 | $EXP_DIR/exp_train.sh 150 | 151 | rm -Rf $TMP_DIR 152 | 153 | done 154 | 155 | echo "Queue status" 156 | ssh -q $REMOTE_USER@$REMOTE_HOST squeue 157 | -------------------------------------------------------------------------------- /finetune_nullspace.sh: -------------------------------------------------------------------------------- 1 | SAVE_DIR="/pio/scratch/1/i273233/linear_separability/cpc/gru_level2/cpc_official" 2 | SPEAKERS="speakers_factorized" 3 | PHONEMES="phonemes_nullspace" 4 | SPEAKERS_NULLSPACE="speakers_nullspace" 5 | 6 | DIM_INTER=$1 7 | FROM_STEP=$SPEAKERS 8 | if [[ $# -ge 2 ]]; then 9 | FROM_STEP=$2 10 | fi 11 | 12 | case $FROM_STEP in 13 | $SPEAKERS) 14 | echo $SPEAKERS 15 | mkdir -p ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER} --mode $SPEAKERS --max_size_loaded 40000000 
--n_process_loader 2 --model cpc --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/log.txt 16 | ;& 17 | $PHONEMES) 18 | echo $PHONEMES 19 | mkdir -p ${SAVE_DIR}_${PHONEMES}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${PHONEMES}_${DIM_INTER} --mode $PHONEMES --max_size_loaded 40000000 --n_process_loader 2 --model cpc --pathPhone $zd/LibriSpeech/alignments2/converted_aligned_phones.txt --path_speakers_factorized ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/checkpoint_9.pt --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${PHONEMES}_${DIM_INTER}/log.txt 20 | ;& 21 | $SPEAKERS_NULLSPACE) 22 | echo $SPEAKERS_NULLSPACE 23 | mkdir -p ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER} --mode $SPEAKERS_NULLSPACE --max_size_loaded 40000000 --n_process_loader 2 --model cpc --path_speakers_factorized ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/checkpoint_9.pt --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER}/log.txt 24 | ;; 25 | *) 26 | echo "Invalid from step: ${FROM_STEP} while it should be either ${SPEAKERS}, ${PHONEMES} or ${SPEAKERS_NULLSPACE}" 27 | ;; 28 | esac 29 | 30 | exit 0 -------------------------------------------------------------------------------- /hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
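# Usage sketch (illustrative; the repository path passed to torch.hub.load is an assumption --
# any GitHub repo or local checkout containing this hubconf.py works):
#   model = torch.hub.load('facebookresearch/CPC_audio', 'CPC_audio', pretrained=True)
# returns the CPC model built by the CPC_audio entry point below, loading the
# libri-light 60k checkpoint when pretrained=True.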
5 | import argparse 6 | import torch 7 | from cpc.model import CPCModel as cpcmodel 8 | from cpc.cpc_default_config import get_default_cpc_config 9 | from cpc.feature_loader import getEncoder, getAR, loadArgs 10 | dependencies = ['torch', 'torchaudio'] 11 | 12 | 13 | def CPC_audio(pretrained=False, 14 | **kwargs): 15 | """ 16 | Contrast predictive learning model for audio data 17 | pretrained: if True, load a model trained on libri-light 60k 18 | (https://arxiv.org/abs/1912.07875) 19 | **kwargs : see cpc/cpc_default_config to get the list of possible arguments 20 | """ 21 | locArgs = get_default_cpc_config() 22 | if pretrained: 23 | checkpoint_url = 'https://dl.fbaipublicfiles.com/librilight/CPC_checkpoints/60k_epoch4-d0f474de.pt' 24 | checkpoint = torch.hub.load_state_dict_from_url(checkpoint_url, 25 | progress=False) 26 | loadArgs(locArgs, argparse.Namespace(**checkpoint["config"])) 27 | else: 28 | args = argparse.Namespace(**kwargs) 29 | loadArgs(locArgs, args) 30 | encoderNet = getEncoder(locArgs) 31 | arNet = getAR(locArgs) 32 | model = cpcmodel(encoderNet, arNet) 33 | if pretrained: 34 | model.load_state_dict(checkpoint["weights"], strict=False) 35 | return model 36 | -------------------------------------------------------------------------------- /jch_experiments: -------------------------------------------------------------------------------- 1 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl/ --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .flac --n_process_loader 8 --max_size_loaded 400000000 --batchSizeGPU 64 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 12 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl.log 2 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl_16/ --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .flac --n_process_loader 8 --max_size_loaded 400000000 --batchSizeGPU 64 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 16 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl_16.log 3 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl_16_12/ --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .flac --n_process_loader 8 --max_size_loaded 400000000 --batchSizeGPU 48 --nPredicts 12 --CPCCTC --CPCCTCNumMatched 16 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl_16_12.log 4 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_largebatch/ --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .flac --n_process_loader 8 --max_size_loaded 400000000 --batchSizeGPU 
64 --nPredicts 12 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_largebatch.log 5 | 6 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_likealan --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .wav --normMode layerNorm --dropout --rnnMode transformer --n_process_loader 1 --max_size_loaded 4000000000 --batchSizeGPU 32 --limitNegsInBatch 8 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_likealan_stdout.txt 7 | 8 | # should be?? 9 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_likealan --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .wav --normMode layerNorm --dropout --rnnMode transformer --n_process_loader 1 --max_size_loaded 4000000000 --nLevelsGRU 2 --batchSizeGPU 32 --limitNegsInBatch 8 --schedulerRamp 10 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_likealan_stdout.txt 10 | -------------------------------------------------------------------------------- /lineval_ls100.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | RVERB="-v --dry-run" 7 | RVERB="" 8 | CPC_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 9 | SAVE_DIR="$( 10 | python - "$@" << END 11 | if 1: 12 | import argparse 13 | import os.path 14 | parser = argparse.ArgumentParser(description='Process some integers.') 15 | parser.add_argument('load', type=str, 16 | help="Path to the checkpoint to evaluate.") 17 | parser.add_argument('--pathCheckpoint') 18 | parser.add_argument('--CTC', action='store_true') 19 | args, _ = parser.parse_known_args() 20 | checkpoint_dir = os.path.dirname(args.load) 21 | checkpoint_no = args.load.split('_')[-1][:-3] 22 | eval_ctc = "" 23 | if args.CTC: 24 | eval_ctc = "_ctc" 25 | print(f"{checkpoint_dir}/lineval{eval_ctc}_{checkpoint_no}") 26 | END 27 | )" 28 | 29 | mkdir -p ${SAVE_DIR}/code 30 | rsync --exclude '.*' \ 31 | --exclude data \ 32 | --exclude pretrained_models \ 33 | --exclude '__pycache__' \ 34 | --exclude '*runs*' \ 35 | --exclude '*.pyc' \ 36 | --exclude '*.ipynb' \ 37 | --filter=':- .gitignore' \ 38 | $RVERB -lrpt $CPC_DIR/ ${SAVE_DIR}/code/ 39 | 40 | echo $0 "$@" >> ${SAVE_DIR}/out.txt 41 | exec python -u cpc/eval/linear_separability.py \ 42 | /pio/data/zerospeech2021/LibriSpeech-wav/train-clean-100 \ 43 | /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt \ 44 | /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt \ 45 | "$@" \ 46 | --pathPhone /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/converted_aligned_phones.txt \ 47 | --file_extension .wav \ 48 | --pathCheckpoint $SAVE_DIR \ 49 | 2>&1 | tee -ai ${SAVE_DIR}/out.txt 50 | -------------------------------------------------------------------------------- /run_clustering.sh: -------------------------------------------------------------------------------- 1 | NULLSPACE_SIZE=$1 2 | BATCH_SIZE_GPU=$2 3 | MAX_ITER=$3 4 | 5 | python cpc/criterion/clustering/clustering_script.py --pathDB $zd/LibriSpeech/train-clean-100/ --recursionLevel 1 --nClusters 
50 --MAX_ITER $MAX_ITER --level_gru 2 --save --load --batchSizeGPU $BATCH_SIZE_GPU --max_size_loaded 40000000 --n_process_loader 2 --nullspace ../linear_separability/cpc/gru_level2/cpc_official_phonemes_nullspace_$NULLSPACE_SIZE/checkpoint_9.pt checkpoints/clustering_CPC_big_kmeans50_nullspace_$NULLSPACE_SIZE/clustering_CPC_big_kmeans50_nullspace_$NULLSPACE_SIZE.pt 6 | for directory in dev-clean dev-other test-clean test-other train-clean-100 train-full-960 7 | do 8 | python ./scripts/quantize_audio.py $cpc/checkpoints/clustering_CPC_big_kmeans50_nullspace_$NULLSPACE_SIZE/clustering_CPC_big_kmeans50_nullspace_$NULLSPACE_SIZE.pt $zd/LibriSpeech/$directory/ /pio/gluster/i273233/quantized/nullspace_$NULLSPACE_SIZE/LibriSpeech/$directory --file_extension flac --nobatch --nullspace 9 | done -------------------------------------------------------------------------------- /scripts/build_1hot_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from time import time 8 | import numpy as np 9 | 10 | from utils.utils_functions import writeArgs 11 | 12 | def parseArgs(argv): 13 | # Run parameters 14 | parser = argparse.ArgumentParser(description='Export 1-hot features from quantized units of audio files.') 15 | parser.add_argument('pathQuantizedUnits', type=str, 16 | help='Path to the quantized units. Each line of the input file must be' 17 | 'of the form file_name[tab]pseudo_units (ex. hat 1,1,2,3,4,4)') 18 | parser.add_argument('pathOutputDir', type=str, 19 | help='Path to the output directory.') 20 | parser.add_argument('--n_units', type=int, default=50, 21 | help='Number of discrete units (default: 50). If a dictionary is given,' 22 | 'this is automatically set as vocab size.') 23 | parser.add_argument('--dict', type=str, 24 | help='Path to the dictionary file containing vocab of the pseudo units on the dataset' 25 | '(this is required if the quantized units are not digits, i.e. 
multi-group case).') 26 | parser.add_argument('--debug', action='store_true', 27 | help="Load only a very small amount of files for " 28 | "debugging purposes.") 29 | return parser.parse_args(argv) 30 | 31 | def main(argv): 32 | # Args parser 33 | args = parseArgs(argv) 34 | 35 | print("=============================================================") 36 | print(f"Building 1-hot features from {args.pathQuantizedUnits}") 37 | print("=============================================================") 38 | 39 | # Load input file 40 | print("") 41 | print(f"Reading input file from {args.pathQuantizedUnits}") 42 | seqNames = [] 43 | seqInputs = [] 44 | with open(args.pathQuantizedUnits, 'r') as f: 45 | for line in f: 46 | file_name, file_seq = line.strip().split("\t") 47 | # Convert sequence to the desired input form 48 | file_seq = file_seq.replace(",", " ") 49 | # Add to lists 50 | seqNames.append(file_name) 51 | seqInputs.append(file_seq) 52 | print(f"Found {len(seqNames)} sequences!") 53 | 54 | # Verify the output directory 55 | if os.path.exists(args.pathOutputDir): 56 | existing_files = set([os.path.splitext(os.path.basename(x))[0] 57 | for x in os.listdir(args.pathOutputDir) if x[-4:]==".npy"]) 58 | seqNames = [s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0] not in existing_files] 59 | print(f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!") 60 | else: 61 | print("") 62 | print(f"Creating the output directory at {args.pathOutputDir}") 63 | Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True) 64 | writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args) 65 | 66 | # Debug mode 67 | if args.debug: 68 | nsamples=20 69 | print("") 70 | print(f"Debug mode activated, only load {nsamples} samples!") 71 | # shuffle(seqNames) 72 | seqNames = seqNames[:nsamples] 73 | seqInputs = seqInputs[:nsamples] 74 | 75 | # Load 1hot dictionary in case we use it 76 | if seqInputs and not seqInputs[0].split()[0].isdigit(): #multi-group ie. 65-241 77 | assert args.dict is not None, \ 78 | "A dictionary must be given when the quantized outputs is not digits (multi-group case)!" 
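# For reference (hypothetical example lines): single-group quantized units look like
#   file_0001<tab>12,12,31,7,7
# while multi-group units look like
#   file_0001<tab>65-241,65-241,12-30
# and the latter require --dict so that each group pair can be mapped to a one-hot index.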
79 | if args.dict: 80 | print("") 81 | print(f"Loading onehot dictionary from {args.dict}...") 82 | with open(args.dict, "r") as f: 83 | lines = f.read().split("\n") 84 | pair2idx={word.split()[0]: i for i, word in enumerate(lines) if word and not word.startwith("madeupword")} 85 | args.n_units = len(pair2idx) 86 | 87 | # Define onehot_feature_function 88 | def onehot_feature_function(input_sequence): 89 | if args.dict: 90 | indexes_sequence = np.array([pair2idx[item] for item in input_sequence.split()]) 91 | else: 92 | indexes_sequence = np.array([int(item) for item in input_sequence.split()]) 93 | 94 | onehotFeatures = np.eye(args.n_units)[indexes_sequence] 95 | 96 | return onehotFeatures 97 | 98 | # Building features 99 | print("") 100 | print(f"Building 1-hot features and saving outputs to {args.pathOutputDir}...") 101 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 102 | bar.start() 103 | start_time = time() 104 | for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)): 105 | bar.update(index) 106 | 107 | # Computing features 108 | onehot_features = onehot_feature_function(input_seq) 109 | 110 | # Save the outputs 111 | file_name = os.path.splitext(name_seq)[0] + ".txt" 112 | file_out = os.path.join(args.pathOutputDir, file_name) 113 | np.savetxt(file_out, onehot_features) 114 | bar.finish() 115 | print(f"...done {len(seqNames)} files in {time()-start_time} seconds.") 116 | 117 | if __name__ == "__main__": 118 | args = sys.argv[1:] 119 | main(args) 120 | -------------------------------------------------------------------------------- /scripts/build_BERT_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from time import time 8 | import numpy as np 9 | 10 | import torch 11 | 12 | from utils.utils_functions import writeArgs, loadRobertaCheckpoint 13 | 14 | def parseArgs(argv): 15 | # Run parameters 16 | parser = argparse.ArgumentParser(description='Export BERT features from quantized units of audio files.') 17 | parser.add_argument('pathQuantizedUnits', type=str, 18 | help='Path to the quantized units. Each line of the input file must be' 19 | 'of the form file_name[tab]pseudo_units (ex. 
hat 1,1,2,3,4,4)') 20 | parser.add_argument('pathOutputDir', type=str, 21 | help='Path to the output directory.') 22 | parser.add_argument('pathBERTCheckpoint', type=str, 23 | help='Path to the trained fairseq BERT(RoBERTa) model.') 24 | parser.add_argument('--dict', type=str, 25 | help='Path to the dictionary file (dict.txt) used to train the BERT model' 26 | '(if not speficied, look for dict.txt in the model directory)') 27 | parser.add_argument('--hidden_level', type=int, default=-1, 28 | help="Hidden layer of BERT to extract features from (default: -1, last layer).") 29 | parser.add_argument('--debug', action='store_true', 30 | help="Load only a very small amount of files for " 31 | "debugging purposes.") 32 | parser.add_argument('--cpu', action='store_true', 33 | help="Run on a cpu machine.") 34 | return parser.parse_args(argv) 35 | 36 | def main(argv): 37 | # Args parser 38 | args = parseArgs(argv) 39 | 40 | print("=============================================================") 41 | print(f"Building BERT features from {args.pathQuantizedUnits}") 42 | print("=============================================================") 43 | 44 | # Load input file 45 | print("") 46 | print(f"Reading input file from {args.pathQuantizedUnits}") 47 | seqNames = [] 48 | seqInputs = [] 49 | with open(args.pathQuantizedUnits, 'r') as f: 50 | for line in f: 51 | file_name, file_seq = line.strip().split("\t") 52 | # Convert sequence to the desired input form 53 | file_seq = file_seq.replace(",", " ") 54 | # Add to lists 55 | seqNames.append(file_name) 56 | seqInputs.append(file_seq) 57 | print(f"Found {len(seqNames)} sequences!") 58 | 59 | # Verify the output directory 60 | if os.path.exists(args.pathOutputDir): 61 | existing_files = set([os.path.splitext(os.path.basename(x))[0] 62 | for x in os.listdir(args.pathOutputDir) if x[-4:]==".npy"]) 63 | seqNames = [s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0] not in existing_files] 64 | print(f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!") 65 | else: 66 | print("") 67 | print(f"Creating the output directory at {args.pathOutputDir}") 68 | Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True) 69 | writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args) 70 | 71 | # Debug mode 72 | if args.debug: 73 | nsamples=20 74 | print("") 75 | print(f"Debug mode activated, only load {nsamples} samples!") 76 | # shuffle(seqNames) 77 | seqNames = seqNames[:nsamples] 78 | seqInputs = seqInputs[:nsamples] 79 | 80 | # Load BERT model 81 | if args.dict is None: 82 | pathData = os.path.dirname(args.pathBERTCheckpoint) 83 | else: 84 | pathData = os.path.dirname(args.dict) 85 | assert os.path.exists(os.path.join(pathData, "dict.txt")), \ 86 | f"Dictionary file (dict.txt) not found in {pathData}" 87 | print("") 88 | print(f"Loading RoBERTa model from {args.pathBERTCheckpoint}...") 89 | print(f"Path data {pathData}") 90 | roberta = loadRobertaCheckpoint( 91 | args.pathBERTCheckpoint, 92 | pathData, 93 | from_pretrained=False) 94 | roberta.eval() # disable dropout (or leave in train mode to finetune) 95 | if not args.cpu: 96 | roberta.cuda() 97 | print("Model loaded !") 98 | 99 | # Define BERT_feature_function 100 | def BERT_feature_function(input_sequence, n_hidden=-1): 101 | sentence_tokens = roberta.task.source_dictionary.encode_line( 102 | " " + input_sequence, 103 | append_eos=True, 104 | add_if_not_exist=False).type(torch.LongTensor) 105 | if not args.cpu: 106 | 
sentence_tokens = sentence_tokens.cuda() 107 | 108 | with torch.no_grad(): 109 | outputs = roberta.extract_features(sentence_tokens, return_all_hiddens=True) 110 | 111 | return outputs[n_hidden].squeeze(0).float().cpu().numpy() 112 | 113 | # Building features 114 | print("") 115 | print(f"Building BERT features and saving outputs to {args.pathOutputDir}...") 116 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 117 | bar.start() 118 | start_time = time() 119 | for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)): 120 | bar.update(index) 121 | 122 | # Computing features 123 | BERT_features = BERT_feature_function(input_seq, n_hidden=args.hidden_level) 124 | 125 | # Save the outputs 126 | file_name = os.path.splitext(name_seq)[0] + ".txt" 127 | file_out = os.path.join(args.pathOutputDir, file_name) 128 | np.savetxt(file_out, BERT_features) 129 | bar.finish() 130 | print(f"...done {len(seqNames)} files in {time()-start_time} seconds.") 131 | 132 | if __name__ == "__main__": 133 | args = sys.argv[1:] 134 | main(args) 135 | -------------------------------------------------------------------------------- /scripts/build_CPC_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from time import time 8 | import numpy as np 9 | 10 | from cpc.dataset import findAllSeqs 11 | from cpc.feature_loader import buildFeature, FeatureModule, loadModel 12 | 13 | from utils.utils_functions import writeArgs, loadCPCFeatureMaker 14 | 15 | def parseArgs(argv): 16 | # Run parameters 17 | parser = argparse.ArgumentParser(description='Export CPC features from audio files.') 18 | parser.add_argument('pathCPCCheckpoint', type=str, 19 | help='Path to the CPC checkpoint.') 20 | parser.add_argument('pathDB', type=str, 21 | help='Path to the dataset that we want to quantize.') 22 | parser.add_argument('pathOutputDir', type=str, 23 | help='Path to the output directory.') 24 | parser.add_argument('--file_extension', type=str, default="wav", 25 | help="Extension of the audio files in the dataset (default: wav).") 26 | parser.add_argument('--get_encoded', type=bool, default=False, 27 | help='If True, get the outputs of the encoder layer only (default: False).') 28 | parser.add_argument('--gru_level', type=int, default=-1, 29 | help='Hidden level of the LSTM autoregressive model to be taken' 30 | '(default: -1, last layer).') 31 | parser.add_argument('--max_size_seq', type=int, default=64000, 32 | help='Maximal number of frames to consider in each chunk' 33 | 'when computing CPC features (defaut: 64000).') 34 | parser.add_argument('--seq_norm', type=bool, default=False, 35 | help='If True, normalize the output along the time' 36 | 'dimension to get chunks of mean zero and var 1 (default: False).') 37 | parser.add_argument('--strict', type=bool, default=True, 38 | help='If True, each batch of feature ' 39 | 'will contain exactly max_size_seq frames (defaut: True).') 40 | parser.add_argument('--debug', action='store_true', 41 | help="Load only a very small amount of files for " 42 | "debugging purposes.") 43 | parser.add_argument('--cpu', action='store_true', 44 | help="Run on a cpu machine.") 45 | return parser.parse_args(argv) 46 | 47 | def main(argv): 48 | # Args parser 49 | args = parseArgs(argv) 50 | 51 | print("=============================================================") 52 | print(f"Building CPC features from {args.pathDB}") 53 | 
print("=============================================================") 54 | 55 | # Find all sequences 56 | print("") 57 | print(f"Looking for all {args.file_extension} files in {args.pathDB}") 58 | seqNames, _ = findAllSeqs(args.pathDB, 59 | speaker_level=1, 60 | extension=args.file_extension, 61 | loadCache=True) 62 | if len(seqNames) == 0 or not os.path.splitext(seqNames[0][-1])[1].endswith(args.file_extension): 63 | print(f"Seems like the _seq_cache.txt does not contain the correct extension, reload the file list") 64 | seqNames, _ = findAllSeqs(args.pathDB, 65 | speaker_level=1, 66 | extension=args.file_extension, 67 | loadCache=False) 68 | print(f"Done! Found {len(seqNames)} files!") 69 | 70 | # Verify the output directory 71 | if os.path.exists(args.pathOutputDir): 72 | existing_files = set([os.path.splitext(os.path.basename(x))[0] 73 | for x in os.listdir(args.pathOutputDir) if x[-4:]==".npy"]) 74 | seqNames = [s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0] not in existing_files] 75 | print(f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!") 76 | else: 77 | print("") 78 | print(f"Creating the output directory at {args.pathOutputDir}") 79 | Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True) 80 | writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args) 81 | 82 | # Debug mode 83 | if args.debug: 84 | nsamples=20 85 | print("") 86 | print(f"Debug mode activated, only load {nsamples} samples!") 87 | # shuffle(seqNames) 88 | seqNames = seqNames[:nsamples] 89 | 90 | # Load CPC feature maker 91 | print("") 92 | print(f"Loading CPC featureMaker from {args.pathCPCCheckpoint}") 93 | featureMaker = loadCPCFeatureMaker( 94 | args.pathCPCCheckpoint, 95 | gru_level = args.gru_level, 96 | get_encoded = args.get_encoded, 97 | keep_hidden = True) 98 | featureMaker.eval() 99 | if not args.cpu: 100 | featureMaker.cuda() 101 | print("CPC FeatureMaker loaded!") 102 | 103 | # Define CPC_feature_function 104 | def CPC_feature_function(x): 105 | CPC_features = buildFeature(featureMaker, x, 106 | seqNorm=args.seq_norm, 107 | strict=args.strict, 108 | maxSizeSeq=args.max_size_seq) 109 | return CPC_features.squeeze(0).float().cpu().numpy() 110 | 111 | # Building features 112 | print("") 113 | print(f"Building CPC features and saving outputs to {args.pathOutputDir}...") 114 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 115 | bar.start() 116 | start_time = time() 117 | for index, vals in enumerate(seqNames): 118 | bar.update(index) 119 | 120 | file_path = vals[1] 121 | file_path = os.path.join(args.pathDB, file_path) 122 | 123 | # Computing features 124 | CPC_features = CPC_feature_function(file_path) 125 | 126 | # Save the outputs 127 | file_name = os.path.splitext(os.path.basename(file_path))[0] + ".txt" 128 | file_out = os.path.join(args.pathOutputDir, file_name) 129 | np.savetxt(file_out, CPC_features) 130 | bar.finish() 131 | print(f"...done {len(seqNames)} files in {time()-start_time} seconds.") 132 | 133 | if __name__ == "__main__": 134 | args = sys.argv[1:] 135 | main(args) -------------------------------------------------------------------------------- /scripts/build_LSTM_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from time import time 8 | import numpy as np 9 | from copy import deepcopy 10 | 11 | import torch 12 | 
13 | from utils.utils_functions import writeArgs, loadLSTMLMCheckpoint 14 | 15 | def parseArgs(argv): 16 | # Run parameters 17 | parser = argparse.ArgumentParser(description='Export LSTM features from quantized units of audio files.') 18 | parser.add_argument('pathQuantizedUnits', type=str, 19 | help='Path to the quantized units. Each line of the input file must be' 20 | 'of the form file_name[tab]pseudo_units (ex. hat 1,1,2,3,4,4)') 21 | parser.add_argument('pathOutputDir', type=str, 22 | help='Path to the output directory.') 23 | parser.add_argument('pathLSTMCheckpoint', type=str, 24 | help='Path to the trained fairseq lstm_lm model.') 25 | parser.add_argument('--dict', type=str, 26 | help='Path to the dictionary file (dict.txt) used to train the LSTM LM model' 27 | '(if not speficied, look for dict.txt in the model directory)') 28 | parser.add_argument('--hidden_level', type=int, default=-1, 29 | help="Hidden layer of BERT to extract features from (default: -1, last layer).") 30 | parser.add_argument('--debug', action='store_true', 31 | help="Load only a very small amount of files for " 32 | "debugging purposes.") 33 | parser.add_argument('--cpu', action='store_true', 34 | help="Run on a cpu machine.") 35 | return parser.parse_args(argv) 36 | 37 | def main(argv): 38 | # Args parser 39 | args = parseArgs(argv) 40 | 41 | print("=============================================================") 42 | print(f"Building BERT features from {args.pathQuantizedUnits}") 43 | print("=============================================================") 44 | 45 | # Load input file 46 | print("") 47 | print(f"Reading input file from {args.pathQuantizedUnits}") 48 | seqNames = [] 49 | seqInputs = [] 50 | with open(args.pathQuantizedUnits, 'r') as f: 51 | for line in f: 52 | file_name, file_seq = line.strip().split("\t") 53 | # Convert sequence to the desired input form 54 | file_seq = file_seq.replace(",", " ") 55 | # Add to lists 56 | seqNames.append(file_name) 57 | seqInputs.append(file_seq) 58 | print(f"Found {len(seqNames)} sequences!") 59 | 60 | # Verify the output directory 61 | if os.path.exists(args.pathOutputDir): 62 | existing_files = set([os.path.splitext(os.path.basename(x))[0] 63 | for x in os.listdir(args.pathOutputDir) if x[-4:]==".npy"]) 64 | seqNames = [s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0] not in existing_files] 65 | print(f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!") 66 | else: 67 | print("") 68 | print(f"Creating the output directory at {args.pathOutputDir}") 69 | Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True) 70 | writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args) 71 | 72 | # Debug mode 73 | if args.debug: 74 | nsamples=20 75 | print("") 76 | print(f"Debug mode activated, only load {nsamples} samples!") 77 | # shuffle(seqNames) 78 | seqNames = seqNames[:nsamples] 79 | seqInputs = seqInputs[:nsamples] 80 | 81 | # Load LSTM model 82 | if args.dict is None: 83 | pathData = os.path.dirname(args.pathLSTMCheckpoint) 84 | else: 85 | pathData = os.path.dirname(args.dict) 86 | assert os.path.exists(os.path.join(pathData, "dict.txt")), \ 87 | f"Dictionary file (dict.txt) not found in {pathData}" 88 | print("") 89 | print(f"Loading LSTM model from {args.pathLSTMCheckpoint}...") 90 | print(f"Path data {pathData}") 91 | model, task = loadLSTMLMCheckpoint( 92 | args.pathLSTMCheckpoint, 93 | pathData) 94 | model.eval() # disable dropout (or leave in train mode to 
finetune) 95 | if not args.cpu: 96 | model.cuda() 97 | print("Model loaded !") 98 | 99 | # Define LSTM_feature_function 100 | def LSTM_feature_function(input_sequence, n_hidden=-1): 101 | # Get the number of layers 102 | num_layers = len(model.decoder.layers) 103 | assert abs(n_hidden) <= num_layers, \ 104 | "absolute value of n_hidden must be less than or equal to the number of hidden layers = {}".format(num_layers) 105 | 106 | if n_hidden < 0: 107 | n_hidden = num_layers + 1 + n_hidden 108 | 109 | # Get input tensor 110 | input_tensor = task.source_dictionary.encode_line( 111 | " " + input_sequence, 112 | append_eos=True, 113 | add_if_not_exist=False).type(torch.LongTensor).unsqueeze(0) 114 | if not args.cpu: 115 | input_tensor = input_tensor.cuda() 116 | 117 | # Get the output 118 | if n_hidden == 0: # Take the embedding layer 119 | with torch.no_grad(): 120 | output_tensor = model.decoder.embed_tokens(input_tensor) 121 | 122 | else: 123 | decoder_clone = deepcopy(model.decoder) 124 | 125 | # We don't take the final fc features 126 | decoder_clone.fc_out = torch.nn.Identity() 127 | decoder_clone.additional_fc = torch.nn.Identity() 128 | 129 | # Restrict the number of hiddden layers to n_hidden 130 | decoder_clone.layers = decoder_clone.layers[:n_hidden] 131 | 132 | with torch.no_grad(): 133 | output_tensor = decoder_clone(input_tensor)[0] 134 | 135 | return output_tensor[0].data.cpu().numpy() 136 | 137 | # Building features 138 | print("") 139 | print(f"Building LSTM features and saving outputs to {args.pathOutputDir}...") 140 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 141 | bar.start() 142 | start_time = time() 143 | for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)): 144 | bar.update(index) 145 | 146 | # Computing features 147 | LSTM_features = LSTM_feature_function(input_seq, n_hidden=args.hidden_level) 148 | 149 | # Save the outputs 150 | file_name = os.path.splitext(name_seq)[0] + ".txt" 151 | file_out = os.path.join(args.pathOutputDir, file_name) 152 | np.savetxt(file_out, LSTM_features) 153 | bar.finish() 154 | print(f"...done {len(seqNames)} files in {time()-start_time} seconds.") 155 | 156 | if __name__ == "__main__": 157 | args = sys.argv[1:] 158 | main(args) 159 | -------------------------------------------------------------------------------- /scripts/compute_proba_BERT.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from os.path import exists, join, basename, dirname, abspath 3 | import sys 4 | import argparse 5 | 6 | from utils.utils_functions import loadRobertaCheckpoint 7 | from utils.lm_scoring import compute_proba_BERT_mlm_span 8 | 9 | def parseArgs(argv): 10 | # Run parameters 11 | parser = argparse.ArgumentParser(description='Compute pseudo log-probabilities of quantized units with a trained BERT model.') 12 | parser.add_argument('pathQuantizedUnits', type=str, 13 | help='Path to the quantized units. Each line of the input file must be' 14 | 'of the form file_name[tab]pseudo_units (ex. 
hat 1,1,2,3,4,4)') 15 | parser.add_argument('pathOutputFile', type=str, 16 | help='Path to the output file containing scores.') 17 | parser.add_argument('pathBERTCheckpoint', type=str, 18 | help='Path to the trained fairseq BERT(RoBERTa) model.') 19 | parser.add_argument('--dict', type=str, 20 | help='Path to the dictionary file (dict.txt) used to train the BERT model' 21 | '(if not speficied, look for dict.txt in the model directory)') 22 | parser.add_argument('--decoding_span_size', type=int, default=15, 23 | help='The decoding span size (M_d) parameter used to compute' 24 | 'the pseudo-probability (default: 15).') 25 | parser.add_argument('--temporal_sliding_size', type=int, default=5, 26 | help='The temporal sliding size (Delta_t) parameter used to' 27 | 'compute the pseudo-probability (defaut: 5).') 28 | parser.add_argument('--no_overlap', action="store_true", 29 | help='If specified, not overlap the masking spans when computing the' 30 | 'pseudo-probability (temporal_sliding_size is set to decoding_span_size)') 31 | parser.add_argument('--batchsen_size', type=int, default=32, 32 | help='The number of sentences to be considered in each outer batch' 33 | '(batch of sentences) (defaut: 32). Decrease this for longer sentences (BLIMP).') 34 | parser.add_argument('--inner_batch_size', type=int, default=128, 35 | help='For each sentence, the model has to compute the outputs of many different' 36 | 'masked sequences. This parameter controls the size of the inner batches for' 37 | 'each outer batch (defaut: 128). Decrease this for longer sentences (BLIMP).') 38 | parser.add_argument('--cpu', action='store_true', 39 | help="Run on a cpu machine.") 40 | parser.add_argument('--resume', action='store_true', 41 | help="Continue to compute score if the output file already exists.") 42 | return parser.parse_args(argv) 43 | 44 | def main(argv): 45 | # Args parser 46 | args = parseArgs(argv) 47 | 48 | # Convert to absolute paths to get rid of exceptions 49 | args.pathQuantizedUnits = abspath(args.pathQuantizedUnits) 50 | args.pathOutputFile = abspath(args.pathOutputFile) 51 | args.pathBERTCheckpoint = abspath(args.pathBERTCheckpoint) 52 | if args.dict is not None: 53 | args.dict = abspath(args.dict) 54 | 55 | # Load input file 56 | print("") 57 | print(f"Reading input file from {args.pathQuantizedUnits}") 58 | input_file_names = [] 59 | intput_file_seqs = [] 60 | with open(args.pathQuantizedUnits, 'r') as f: 61 | for line in f: 62 | file_name, file_seq = line.strip().split("\t") 63 | # Convert sequence to the desired input form 64 | file_seq = file_seq.replace(",", " ") 65 | # Add to lists 66 | input_file_names.append(file_name) 67 | intput_file_seqs.append(file_seq) 68 | print(f"Found {len(input_file_names)} sequences!") 69 | 70 | # Check if directory exists 71 | pathOutputDir = dirname(args.pathOutputFile) 72 | if pathOutputDir and not exists(pathOutputDir): 73 | print("") 74 | print(f"Creating the output directory at {pathOutputDir}") 75 | Path(pathOutputDir).mkdir(parents=True, exist_ok=True) 76 | # writeArgs(join(pathOutputDir, "_info_args.json"), args) 77 | 78 | # Continue 79 | if args.resume: 80 | if exists(args.pathOutputFile): 81 | existing_file_names = [] 82 | with open(args.pathOutputFile, 'r') as f: 83 | lines = [line for line in f] 84 | for line in lines: 85 | file_name, score = line.strip().split() 86 | existing_file_names.append(file_name) 87 | assert input_file_names[:len(existing_file_names)] == existing_file_names, \ 88 | "The file names in the existing output file do not match the 
89 |             input_file_names = input_file_names[len(existing_file_names):]
90 |             input_file_seqs = input_file_seqs[len(existing_file_names):]
91 |             print(f"Found existing output file, continue to compute scores of {len(input_file_seqs)} sequences left!")
92 |     else:
93 |         assert not exists(args.pathOutputFile), \
94 |             f"Output file {args.pathOutputFile} already exists!!! If you want to continue computing scores, please check the --resume option."
95 | 
96 |     assert len(input_file_seqs) > 0, \
97 |         "No file to compute probability!"
98 | 
99 |     # Load BERT model
100 |     if args.dict is None:
101 |         pathData = dirname(args.pathBERTCheckpoint)
102 |     else:
103 |         pathData = dirname(args.dict)
104 |     assert exists(join(pathData, "dict.txt")), \
105 |         f"Dictionary file (dict.txt) not found in {pathData}"
106 |     print("")
107 |     print(f"Loading RoBERTa model from {args.pathBERTCheckpoint}...")
108 |     print(f"Path data {pathData}")
109 |     roberta = loadRobertaCheckpoint(
110 |         args.pathBERTCheckpoint,
111 |         pathData,
112 |         from_pretrained=False)
113 |     roberta.eval()  # disable dropout (or leave in train mode to finetune)
114 |     print("Model loaded!")
115 | 
116 |     # Run and save outputs
117 |     print("")
118 |     print(f"Computing log-probabilities and saving results to {args.pathOutputFile}...")
119 |     _ = compute_proba_BERT_mlm_span(
120 |         input_file_seqs, roberta, tokenized=True,
121 |         decoding_span_size=args.decoding_span_size, temporal_sliding_size=args.temporal_sliding_size,
122 |         span_overlap=not args.no_overlap,
123 |         batchsen_size=args.batchsen_size, inner_batch_size=args.inner_batch_size,
124 |         gpu=not args.cpu, print_tokens=False, verbose=False, print_shape_statistics=False,
125 |         save_to=args.pathOutputFile, file_names=input_file_names)
126 | 
127 | if __name__ == "__main__":
128 |     args = sys.argv[1:]
129 |     main(args)
130 | 
--------------------------------------------------------------------------------
/scripts/compute_proba_LSTM.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from os.path import exists, join, basename, dirname, abspath
3 | import sys
4 | import argparse
5 | 
6 | from utils.utils_functions import loadLSTMLMCheckpoint
7 | from utils.lm_scoring import compute_proba_LSTM
8 | 
9 | def parseArgs(argv):
10 |     # Run parameters
11 |     parser = argparse.ArgumentParser(description='Compute pseudo log-probabilities of quantized units with a trained LSTM model.')
12 |     parser.add_argument('pathQuantizedUnits', type=str,
13 |                         help='Path to the quantized units. Each line of the input file must be '
14 |                         'of the form file_name[tab]pseudo_units (ex. hat 1,1,2,3,4,4)')
15 |     parser.add_argument('pathOutputFile', type=str,
16 |                         help='Path to the output file containing scores.')
17 |     parser.add_argument('pathLSTMCheckpoint', type=str,
18 |                         help='Path to the trained fairseq LSTM model.')
19 |     parser.add_argument('--dict', type=str,
20 |                         help='Path to the dictionary file (dict.txt) used to train the LSTM model '
21 |                         '(if not specified, look for dict.txt in the model directory)')
22 |     parser.add_argument('--batchSize', type=int, default=128,
23 |                         help='The number of sentences to be in each batch (default: 128)')
24 |     parser.add_argument('--cpu', action='store_true',
25 |                         help="Run on a cpu machine.")
26 |     parser.add_argument('--resume', action='store_true',
27 |                         help="Continue to compute score if the output file already exists.")
28 |     return parser.parse_args(argv)
29 | 
30 | def main(argv):
31 |     # Args parser
32 |     args = parseArgs(argv)
33 | 
34 |     # Convert to absolute paths to get rid of exceptions
35 |     args.pathQuantizedUnits = abspath(args.pathQuantizedUnits)
36 |     args.pathOutputFile = abspath(args.pathOutputFile)
37 |     args.pathLSTMCheckpoint = abspath(args.pathLSTMCheckpoint)
38 |     if args.dict is not None:
39 |         args.dict = abspath(args.dict)
40 | 
41 |     # Load input file
42 |     print("")
43 |     print(f"Reading input file from {args.pathQuantizedUnits}")
44 |     input_file_names = []
45 |     input_file_seqs = []
46 |     with open(args.pathQuantizedUnits, 'r') as f:
47 |         for line in f:
48 |             file_name, file_seq = line.strip().split("\t")
49 |             # Convert sequence to the desired input form
50 |             file_seq = file_seq.replace(",", " ")
51 |             # Add to lists
52 |             input_file_names.append(file_name)
53 |             input_file_seqs.append(file_seq)
54 |     print(f"Found {len(input_file_names)} sequences!")
55 | 
56 |     # Check if directory exists
57 |     pathOutputDir = dirname(args.pathOutputFile)
58 |     if pathOutputDir and not exists(pathOutputDir):
59 |         print("")
60 |         print(f"Creating the output directory at {pathOutputDir}")
61 |         Path(pathOutputDir).mkdir(parents=True, exist_ok=True)
62 |     # writeArgs(join(pathOutputDir, "_info_args.json"), args)
63 | 
64 |     # Continue
65 |     if args.resume:
66 |         if exists(args.pathOutputFile):
67 |             existing_file_names = []
68 |             with open(args.pathOutputFile, 'r') as f:
69 |                 lines = [line for line in f]
70 |             for line in lines:
71 |                 file_name, score = line.strip().split()
72 |                 existing_file_names.append(file_name)
73 |             assert input_file_names[:len(existing_file_names)] == existing_file_names, \
74 |                 "The file names in the existing output file do not match the input file!!"
75 |             input_file_names = input_file_names[len(existing_file_names):]
76 |             input_file_seqs = input_file_seqs[len(existing_file_names):]
77 |             print(f"Found existing output file, continue to compute scores of {len(input_file_seqs)} sequences left!")
78 |     else:
79 |         assert not exists(args.pathOutputFile), \
80 |             f"Output file {args.pathOutputFile} already exists!!! If you want to continue computing scores, please check the --resume option."
81 | 
82 |     # Load LSTM model
83 |     if args.dict is None:
84 |         pathData = dirname(args.pathLSTMCheckpoint)
85 |     else:
86 |         pathData = dirname(args.dict)
87 |     assert exists(join(pathData, "dict.txt")), \
88 |         f"Dictionary file (dict.txt) not found in {pathData}"
89 |     print("")
90 |     print(f"Loading LSTM model from {args.pathLSTMCheckpoint}...")
91 |     print(f"Path data {pathData}")
92 |     model, task = loadLSTMLMCheckpoint(args.pathLSTMCheckpoint, pathData)
93 |     model.eval()
94 |     print("Model loaded!")
95 | 
96 |     # Run and save outputs
97 |     print("")
98 |     print(f"Computing log-probabilities and saving results to {args.pathOutputFile}...")
99 |     _ = compute_proba_LSTM(
100 |         input_file_seqs, model, task,
101 |         batch_size=args.batchSize, gpu=not args.cpu,
102 |         verbose=False, print_tokens=False,
103 |         save_to=args.pathOutputFile, file_names=input_file_names)
104 | 
105 | if __name__ == "__main__":
106 |     args = sys.argv[1:]
107 |     main(args)
--------------------------------------------------------------------------------
/scripts/utils/utils_functions.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 | 
4 | import torch
5 | from cpc.feature_loader import FeatureModule, loadModel
6 | from cpc.criterion.clustering import kMeanCluster
7 | 
8 | #from fairseq import tasks, checkpoint_utils
9 | #from fairseq.models.roberta import RobertaModel, RobertaHubInterface
10 | 
11 | def readArgs(pathArgs):
12 |     print(f"Loading args from {pathArgs}")
13 |     with open(pathArgs, 'r') as file:
14 |         args = argparse.Namespace(**json.load(file))
15 |     return args
16 | 
17 | def writeArgs(pathArgs, args):
18 |     print(f"Writing args to {pathArgs}")
19 |     with open(pathArgs, 'w') as file:
20 |         json.dump(vars(args), file, indent=2)
21 | 
22 | def loadCPCFeatureMaker(pathCheckpoint, gru_level=-1, get_encoded=False, keep_hidden=True, load_nullspace=False):
23 |     """
24 |     Load CPC Feature Maker from CPC checkpoint file.
25 |     """
26 |     # Set LSTM level
27 |     if gru_level is not None and gru_level > 0:
28 |         updateConfig = argparse.Namespace(nLevelsGRU=gru_level)
29 |     else:
30 |         updateConfig = None
31 | 
32 |     # Load CPC model
33 |     model, nHiddenGar, nHiddenEncoder = loadModel([pathCheckpoint], updateConfig=updateConfig, load_nullspace=load_nullspace)
34 | 
35 |     # Keep hidden units at LSTM layers on sequential batches
36 |     if load_nullspace:
37 |         model.cpc.gAR.keepHidden = keep_hidden
38 |     else:
39 |         model.gAR.keepHidden = keep_hidden
40 | 
41 |     # Build CPC Feature Maker from CPC model
42 |     featureMaker = FeatureModule(model, get_encoded=get_encoded)
43 | 
44 |     return featureMaker
45 | 
46 | def loadClusterModule(pathCheckpoint, norm_vec_len=False):
47 |     """
48 |     Load CPC Clustering Module from Clustering checkpoint file.
49 |     """
50 |     state_dict = torch.load(pathCheckpoint, map_location=torch.device('cpu'))
51 |     clusterModule = kMeanCluster(torch.zeros(1, state_dict["n_clusters"], state_dict["dim"]), norm_vec_len=norm_vec_len)
52 |     clusterModule.load_state_dict(state_dict["state_dict"])
53 |     return clusterModule
54 | 
55 | #def loadRobertaCheckpoint(pathBERTCheckpoint, pathData, from_pretrained=False):
56 | #    """
57 | #    Load Roberta model from checkpoint.
58 | #    If loading a pretrained model from fairseq, set from_pretrained=True.
59 | #    """
60 | #    if from_pretrained: # Require connection to download bpe, possible errors for trained checkpoint that contains cfg
61 | #        roberta = RobertaModel.from_pretrained(dirname(pathBERTCheckpoint), basename(pathBERTCheckpoint), pathData)
62 | #    else:
63 | #        # Set up the args Namespace
64 | #        model_args = argparse.Namespace(
65 | #            task='masked_lm',
66 | #            seed=-1,
67 | #            output_dictionary_size=-1,
68 | #            data=pathData,
69 | #            path=pathBERTCheckpoint
70 | #            )
71 | #
72 | #        # Setup task
73 | #        task = tasks.setup_task(model_args)
74 | #
75 | #        # Load model
76 | #        models, _model_args = checkpoint_utils.load_model_ensemble([model_args.path], task=task)
77 | #        model = models[0]
78 | #
79 | #        # Wrap-up to RobertaHubInterface (to be consistent with RobertaModel.from_pretrained)
80 | #        roberta = RobertaHubInterface(_model_args, task, model)
81 | #
82 | #    return roberta
83 | 
84 | #def loadLSTMLMCheckpoint(pathLSTMCheckpoint, pathData):
85 | #    """
86 | #    Load lstm_lm model from checkpoint.
87 | #    """
88 | #    # Set up the args Namespace
89 | #    model_args = argparse.Namespace(
90 | #        task='language_modeling',
91 | #        output_dictionary_size=-1,
92 | #        data=pathData,
93 | #        path=pathLSTMCheckpoint
94 | #    )
95 | #
96 | #    # Setup task
97 | #    task = tasks.setup_task(model_args)
98 | #
99 | #    # Load model
100 | #    models, _model_args = checkpoint_utils.load_model_ensemble([model_args.path], task=task)
101 | #    model = models[0]
102 | #
103 | #    return model, task
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | from setuptools import setup, find_packages
6 | from setuptools.extension import Extension
7 | from Cython.Build import cythonize
8 | import numpy
9 | 
10 | extensions = [
11 |     Extension(
12 |         "cpc.eval.ABX.dtw",
13 |         ["cpc/eval/ABX/dtw.pyx"],
14 |         include_dirs=[numpy.get_include()],
15 |     ),
16 | ]
17 | 
18 | setup(
19 |     name='CPC_audio',
20 |     version='1.0',
21 |     description='An implementation of the contrastive predictive coding (CPC) '
22 |                 'training method for audio data.',
23 |     author='Facebook AI Research',
24 |     packages=find_packages(),
25 |     classifiers=["License :: OSI Approved :: MIT License",
26 |                  "Intended Audience :: Science/Research",
27 |                  "Topic :: Scientific/Engineering",
28 |                  "Programming Language :: Python"],
29 |     ext_modules=cythonize(extensions, language_level="3")
30 | )
31 | 
--------------------------------------------------------------------------------
/train_ls100.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e
4 | set -x
5 | 
6 | RVERB="-v --dry-run"
7 | RVERB=""
8 | CPC_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
9 | SAVE_DIR="$(
10 | python - "$@" << END
11 | if 1:
12 |     import argparse
13 |     parser = argparse.ArgumentParser(description='Extract the --pathCheckpoint argument.')
14 |     parser.add_argument('--pathCheckpoint')
15 |     args, _ = parser.parse_known_args()
16 |     print(args.pathCheckpoint)
17 | END
18 | )"
19 | 
20 | mkdir -p ${SAVE_DIR}/code
21 | rsync --exclude '.*' \
22 |     --exclude data \
23 |     --exclude pretrained_models \
24 |     --exclude '__pycache__' \
25 |     --exclude '*runs*' \
26 |     --exclude '*.pyc' \
27 |     --exclude '*.ipynb' \
28 |     --filter=':- .gitignore' \
29 |     $RVERB -lrpt $CPC_DIR/ ${SAVE_DIR}/code/
30 | 
31 | echo $0 "$@" >> ${SAVE_DIR}/out.txt
32 | exec python -u cpc/train.py \
33 |     --pathDB /pio/data/zerospeech2021/LibriSpeech-wav/train-clean-100 \
34 |     --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt \
35 |     --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt \
36 |     --file_extension .wav \
37 |     "$@" 2>&1 | tee -ai ${SAVE_DIR}/out.txt
--------------------------------------------------------------------------------
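Usage sketch (illustrative only, not part of the repository files above; all paths are hypothetical placeholders): train_ls100.sh parses only --pathCheckpoint, snapshots the code tree and log there, and forwards every argument to cpc/train.py, while scripts/compute_proba_LSTM.py expects a quantized-units file, an output path, and a fairseq LSTM checkpoint, so minimal invocations might look like the commands below. Note that the scoring scripts import loadRobertaCheckpoint / loadLSTMLMCheckpoint, which are commented out in scripts/utils/utils_functions.py, so fairseq must be installed and those helpers re-enabled before the second command will run.

    # Hypothetical paths; any extra cpc/train.py flags can be appended after --pathCheckpoint.
    bash train_ls100.sh --pathCheckpoint checkpoints/cpc_ls100

    # Hypothetical paths; scores one pseudo-units file with a trained LSTM language model.
    python scripts/compute_proba_LSTM.py quantized/dev.txt scores/dev_lstm.txt \
        checkpoints/lstm_lm/checkpoint_best.pt --dict checkpoints/lstm_lm/dict.txt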