├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── _train_pro.sh ├── centerpush_nonullspace_phoneme_classification.sh ├── centerpush_nullspace_phoneme_classification.sh ├── cpc ├── README.md ├── __init__.py ├── cpc_default_config.py ├── criterion │ ├── __init__.py │ ├── clustering │ │ ├── __init__.py │ │ ├── clustering.py │ │ ├── clustering_quantization.py │ │ └── clustering_script.py │ ├── criterion.py │ ├── custom_layers.py │ ├── seq_alignment.py │ └── soft_align.py ├── dataset.py ├── eval │ ├── ABX.py │ ├── ABX │ │ ├── __init__.py │ │ ├── abx_group_computation.py │ │ ├── abx_iterators.py │ │ ├── dtw.pyx │ │ ├── test_data │ │ │ ├── 2107.npy │ │ │ ├── 23.npy │ │ │ ├── 407.npy │ │ │ ├── 42.npy │ │ │ ├── dummy_item_file.item │ │ │ └── dummy_item_within.item │ │ └── unit_tests.py │ ├── __init__.py │ ├── build_zeroSpeech_features.py │ ├── common_voices_eval.py │ ├── linear_separability.py │ └── utils │ │ └── adjust_sample_rate.py ├── feature_loader.py ├── model.py ├── stats │ ├── __init__.py │ ├── empty_stat.py │ ├── repr_diff_stat.py │ ├── stat_utils.py │ └── stats_collector.py ├── test_data │ ├── phone_labels.txt │ ├── seq_list.txt │ └── test_db │ │ ├── 2911 │ │ └── 12359 │ │ │ └── 2911-12359-0007.flac │ │ ├── 4051 │ │ └── 11218 │ │ │ └── 4051-11218-0044.flac │ │ ├── 4397 │ │ └── 15668 │ │ │ ├── 4397-15668-0003.flac │ │ │ └── 4397-15668-0007.flac │ │ ├── 5393 │ │ └── 19218 │ │ │ └── 5393-19218-0024.flac │ │ ├── 5678 │ │ ├── 43301 │ │ │ └── 5678-43301-0021.flac │ │ └── 43303 │ │ │ ├── 5678-43303-0024.flac │ │ │ └── 5678-43303-0032.flac │ │ └── 6476 │ │ └── 57446 │ │ └── 6476-57446-0019.flac ├── train.py ├── transformers.py ├── unit_tests.py └── utils │ ├── __init__.py │ ├── capture_loader.py │ ├── misc.py │ └── unit_tests.py ├── cpc_ctc_visualization.ipynb ├── environment.yml ├── experiments ├── train_pro_1gpu.sh ├── train_pro_2gpu.sh └── train_pro_cpcctc_bases.sh ├── finetune_nullspace.sh ├── hubconf.py ├── jch_experiments ├── lineval_ls100.sh ├── run_clustering.sh ├── scripts ├── build_1hot_features.py ├── build_BERT_features.py ├── build_CPC_features.py ├── build_LSTM_features.py ├── compute_proba_BERT.py ├── compute_proba_LSTM.py ├── quantize_audio.py └── utils │ ├── lm_scoring.py │ └── utils_functions.py ├── setup.py └── train_ls100.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | .idea 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at . All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to CPC_audio 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. 
Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## Coding Style 30 | * 2 spaces for indentation rather than tabs 31 | * 80 character line length 32 | * ... 33 | 34 | ## License 35 | By contributing to CPC_audio, you agree that your contributions will be licensed 36 | under the LICENSE file in the root directory of this source tree. 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /_train_pro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for the Prometheus slurm cluster 4 | 5 | set -x 6 | 7 | RVERB="" # =-v 8 | 9 | REMOTE_USER=plgjch 10 | REMOTE_HOST=pro.cyfronet.pl 11 | 12 | # location of the main repository (contains data/) 13 | CPC_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 14 | REMOTE_CPC_DIR=/net/people/plgjch/scratch/CPC_audio 15 | REMOTE_MINICONDA_DIR=/net/archive/groups/plggneurony/os/miniconda3 16 | REMOTE_LIBRISPEECH_DIR=/net/people/plgjch/lscratch/plgjch/LibriSpeech-wav 17 | REMOTE_LIBRISPEECH100_SPLITS=/net/archive/groups/plggneurony/data/librispeech/LibriSpeech100_labels_split 18 | 19 | # top-level directory for experiments 20 | REMOTE_EXPERIMENT_RUNDIR=/net/scratch/people/plgjch/cpc/ 21 | 22 | # adjust the main loop 23 | # (it can go over .yaml files, over hyperparameters, etc. 24 | for PARAMS in \ 25 | "--CPCCTCNumMatched 12 --nPredicts 8 --CPCCTCSkipEnd 0" \ 26 | "--CPCCTCNumMatched 12 --nPredicts 8 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 0" \ 27 | "--CPCCTCNumMatched 12 --nPredicts 8 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 3" \ 28 | ; do 29 | 30 | # low-level directory for experiments 31 | EXP_TAG=remote_pro 32 | PRINT_PARAMS = $(echo $PARAMS | tr -d ' ' | sed -e 's/-\+/_/g') 33 | NAME=cpcctc${PRINT_PARAMS} 34 | DIR=$EXP_TAG/$NAME 35 | EXP_DIR=$REMOTE_EXPERIMENT_RUNDIR/$DIR 36 | 37 | echo $EXP_DIR 38 | 39 | continue 40 | 41 | ssh -q $REMOTE_USER@$REMOTE_HOST mkdir -p $EXP_DIR 42 | 43 | TMP_DIR=`mktemp -d` 44 | mkdir $TMP_DIR/code 45 | # symlink the data from the main dir 46 | 47 | cat > $TMP_DIR/exp_train.sh <&1 | tee -ai $EXP_DIR/lineval_${CP}/out.txt 104 | EOF 105 | 106 | # Transmit the startup script 107 | rsync $RVERB -lrpt -e "ssh -q" $TMP_DIR/ $REMOTE_USER@$REMOTE_HOST:$EXP_DIR/ 108 | 109 | # Transmit the rest 110 | rsync --exclude '.*' \ 111 | --exclude data \ 112 | --exclude pretrained_models \ 113 | --exclude '__pycache__' \ 114 | --exclude '*runs*' \ 115 | --exclude '*.pyc' \ 116 | --exclude '*.ipynb' \ 117 | --filter=':- .gitignore' \ 118 | $RVERB -lrpt -e "ssh -q" $CPC_DIR/ $REMOTE_USER@$REMOTE_HOST:$EXP_DIR/code/ 119 | 120 | ssh -q $REMOTE_USER@$REMOTE_HOST sbatch \ 121 | `#--gres="" --time=00:10:00 -p plgrid-testing` \ 122 | $EXP_DIR/exp_train.sh 123 | 124 | rm -Rf $TMP_DIR 125 | 126 | done 127 | 128 | echo "Queue status" 129 | ssh -q $REMOTE_USER@$REMOTE_HOST squeue 130 | -------------------------------------------------------------------------------- /centerpush_nonullspace_phoneme_classification.sh: -------------------------------------------------------------------------------- 1 | 2 | for deg in 0 0.2 0.3 0.4 0.5 0.6 0.7 3 | do 4 | echo $deg 5 | mkdir ${centerpushDir}/phoneme_classif_nonull_${deg}/ 6 | python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ \ 7 | $zd/LibriSpeech/labels_split/train_split_100.txt \ 8 | $zd/LibriSpeech/labels_split/test_split_100.txt \ 9 | $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt \ 10 | --centerpushFile $zd/checkpoints/CPC-big-kmeans50/clustering_kmeans50/clustering_CPC_big_kmeans50.pt \ 11 | --centerpushDeg $deg \ 12 | --pathCheckpoint ${centerpushDir}/phoneme_classif_nonull_${deg}/ \ 13 | --mode phonemes --max_size_loaded 40000000 --n_process_loader 2 \ 14 | --model cpc --pathPhone $zd/LibriSpeech/alignments2/converted_aligned_phones.txt \ 15 | --gru_level 2 --batchSizeGPU 32 
| tee ${centerpushDir}/phoneme_classif_nonull_${deg}/log.txt
16 | done
--------------------------------------------------------------------------------
/centerpush_nullspace_phoneme_classification.sh:
--------------------------------------------------------------------------------
1 |
2 | for deg in 0 0.2 0.3 0.4 0.5 0.6 0.7
3 | do
4 | echo $deg
5 | mkdir ${centerpushDir}/phoneme_classif_null_${deg}/
6 | python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ \
7 | $zd/LibriSpeech/labels_split/train_split_100.txt \
8 | $zd/LibriSpeech/labels_split/test_split_100.txt \
9 | $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt \
10 | --centerpushFile $cpcClustDir/checkpoints/clustering_CPC_big_kmeans50_nullspace_64/clustering_CPC_big_kmeans50_nullspace_64.pt \
11 | --centerpushDeg $deg \
12 | --pathCheckpoint ${centerpushDir}/phoneme_classif_null_${deg}/ \
13 | --mode phonemes_nullspace --max_size_loaded 40000000 --n_process_loader 2 \
14 | --model cpc --pathPhone $zd/LibriSpeech/alignments2/converted_aligned_phones.txt \
15 | --path_speakers_factorized $nullspaceDir/linear_separability/cpc/gru_level2/cpc_official_speakers_factorized_64/checkpoint_9.pt \
16 | --dim_inter 64 --gru_level 2 --batchSizeGPU 32 | tee ${centerpushDir}/phoneme_classif_null_${deg}/log.txt
17 | done
18 |
19 |
--------------------------------------------------------------------------------
/cpc/README.md:
--------------------------------------------------------------------------------
1 | # Repository's architecture
2 |
3 | train.py : main script
4 |
5 | dataset.py : definition of the LibriSpeech dataset format
6 |
7 | model.py : basic encoders and AR models
8 |
9 | feature_loader.py : tools to load and save a CPC model
10 |
11 | transformers.py : an implementation of transformers
12 |
13 | unit_tests.py : unit tests
14 |
15 | criterion/ : definition of the training criteria. Three criteria are currently available: CPC (unsupervised), speaker classification and phone classification.
16 |
17 | eval/ : evaluation scripts.
18 |
19 | utils/ : system utilities and misc.
20 |
21 |
22 | ## Stats module (initial) description
23 |
24 | The `stats` directory contains utilities for computing statistics. `stats/repr_diff_stat.py` is an example, and `stats/stats_collector.py` aggregates the stats given as arguments to `train.py`; each stat therefore needs to be registered in `stats/stat_utils.py` in the same way as `reprDiffStat` (`stats/repr_diff_stat.py`) is.
25 |
26 | To compute stats during a `train.py` run, use `--captureSetStats`, which takes a value of the form `stat1Name:arg1,arg2,arg3_stat2Name:arg1,arg2` where the args are stat-specific (example: `reprDiff:cosine,ctx_repr,0.05,../reprDiffHistograms`).
27 |
28 | When specified like that (with `--captureSetStats`), stats are computed on the capture dataset, alongside data capturing, every specified number of epochs. It is possible to compute only stats and not capture data, but the capture dataset still needs to be configured as described below under "CPC-CTC data capturing description". Example of how to specify the capture dataset: `--pathCaptureDS /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt --captureEachEpochs 2`.
29 |
30 |
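As an illustration of the `--captureSetStats` value format above (this is not the project's actual parser — that lives in `stats/stats_collector.py` — and the helper name below is invented for this sketch): a spec string splits on `_` into per-stat chunks, and each chunk splits on `:` and `,` into a stat name plus its arguments.

```
def split_stat_spec(spec):
    # "_" separates stats, ":" separates a stat name from its args,
    # "," separates the args themselves.
    stats = []
    for chunk in spec.split("_"):
        name, _, arg_str = chunk.partition(":")
        stats.append((name, arg_str.split(",") if arg_str else []))
    return stats

print(split_stat_spec("reprDiff:cosine,ctx_repr,0.05,../reprDiffHistograms"))
# [('reprDiff', ['cosine', 'ctx_repr', '0.05', '../reprDiffHistograms'])]
```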
31 | ## Linear separability automation description:
32 |
33 | This can be combined with the data capturing described in the section below.
34 |
35 | Several arguments were added to train.py, in the group_supervised_data and group_supervised_metric argument groups; their help strings in train.py also describe them.
36 | - --supervised_classif_metric is the flag that enables the additional linear separability task. Additionally, one or both of --speaker_sep and --path_phone_data should be specified to indicate which linear separabilities to perform: --speaker_sep for speaker classification and --path_phone_data for phoneme classification (this should be the path to the .txt file with phone alignments in their format, mentioned in the main readme of the repo)
37 | - the linear separability task can be run in two modes: either once on the trained checkpoint (--only_classif_metric) or every --linsep_classif_each_epochs epochs during the main CPC training. To automatically perform linsep once per training, e.g. --linsep_classif_each_epochs 180 can be specified (currently linsep is not done at epoch 0)
38 | - the path where to store the logs from the linear separability task needs to be specified with --linsep_logs_dir; additionally, the logging frequency in epochs can be specified with --linsep_task_logging_step, and the logs will be saved under \<--linsep_logs_dir\>/\/phone or \<--linsep_logs_dir\>/\/speaker
39 | - the path where to save the classification models (the state from the best epoch of each separate classification training performed after X epochs of CPC training) can be specified with --linsep_checkpoint_dir, and those will be saved under \/\/phone or \/\/speaker
40 | - the number of epochs to run each linear separability task for can be specified with --linsep_n_epoch
41 | - additional linear separability task parameters can be specified with:
42 | - parameters of the Adam optimizer: --linsep_lr, --linsep_beta1, --linsep_beta2, --linsep_epsilon
43 | - --phone_get_encoded to use the CNN encodings for classification instead of the produced contexts (this only applies to phoneme classification with the regular loss; the other classifiers don't support it, so it doesn't affect them; combining it with the CTC classification loss (below) is not supported and will raise an assertion error)
44 | - --CTCphones to use a CTC-based loss for classification instead of the 'regular' loss, which assumes representations/contexts are aligned with the audio data
45 | - --linsep_net_layers to use a bigger fully connected network during classification training (default: 1 - then there is just one matrix without activations; each layer has classification_class_number neurons, except for the CTC-based loss, which has one additional neuron in the last layer for the blank symbol)
46 | - --linsepBatchSizeGPU can be specified to choose the batch size for the linear separability task; this is separate from the batch size for CPC training
47 |
48 | Example run combined with data capturing (some real training + capture):
49 |
50 | ```
51 | python train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 \
52 | --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt \
53 | --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt \
54 | --supervised_classif_metric \
55 | --speaker_sep --path_phone_data /pio/scratch/1/i283340/MGR/zs/phones/converted_aligned_phones.txt \
56 | --linsepBatchSizeGPU 32 --linsep_n_epoch 12 \
57 | --linsep_logs_dir /pio/scratch/1/i283340/MGR/zs/linsep/logs2-001 \
58 | --linsep_checkpoint_dir /pio/scratch/1/i283340/MGR/zs/linsep/checkp2-001 \
59 | --linsep_classif_each_epochs 10 \
60 | --pathCaptureDS /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt \
61 | --captureDStotNr 100 --captureEachEpochs 10 \
62 | --pathCaptureSave /pio/scratch/1/i283340/MGR/zs/capture/try2-001 \
63 | --captureConvRepr --captureCtxRepr
--captureSpeakerAlign --capturePhoneAlign --capturePred --captureCPCCTCalign --captureCPCCTClogScores \
64 | --pathCheckpoint /pio/scratch/1/i283340/MGR/zs/checkpoints/cpcctc_tests2-001 \
65 | --file_extension .flac --n_process_loader 1 --max_size_loaded 40000000 \
66 | --batchSizeGPU 16 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 12 \
67 | --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2
68 | ```
69 |
70 |
71 | ## CPC-CTC data capturing description:
72 |
73 | Some new arguments were added to group_save and group_db in train.py for capturing; the options are also described in their 'help' strings:
74 | - Data capturing is possible in two modes: capture once for an already trained model (use --onlyCapture and --pathCheckpoint), or capture every N epochs during training (don't use --onlyCapture; only specify --captureEachEpochs and the options from the two bullets below)
75 | - The data is captured for a separately specified dataset, called the capture dataset here. It can e.g. simply be the same as the validation dataset. It is specified with --pathCaptureDS, the path to a .txt file listing the sequences in this dataset. Additionally, --captureDSfreq OR --captureDStotNr can be used to sample only part of the sequences specified in the file - a percentage of them with the former, and a total number with the latter (for example, --captureDStotNr 8 captures just 8 audio files of the val dataset)
76 | - --pathCaptureSave tells where to save the data. Data for each epoch (for a single epoch if --onlyCapture) is saved under \/\/\/ with file names {what_is_captured}_batch{batchBegin}-{batchEnd}.pt, one file per captured thing per batch (example: ctx_repr_batch0-15.pt under ./captureRoot/0/ctx_repr/). What to capture is chosen with the --captureConvRepr, --captureCtxRepr, --captureSpeakerAlign, --capturePhoneAlign, --capturePred, --captureCPCCTCalign and --captureCPCCTClogScores args (those are: encoder representations, LSTM-produced contexts, speaker alignments for the audio, phoneme alignments for the audio, CPC predictions, CPC-CTC alignments and CPC-CTC log-scores). Note that capturing speaker and phoneme alignments is necessary for their visualization, as it is later impossible to tell which audio file a particular batch was taken from (audio files are glued together, chunked, and randomly permuted). There is also --captureEverything, added for convenience, which captures everything that is valid for the given run config, but it is always safer to specify exactly what to capture.
For capturing phoneme alignments --path_phone_data needs to be specified (this is the path to a .txt file with phoneme alignments in their format, they provide it somewhere in repo’s main readme) 77 | 78 | IN CASE YOU RUN DATA CAPTURE FOR AN ALREADY TRAINED MODEL, PASS SAME ARGUMENTS FOR THE MODEL TO LOAD CORRECTLY 79 | 80 | Example run that saves data each 2 epochs for 8 audio files of val dataset (with some very small dummy train=val datasets I made): 81 | ``` 82 | python train.py --pathDB /pio/scratch/1/i283340/MGR/zs/ds2 83 | --pathTrain /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt 84 | --pathVal /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt 85 | --pathCaptureDS /pio/scratch/1/i283340/MGR/zs/sometries/ds2part.txt 86 | --captureDStotNr 8 --captureEachEpochs 2 87 | --pathCaptureSave /pio/scratch/1/i283340/MGR/zs/capture/try1 88 | --path_phone_data /pio/scratch/1/i283340/MGR/zs/phones/converted_aligned_phones.txt 89 | --captureConvRepr --captureCtxRepr --captureSpeakerAlign --capturePhoneAlign --capturePred --captureCPCCTCalign --captureCPCCTClogScores 90 | --pathCheckpoint /pio/scratch/1/i283340/MGR/zs/checkpoints/cpcctc_tests2 91 | --file_extension .flac --n_process_loader 2 --max_size_loaded 40000000 92 | --batchSizeGPU 16 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 12 93 | --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2 94 | ``` 95 | 96 | Example with just capturing: 97 | ``` 98 | python train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --onlyCapture \ 99 | --pathCaptureDS /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt \ 100 | --captureDStotNr 100 \ 101 | --pathCaptureSave /pio/gluster/i283340/cpccapture/ls100_cpcctc_match12_pred8/ \ 102 | --captureConvRepr --captureCtxRepr --captureSpeakerAlign --capturePhoneAlign --capturePred --captureCPCCTCalign --captureCPCCTClogScores \ 103 | --path_phone_data /pio/scratch/1/i283340/MGR/zs/phones/converted_aligned_phones.txt \ 104 | --pathCheckpoint /pio/gluster/i283340/modelcpy/ls100_cpcctc_match12_pred8 \ 105 | --file_extension .flac \ 106 | --normMode layerNorm --dropout --rnnMode transformer --n_process_loader 1 --max_size_loaded 4000000000 --nLevelsGRU 2 \ 107 | --batchSizeGPU 32 --limitNegsInBatch 8 --schedulerRamp 10 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 12 108 | ``` -------------------------------------------------------------------------------- /cpc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /cpc/cpc_default_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
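# Overview: this module defines the default CPC configuration.
# set_default_cpc_config() registers the architecture and training
# hyper-parameters (encoder/AR dimensions, CPC-CTC options, optimizer,
# scheduler and sampling settings) on an argparse parser, and
# get_default_cpc_config() parses an empty argument list to return those
# defaults as an argparse.Namespace.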
5 | import argparse 6 | 7 | 8 | def get_default_cpc_config(): 9 | parser = set_default_cpc_config(argparse.ArgumentParser()) 10 | return parser.parse_args([]) 11 | 12 | 13 | def set_default_cpc_config(parser): 14 | # Run parameters 15 | 16 | group = parser.add_argument_group('Architecture configuration', 17 | description="The arguments defining the " 18 | "model's architecture.") 19 | group.add_argument('--hiddenEncoder', type=int, default=256, 20 | help='Hidden dimension of the encoder network.') 21 | group.add_argument('--hiddenGar', type=int, default=256, 22 | help='Hidden dimension of the auto-regressive network') 23 | group.add_argument('--nPredicts', type=int, default=12, 24 | help='Number of steps to predict.') 25 | 26 | group.add_argument('--CPCCTC', action='store_true') 27 | group.add_argument('--CPCCTCNumMatched', type=int, default=16) 28 | group.add_argument('--CPCCTCSkipBeg', type=int, default=0) 29 | group.add_argument('--CPCCTCSkipEnd', type=int, default=0) 30 | group.add_argument('--CPCCTCSelfLoop', action='store_true') 31 | group.add_argument('--CPCCTCLearnBlank', action='store_true') 32 | group.add_argument('--CPCCTCNoNegsMatchWin', action='store_true') 33 | group.add_argument('--CPCCTCMasq', default="") 34 | group.add_argument('--CPCCTCLossTemp', type=float, default=1.0) 35 | group.add_argument('--CPCCTCNormalizeEncs', action='store_true') 36 | group.add_argument('--CPCCTCNormalizePreds', action='store_true') 37 | group.add_argument('--limitNegsInBatch', type=int, default=0, 38 | help='Limit the number of different seqs from whithc neg samples are taken.') 39 | 40 | 41 | group.add_argument('--negativeSamplingExt', type=int, default=128, 42 | help='Number of negative samples to take.') 43 | group.add_argument('--learningRate', type=float, default=2e-4) 44 | group.add_argument('--schedulerStep', type=int, default=-1, 45 | help='Step of the learning rate scheduler: at each ' 46 | 'step the learning rate is divided by 2. Default: ' 47 | 'no scheduler.') 48 | group.add_argument('--schedulerRamp', type=int, default=None, 49 | help='Enable a warm up phase for the learning rate: ' 50 | 'adds a linear ramp of the given size.') 51 | group.add_argument('--beta1', type=float, default=0.9, 52 | help='Value of beta1 for the Adam optimizer') 53 | group.add_argument('--beta2', type=float, default=0.999, 54 | help='Value of beta2 for the Adam optimizer') 55 | group.add_argument('--epsilon', type=float, default=1e-08, 56 | help='Value of epsilon for the Adam optimizer') 57 | group.add_argument('--sizeWindow', type=int, default=20480, 58 | help='Number of frames to consider at each batch.') 59 | group.add_argument('--nEpoch', type=int, default=200, 60 | help='Number of epoch to run') 61 | group.add_argument('--samplingType', type=str, default='samespeaker', 62 | choices=['samespeaker', 'uniform', 63 | 'samesequence', 'sequential'], 64 | help='How to sample the negative examples in the ' 65 | 'CPC loss.') 66 | group.add_argument('--nLevelsPhone', type=int, default=1, 67 | help='(Supervised mode only). 
Number of layers in ' 68 | 'the phone classification network.') 69 | group.add_argument('--cpc_mode', type=str, default=None, 70 | choices=['reverse', 'none'], 71 | help='Some variations on CPC.') 72 | group.add_argument('--encoder_type', type=str, 73 | choices=['cpc', 'mfcc', 'lfb'], 74 | default='cpc', 75 | help='Replace the encoder network by mfcc features ' 76 | 'or learned filter banks') 77 | group.add_argument('--normMode', type=str, default='layerNorm', 78 | choices=['instanceNorm', 'ID', 'layerNorm', 79 | 'batchNorm'], 80 | help="Type of normalization to use in the encoder " 81 | "network (default is layerNorm).") 82 | group.add_argument('--onEncoder', action='store_true', 83 | help="(Supervised mode only) Perform the " 84 | "classification on the encoder's output.") 85 | group.add_argument('--random_seed', type=int, default=None, 86 | help="Set a specific random seed.") 87 | group.add_argument('--speakerEmbedding', type=int, default=0, 88 | help="(Depreciated) Feed the prediction network with " 89 | "speaker embeddings along with the usual sequence.") 90 | group.add_argument('--arMode', default='LSTM', 91 | choices=['GRU', 'LSTM', 'RNN', 'no_ar', 'transformer'], 92 | help="Architecture to use for the auto-regressive " 93 | "network (default is lstm).") 94 | group.add_argument('--nLevelsGRU', type=int, default=1, 95 | help='Number of layers in the autoregressive network.') 96 | group.add_argument('--rnnMode', type=str, default='transformer', 97 | choices=['transformer', 'RNN', 'LSTM', 'linear', 98 | 'ffd', 'conv4', 'conv8', 'conv12'], 99 | help="Architecture to use for the prediction network") 100 | group.add_argument('--dropout', action='store_true', 101 | help="Add a dropout layer at the output of the " 102 | "prediction network.") 103 | group.add_argument('--abspos', action='store_true', 104 | help='If the prediction network is a transformer, ' 105 | 'active to use absolute coordinates.') 106 | 107 | return parser 108 | -------------------------------------------------------------------------------- /cpc/criterion/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | from .criterion import CPCUnsupersivedCriterion, SpeakerCriterion, \ 6 | PhoneCriterion, NoneCriterion, CTCPhoneCriterion, SpeakerDoubleCriterion 7 | -------------------------------------------------------------------------------- /cpc/criterion/clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from .clustering import kMeanCluster, kMeanGPU -------------------------------------------------------------------------------- /cpc/criterion/clustering/clustering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import progressbar 6 | import torch 7 | import torch.nn as nn 8 | import cpc.feature_loader as fl 9 | from .. 
import CTCPhoneCriterion 10 | from os.path import join, exists 11 | from os import remove 12 | from time import time 13 | 14 | 15 | class kMeanCluster(nn.Module): 16 | 17 | def __init__(self, Ck, norm_vec_len=False): 18 | 19 | super(kMeanCluster, self).__init__() 20 | self.register_buffer('Ck', Ck) 21 | self.k = Ck.size(1) 22 | self.norm_vec_len = norm_vec_len 23 | print("-----> kMeanCluster init") 24 | 25 | def forward(self, features): 26 | B, S, D = features.size() 27 | if self.norm_vec_len: 28 | featuresLengths = torch.sqrt((features*features).sum(2)) 29 | features = features / featuresLengths.view(*(featuresLengths.shape), 1) 30 | Ck = self.Ck 31 | 32 | if self.norm_vec_len: 33 | CkLengths = torch.sqrt((Ck*Ck).sum(2)) 34 | Ck = Ck / CkLengths.view(*(CkLengths.shape), 1) 35 | clen = torch.sqrt((Ck*Ck).sum(2)) 36 | features = features.contiguous().view(B*S, 1, -1) 37 | return ((features - Ck)**2).sum(dim=2).view(-1, S, self.k) 38 | 39 | 40 | class kMeanClusterStep(torch.nn.Module): 41 | 42 | def __init__(self, k, D, norm_vec_len=False): 43 | 44 | super(kMeanClusterStep, self).__init__() 45 | self.k = k 46 | self.register_buffer('Ck', torch.zeros(1, k, D)) 47 | self.norm_vec_len = norm_vec_len 48 | 49 | def forward(self, locF): 50 | 51 | if self.norm_vec_len: 52 | locFLengths = torch.sqrt((locF*locF).sum(2)) 53 | locF = locF / locFLengths.view(*(locFLengths.shape), 1) 54 | 55 | index = ((locF - self.Ck)**2).mean(dim=2).min(dim=1)[1] 56 | Ck1 = torch.cat([locF[index == p].sum(dim=0, keepdim=True) 57 | for p in range(self.k)], dim=1) 58 | nItems = torch.cat([(index == p).sum(dim=0, keepdim=True) 59 | for p in range(self.k)], dim=0).view(1, -1) 60 | 61 | return Ck1, nItems 62 | 63 | 64 | def kMeanGPU(dataLoader, featureMaker, k, n_group=1, 65 | MAX_ITER=100, EPSILON=1e-4, 66 | perIterSize=-1, start_clusters=None, 67 | save=False, load=False, save_dir=None, 68 | save_last=5, norm_vec_len=False): 69 | 70 | print(f"Start Kmean clustering with {k} clusters and {n_group} groups...") 71 | 72 | if save or load: 73 | assert save_dir is not None 74 | 75 | if start_clusters is None: 76 | if load and exists(join(save_dir, "checkpoint_last.pt")): 77 | print("Loading from last checkpoint") 78 | state_dict = torch.load(join(save_dir, "checkpoint_last.pt")) 79 | Ck = state_dict["state_dict"]["Ck"] 80 | D = Ck.size(2) 81 | else: 82 | Ck = [] 83 | with torch.no_grad(): 84 | for index, data in enumerate(dataLoader): 85 | cFeature = featureMaker(data) 86 | cFeature = cFeature.contiguous().view(-1, cFeature.size(2)//n_group) 87 | Ck.append(cFeature) 88 | if index > k: 89 | break 90 | Ck = torch.cat(Ck, dim=0) 91 | N, D = Ck.size() 92 | indexes = torch.randperm(N)[:k] 93 | Ck = Ck[indexes].view(k, D) #(1, k, D) 94 | # centers will be normalized from the very beginning and kept like that, later only norm points (AND re-normalize centers after each epoch-iter) 95 | if norm_vec_len: 96 | CkLengths = torch.sqrt((Ck*Ck).sum(1)) 97 | Ck = Ck / CkLengths.view(-1, 1) 98 | Ck = Ck.view(1, k, D) 99 | else: 100 | Ck = start_clusters 101 | D = Ck.size(2) 102 | 103 | if perIterSize < 0: 104 | perIterSize = len(dataLoader) 105 | 106 | clusterStep = kMeanClusterStep(k, D, norm_vec_len=norm_vec_len).cuda() 107 | clusterStep = torch.nn.DataParallel(clusterStep) 108 | clusterStep.module.Ck.copy_(Ck) 109 | 110 | bar = progressbar.ProgressBar(maxval=MAX_ITER) 111 | bar.start() 112 | iter, stored = 0, 0 113 | if load and start_clusters is None and exists(join(save_dir, "checkpoint_last.pt")): 114 | iter = state_dict["iteration"] 115 
| lastDiff = state_dict["lastDiff"] 116 | print(f"Continuing training from iteration {iter}. lastDiff: {lastDiff}") 117 | with torch.no_grad(): 118 | while iter < MAX_ITER: 119 | start_time = time() 120 | Ck1 = torch.zeros(Ck.size()).cuda() 121 | nItemsClusters = torch.zeros(Ck.size(1), 122 | dtype=torch.long).cuda() 123 | for index, data in enumerate(dataLoader): 124 | cFeature = featureMaker(data).contiguous().view(-1, 1, D) 125 | locC, locN = clusterStep(cFeature) 126 | Ck1 += locC.sum(dim=0, keepdim=True) 127 | nItemsClusters += locN.sum(dim=0) 128 | ### If the training set is too big and we want to redude the number of item per iteration 129 | # stored += 1 130 | # if stored >= perIterSize: 131 | # bar.update(iter) 132 | # iter += 1 133 | # stored = 0 134 | # if iter >= MAX_ITER: 135 | # break 136 | 137 | iter += 1 138 | bar.update(iter) 139 | 140 | nItemsClusters = nItemsClusters.float().view(1, -1, 1) + 1e-8 141 | Ck1 /= nItemsClusters 142 | 143 | if norm_vec_len: # need to re-normalize, as mean of things of length 1 has length <= 1 144 | Ck1Lengths = torch.sqrt((Ck1*Ck1).sum(2)) 145 | print("clustNorm", Ck1.shape, Ck1Lengths.shape, Ck1Lengths.view(*(Ck1Lengths.shape), 1).shape) 146 | Ck1 = Ck1 / Ck1Lengths.view(*(Ck1Lengths.shape), 1) 147 | 148 | lastDiff = (clusterStep.module.Ck - Ck1).norm(dim=2).max().item() 149 | nItems = int(nItemsClusters.sum().cpu().detach().item()) 150 | info=f"ITER {iter} done in {time()-start_time:.2f} seconds. nItems: {nItems}. Difference with last checkpoint: {lastDiff}" 151 | print(info) 152 | with open(join(save_dir, "training_logs.txt"), "a") as f: 153 | f.write(info+"\n") 154 | if save: 155 | info=f"Saving last checkpoint to {join(save_dir, 'checkpoint_last.pt')}" 156 | print(info) 157 | with open(join(save_dir, "training_logs.txt"), "a") as f: 158 | f.write(info+"\n") 159 | out_state_dict = {} 160 | 161 | clusterModule = kMeanCluster(Ck1, norm_vec_len=norm_vec_len) 162 | out_state_dict["state_dict"] = clusterModule.state_dict() 163 | out_state_dict["n_clusters"] = Ck1.size(1) 164 | out_state_dict['dim'] = Ck1.size(2) 165 | out_state_dict["iteration"] = iter 166 | out_state_dict["lastDiff"] = lastDiff 167 | torch.save(out_state_dict, join(save_dir, "checkpoint_last.pt")) 168 | torch.save(out_state_dict, join(save_dir, f"checkpoint_{iter}.pt")) 169 | if exists(join(save_dir, f"checkpoint_{iter-save_last}.pt")): 170 | remove(join(save_dir, f"checkpoint_{iter-save_last}.pt")) 171 | if lastDiff < EPSILON: 172 | print( 173 | f"Clustering ended in {iter} iterations out of {MAX_ITER}") 174 | break 175 | clusterStep.module.Ck.copy_(Ck1) 176 | 177 | bar.finish() 178 | 179 | print(f"Clustering ended in {MAX_ITER} iterations out of {MAX_ITER}") 180 | print(f"Last diff {lastDiff}") 181 | if start_clusters is not None: 182 | nEmptyClusters = (nItemsClusters < 1).sum().item() 183 | print(f"{nEmptyClusters} empty clusters out of {k}") 184 | return clusterStep.module.Ck 185 | -------------------------------------------------------------------------------- /cpc/criterion/clustering/clustering_quantization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from random import shuffle 8 | from time import time 9 | import torch 10 | from cpc.dataset import findAllSeqs 11 | from cpc.feature_loader import buildFeature, FeatureModule, loadModel, buildFeature_batch 12 | from cpc.criterion.clustering import kMeanCluster 13 
| #from cpc.criterion.research.clustering import kMeanCluster 14 | 15 | 16 | def readArgs(pathArgs): 17 | print(f"Loading args from {pathArgs}") 18 | with open(pathArgs, 'r') as file: 19 | args = argparse.Namespace(**json.load(file)) 20 | 21 | return args 22 | 23 | 24 | def loadClusterModule(pathCheckpoint, norm_vec_len=False): 25 | print(f"Loading ClusterModule at {pathCheckpoint}") 26 | state_dict = torch.load(pathCheckpoint) 27 | if "state_dict" in state_dict: #kmeans 28 | clusterModule = kMeanCluster(torch.zeros(1, state_dict["n_clusters"], state_dict["dim"]), norm_vec_len) 29 | clusterModule.load_state_dict(state_dict["state_dict"]) 30 | else: #dpmeans 31 | clusterModule = kMeanCluster(state_dict["mu"]) 32 | clusterModule = clusterModule.cuda() 33 | return clusterModule 34 | 35 | def parseArgs(argv): 36 | # Run parameters 37 | parser = argparse.ArgumentParser(description='Quantize audio files using CPC Clustering Module.') 38 | parser.add_argument('pathCheckpoint', type=str, 39 | help='Path to the clustering checkpoint.') 40 | parser.add_argument('pathDB', type=str, 41 | help='Path to the dataset that we want to quantize.') 42 | parser.add_argument('pathOutput', type=str, 43 | help='Path to the output directory.') 44 | parser.add_argument('--pathSeq', type=str, 45 | help='Path to the sequences (file names) to be included used.') 46 | parser.add_argument('--split', type=str, default=None, 47 | help="If you want to divide the dataset in small splits, specify it " 48 | "with idxSplit-numSplits (idxSplit > 0), eg. --split 1-20.") 49 | parser.add_argument('--file_extension', type=str, default=".flac", 50 | help="Extension of the audio files in the dataset (default: .flac).") 51 | parser.add_argument('--max_size_seq', type=int, default=10240, 52 | help='Maximal number of frames to consider ' 53 | 'when computing a batch of features (defaut: 10240).') 54 | parser.add_argument('--batch_size', type=int, default=8, 55 | help='Batch size used to compute features ' 56 | 'when computing each file (defaut: 8).') 57 | parser.add_argument('--strict', type=bool, default=True, 58 | help='If activated, each batch of feature ' 59 | 'will contain exactly max_size_seq frames (defaut: True).') 60 | parser.add_argument('--debug', action='store_true', 61 | help="Load only a very small amount of files for " 62 | "debugging purposes.") 63 | parser.add_argument('--nobatch', action='store_true', 64 | help="Don't use batch implementation of when building features." 65 | "NOTE: This can have better quantized units as we can set " 66 | "model.gAR.keepHidden = True (line 162), but the quantization" 67 | "will be a bit longer.") 68 | parser.add_argument('--recursionLevel', type=int, default=1, 69 | help='Speaker level in pathDB (defaut: 1). 
This is only helpful' 70 | 'when --separate-speaker is activated.') 71 | parser.add_argument('--separate-speaker', action='store_true', 72 | help="Separate each speaker with a different output file.") 73 | 74 | 75 | parser.add_argument('--norm_vec_len', action='store_true', 76 | help="Normalize vector lengths.") 77 | 78 | return parser.parse_args(argv) 79 | 80 | def main(argv): 81 | # Args parser 82 | args = parseArgs(argv) 83 | 84 | print("=============================================================") 85 | print(f"Quantizing data from {args.pathDB}") 86 | print("=============================================================") 87 | 88 | # Check if directory exists 89 | if not os.path.exists(args.pathOutput): 90 | print("") 91 | print(f"Creating the output directory at {args.pathOutput}") 92 | Path(args.pathOutput).mkdir(parents=True, exist_ok=True) 93 | 94 | # Get splits 95 | if args.split: 96 | assert len(args.split.split("-"))==2 and int(args.split.split("-")[1]) >= int(args.split.split("-")[0]) >= 1, \ 97 | "SPLIT must be under the form idxSplit-numSplits (numSplits >= idxSplit >= 1), eg. --split 1-20" 98 | idx_split, num_splits = args.split.split("-") 99 | idx_split = int(idx_split) 100 | num_splits = int(num_splits) 101 | 102 | # Find all sequences 103 | print("") 104 | print(f"Looking for all {args.file_extension} files in {args.pathDB} with speakerLevel {args.recursionLevel}") 105 | seqNames, speakers = findAllSeqs(args.pathDB, 106 | speaker_level=args.recursionLevel, 107 | extension=args.file_extension, 108 | loadCache=True) 109 | 110 | if args.pathSeq: 111 | with open(args.pathSeq, 'r') as f: 112 | seqs = set([x.strip() for x in f]) 113 | 114 | filtered = [] 115 | for s in seqNames: 116 | if s[1].split('/')[-1].split('.')[0] in seqs: 117 | filtered.append(s) 118 | seqNames = filtered 119 | 120 | print(f"Done! Found {len(seqNames)} files and {len(speakers)} speakers!") 121 | if args.separate_speaker: 122 | seqNames_by_speaker = {} 123 | for seq in seqNames: 124 | speaker = seq[1].split("/")[args.recursionLevel-1] 125 | if speaker not in seqNames_by_speaker: 126 | seqNames_by_speaker[speaker] = [] 127 | seqNames_by_speaker[speaker].append(seq) 128 | 129 | # Check if output file exists 130 | if not args.split: 131 | nameOutput = "quantized_outputs.txt" 132 | else: 133 | nameOutput = f"quantized_outputs_split_{idx_split}-{num_splits}.txt" 134 | if args.separate_speaker is False: 135 | outputFile = os.path.join(args.pathOutput, nameOutput) 136 | assert not os.path.exists(outputFile), \ 137 | f"Output file {outputFile} already exists !!!" 
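# Note on the split logic below: --split i-n selects the i-th of n contiguous
# chunks of seqNames (start index = len(seqNames) // n * (i-1)); the last
# split runs to the end of the list so no remainder files are dropped.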
138 | 139 | # Get splits 140 | if args.split: 141 | startIdx = len(seqNames) // num_splits * (idx_split-1) 142 | if idx_split == num_splits: 143 | endIdx = len(seqNames) 144 | else: 145 | endIdx = min(len(seqNames) // num_splits * idx_split, len(seqNames)) 146 | seqNames = seqNames[startIdx:endIdx] 147 | print("") 148 | print(f"Quantizing split {idx_split} out of {num_splits} splits, with {len(seqNames)} files (idx in range({startIdx}, {endIdx})).") 149 | 150 | # Debug mode 151 | if args.debug: 152 | nsamples=20 153 | print("") 154 | print(f"Debug mode activated, only load {nsamples} samples!") 155 | # shuffle(seqNames) 156 | seqNames = seqNames[:nsamples] 157 | 158 | # Load Clustering args 159 | assert args.pathCheckpoint[-3:] == ".pt" 160 | if os.path.exists(args.pathCheckpoint[:-3] + "_args.json"): 161 | pathConfig = args.pathCheckpoint[:-3] + "_args.json" 162 | elif os.path.exists(os.path.join(os.path.dirname(args.pathCheckpoint), "checkpoint_args.json")): 163 | pathConfig = os.path.join(os.path.dirname(args.pathCheckpoint), "checkpoint_args.json") 164 | else: 165 | assert False, \ 166 | f"Args file not found in the directory {os.path.dirname(args.pathCheckpoint)}" 167 | clustering_args = readArgs(pathConfig) 168 | print("") 169 | print(f"Clutering args:\n{json.dumps(vars(clustering_args), indent=4, sort_keys=True)}") 170 | print('-' * 50) 171 | 172 | # Load CluterModule 173 | clusterModule = loadClusterModule(args.pathCheckpoint, norm_vec_len=args.norm_vec_len) 174 | clusterModule.cuda() 175 | 176 | # Load FeatureMaker 177 | print("") 178 | print("Loading CPC FeatureMaker") 179 | if 'level_gru' in vars(clustering_args) and clustering_args.level_gru is not None: 180 | updateConfig = argparse.Namespace(nLevelsGRU=clustering_args.level_gru) 181 | else: 182 | updateConfig = None 183 | model = loadModel([clustering_args.pathCheckpoint], updateConfig=updateConfig)[0] 184 | ## If we don't apply batch implementation, we can set LSTM model to keep hidden units 185 | ## making the quality of the quantized units better 186 | if args.nobatch: 187 | model.gAR.keepHidden = True 188 | featureMaker = FeatureModule(model, clustering_args.encoder_layer) 189 | if clustering_args.dimReduction is not None: 190 | dimRed = loadDimReduction(clustering_args.dimReduction, clustering_args.centroidLimits) 191 | featureMaker = torch.nn.Sequential(featureMaker, dimRed) 192 | if not clustering_args.train_mode: 193 | featureMaker.eval() 194 | featureMaker.cuda() 195 | def feature_function(x): 196 | if args.nobatch is False: 197 | res0 = buildFeature_batch(featureMaker, x, 198 | seqNorm=False, 199 | strict=args.strict, 200 | maxSizeSeq=args.max_size_seq, 201 | batch_size=args.batch_size) 202 | if args.norm_vec_len: 203 | # [!] we actually used CPC_audio/scripts/quantize_audio.py for that in the end 204 | res0Lengths = torch.sqrt((res0*res0).sum(2)) 205 | res0 = res0 / res0Lengths.view(*(res0Lengths.shape), 1) 206 | return res0 207 | else: 208 | res0 = buildFeature(featureMaker, x, 209 | seqNorm=False, 210 | strict=args.strict) 211 | if args.norm_vec_len: 212 | # [!] 
we actually used CPC_audio/scripts/quantize_audio.py for that in the end 213 | res0Lengths = torch.sqrt((res0*res0).sum(2)) 214 | res0 = res0 / res0Lengths.view(*(res0Lengths.shape), 1) 215 | return res0 216 | print("CPC FeatureMaker loaded!") 217 | 218 | # Quantization of files 219 | print("") 220 | print(f"Quantizing audio files...") 221 | seqQuantLines = [] 222 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 223 | bar.start() 224 | start_time = time() 225 | for index, vals in enumerate(seqNames): 226 | bar.update(index) 227 | 228 | file_path = vals[1] 229 | file_path = os.path.join(args.pathDB, file_path) 230 | 231 | # Get features & quantizing 232 | cFeatures = feature_function(file_path).cuda() 233 | 234 | nGroups = cFeatures.size(-1)//clusterModule.Ck.size(-1) 235 | 236 | cFeatures = cFeatures.view(1, -1, clusterModule.Ck.size(-1)) 237 | 238 | if len(vals) > 2 and int(vals[-1]) > 9400000: # Librilight, to avoid OOM 239 | clusterModule = clusterModule.cpu() 240 | cFeatures = cFeatures.cpu() 241 | qFeatures = torch.argmin(clusterModule(cFeatures), dim=-1) 242 | clusterModule = clusterModule.cuda() 243 | else: 244 | qFeatures = torch.argmin(clusterModule(cFeatures), dim=-1) 245 | qFeatures = qFeatures[0].detach().cpu().numpy() 246 | 247 | # Transform to quantized line 248 | quantLine = ",".join(["-".join([str(i) for i in item]) for item in qFeatures.reshape(-1, nGroups)]) 249 | seqQuantLines.append(quantLine) 250 | 251 | bar.finish() 252 | print(f"...done {len(seqQuantLines)} files in {time()-start_time} seconds.") 253 | 254 | # Saving outputs 255 | print("") 256 | print(f"Saving outputs to {outputFile}") 257 | outLines = [] 258 | for vals, quantln in zip(seqNames, seqQuantLines): 259 | file_path = vals[1] 260 | file_name = os.path.splitext(os.path.basename(file_path))[0] 261 | outLines.append("\t".join([file_name, quantln])) 262 | with open(outputFile, "w") as f: 263 | f.write("\n".join(outLines)) 264 | 265 | if __name__ == "__main__": 266 | args = sys.argv[1:] 267 | main(args) 268 | 269 | -------------------------------------------------------------------------------- /cpc/criterion/clustering/clustering_script.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
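# Overview: this script fits a k-means (optionally DP-means) clustering module
# on features extracted with a pre-trained CPC checkpoint and saves the
# resulting cluster centres as a checkpoint, together with an _args.json file
# holding the clustering configuration.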
5 | import torch 6 | import numpy as np 7 | import time 8 | import argparse 9 | import sys 10 | import os 11 | import json 12 | from random import shuffle 13 | from cpc.criterion.clustering import kMeanCluster, kMeanGPU 14 | from pathlib import Path 15 | 16 | 17 | def getQuantile(sortedData, percent): 18 | return sortedData[int(percent * len(sortedData))] 19 | 20 | 21 | def parseArgs(argv): 22 | # Run parameters 23 | parser = argparse.ArgumentParser(description='Clustering module using kmeans or dpmeans.') 24 | parser.add_argument('pathCheckpoint', type=str, 25 | help="Path to the checkpoint of CPC module.") 26 | parser.add_argument('pathOutput', type=str, 27 | help="Path to the output clustering checkpoint.") 28 | parser.add_argument( 29 | '--pathDB', type=str, 30 | default="/datasets01/LibriSpeech/022219/train-clean-100/") 31 | parser.add_argument('-k', '--nClusters', type=int, default=50, 32 | help="Number of clusters for kmeans algorithm (default: 50).") 33 | parser.add_argument('-g', '--nGroups', type=int, default=1, 34 | help="Number of groups for kmeans algorithm (default: 1).") 35 | parser.add_argument('-n', '--MAX_ITER', type=int, default=100, 36 | help="Number of iterations (default: 150).") 37 | parser.add_argument('--recursionLevel', type=int, default=2, 38 | help="The speaker recursionLevel in the training dataset (default: 2).") 39 | parser.add_argument('--extension', type=str, default='.flac', 40 | help="The audio file extension (default: .flac).") 41 | parser.add_argument('--seqList', type=str, default=None, 42 | help="Specific the training sequence list (default: None).") 43 | parser.add_argument('--sizeWindow', type=int, default=10240, 44 | help="The size of the window when loading audio data (default: 10240).") 45 | parser.add_argument('--debug', action='store_true', 46 | help='Debug mode, only use a small number of training data.') 47 | parser.add_argument('--encoder_layer', action='store_true', 48 | help='Whether to use the output of the encoder for the clustering.') 49 | parser.add_argument('--level_gru', type=int, default=None, 50 | help='Specify the LSTM hidden level to take the representation (default: None).') 51 | parser.add_argument('--batchSizeGPU', type=int, default=50, 52 | help='Batch size of each GPU (default: 50).') 53 | parser.add_argument('--DPMean', action='store_true', 54 | help='Activate DPMeans training instead of Kmeans.') 55 | parser.add_argument('-l', '--DPLambda', type=float, default=11, 56 | help='Lambda parameter of DPMeans algo (default: 11).') 57 | parser.add_argument('--perIterSize', type=int, default=-1, 58 | help='(Depreciated) Number of items per iteration (default: -1).') 59 | parser.add_argument('--train_mode', action='store_true', 60 | help='Activate training CPC module too.') 61 | parser.add_argument('--dimReduction', type=str, default=None, 62 | help='Dimentionality reduction (default: None)') 63 | parser.add_argument('--centroidLimits', type=int, nargs=2, default=None, 64 | help='centroidLimits when using dimentionality reduction (default: None)') 65 | parser.add_argument('--getDistanceEstimation', action='store_true', 66 | help='Get distance estimation') 67 | parser.add_argument('--save', action='store_true', 68 | help='Save the intermediate checkpoints. 
The checkpoints will' 69 | 'be saved in the same directory as the output.') 70 | parser.add_argument('--load', action='store_true', 71 | help='Load the last checkpoint from the same directory as the output.') 72 | parser.add_argument('--save-last', type=int, default=5, 73 | help='Number of last checkpoints to be saved (default: 5).') 74 | 75 | parser.add_argument('--n_process_loader', type=int, default=8, 76 | help='Number of processes to call to load the ' 77 | 'dataset') 78 | parser.add_argument('--max_size_loaded', type=int, default=4000000000, 79 | help='Maximal amount of data (in byte) a dataset ' 80 | 'can hold in memory at any given time') 81 | 82 | parser.add_argument('--nullspace', action='store_true', 83 | help="Additionally load nullspace") 84 | 85 | parser.add_argument('--norm_vec_len', action='store_true', 86 | help="Normalize vector lengths.") 87 | 88 | return parser.parse_args(argv) 89 | 90 | # some example with nullspace and normalization making dists cosine: 91 | # python cpc/criterion/clustering/clustering_script.py --pathDB /pio/data/zerospeech2021/LibriSpeech/dev-clean \ 92 | # --recursionLevel 1 --nClusters 50 --MAX_ITER 10 --level_gru 2 --save --load --batchSizeGPU 200 --max_size_loaded 40000000 \ 93 | # --n_process_loader 2 --nullspace --norm_vec_len ../nspChp/64ok/checkpoint_9.pt ../nspChp/tryNew64-11/try11chp.pt 94 | 95 | 96 | if __name__ == "__main__": 97 | torch.cuda.empty_cache() 98 | 99 | import os 100 | from cpc.feature_loader import loadModel, FeatureModule 101 | from cpc.dataset import findAllSeqs, filterSeqs, AudioBatchData 102 | 103 | args = parseArgs(sys.argv[1:]) 104 | # Export absolute paths for later use 105 | args.pathCheckpoint = os.path.abspath(args.pathCheckpoint) 106 | args.pathOutput = os.path.abspath(args.pathOutput) 107 | args.pathDB = os.path.abspath(args.pathDB) 108 | 109 | if not args.load: 110 | assert os.path.exists(args.pathOutput) is False, \ 111 | f"The output file {args.pathOutput} already exists, please check the option --load !" 112 | assert os.path.exists(os.path.join(os.path.dirname(args.pathOutput), "checkpoint_last.pt")) is False, \ 113 | f"Found last_checkpoint.pt in the output directory, please check the option --load !" 
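# The remainder of the script: discover the audio sequences, build the
# AudioBatchData loader, load the CPC checkpoint as a FeatureModule,
# run kMeanGPU for up to MAX_ITER iterations, and save the resulting
# kMeanCluster state_dict to pathOutput along with an _args.json config.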
114 | 115 | print(args) 116 | seqNames, speakers = findAllSeqs(args.pathDB, 117 | speaker_level=args.recursionLevel, 118 | extension=args.extension, 119 | loadCache=True) 120 | 121 | if args.seqList is not None: 122 | seqNames = filterSeqs(args.seqList, seqNames) 123 | if args.debug: 124 | nsamples=1000 125 | print(f"Debug mode activated, get only {nsamples} samples!") 126 | shuffle(seqNames) 127 | seqNames = seqNames[:nsamples] 128 | if args.getDistanceEstimation: 129 | shuffle(seqNames) 130 | seqNames = seqNames[:5000] 131 | 132 | print("") 133 | print(f'Loading audio data at {args.pathDB}') 134 | start_time = time.time() 135 | dataset = AudioBatchData(args.pathDB, 136 | args.sizeWindow, 137 | seqNames, 138 | None, 139 | len(speakers), 140 | nProcessLoader=args.n_process_loader, 141 | MAX_SIZE_LOADED=args.max_size_loaded) 142 | print(f"Dataset loaded in {time.time()-start_time} seconds !") 143 | print("") 144 | 145 | nGPUs = torch.cuda.device_count() 146 | batchSize = args.batchSizeGPU * nGPUs 147 | trainLoader = dataset.getDataLoader(batchSize, "uniform", 148 | False, numWorkers=0) 149 | print(f"Length of dataLoader: {len(trainLoader)}") 150 | print("") 151 | 152 | 153 | if args.level_gru is None: 154 | updateConfig = None 155 | else: 156 | updateConfig = argparse.Namespace(nLevelsGRU=args.level_gru) 157 | 158 | model = loadModel([args.pathCheckpoint], updateConfig=updateConfig, load_nullspace=args.nullspace)[0] 159 | #model = loadModel([args.pathCheckpoint])[0]#, updateConfig=updateConfig)[0] 160 | 161 | featureMaker = FeatureModule(model, args.encoder_layer) 162 | print("Checkpoint loaded!") 163 | print("") 164 | 165 | if not args.train_mode: 166 | featureMaker.eval() 167 | featureMaker.cuda() 168 | 169 | # Check if dir exists 170 | if not os.path.exists(os.path.dirname(args.pathOutput)) and os.path.dirname(args.pathOutput): 171 | Path(os.path.dirname(args.pathOutput)).mkdir(parents=True, exist_ok=True) 172 | 173 | pathConfig = f"{os.path.splitext(args.pathOutput)[0]}_args.json" 174 | with open(pathConfig, 'w') as file: 175 | json.dump(vars(args), file, indent=2) 176 | 177 | out_state_dict = {} 178 | print("Starting the clustering...") 179 | start_time = time.time() 180 | clusters = kMeanGPU(trainLoader, featureMaker, args.nClusters, args.nGroups, 181 | perIterSize=args.perIterSize, 182 | MAX_ITER=args.MAX_ITER, 183 | save=args.save, load=args.load, 184 | save_dir=os.path.dirname(args.pathOutput), 185 | save_last=args.save_last, 186 | norm_vec_len=args.norm_vec_len).cpu() 187 | 188 | 189 | print(f'Ran clustering ' 190 | f'in {time.time() - start_time:.2f} seconds') 191 | 192 | clusterModule = kMeanCluster(clusters, norm_vec_len=args.norm_vec_len) 193 | 194 | out_state_dict["state_dict"] = clusterModule.state_dict() 195 | out_state_dict["encoder_layer"] = args.encoder_layer 196 | out_state_dict["n_clusters"] = args.nClusters 197 | out_state_dict['dim'] = clusters.size(2) 198 | torch.save(out_state_dict, args.pathOutput) 199 | with open(pathConfig, 'w') as file: 200 | json.dump(vars(args), file, indent=2) 201 | -------------------------------------------------------------------------------- /cpc/criterion/custom_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 6 | import math 7 | 8 | import torch.nn as nn 9 | 10 | from numpy import prod 11 | 12 | 13 | class NormalizationLayer(nn.Module): 14 | 15 | def __init__(self): 16 | super(NormalizationLayer, self).__init__() 17 | 18 | def forward(self, x, epsilon=1e-8): 19 | return x * (((x**2).mean(dim=1, keepdim=True) + epsilon).rsqrt()) 20 | 21 | 22 | def Upscale2d(x, factor=2): 23 | assert isinstance(factor, int) and factor >= 1 24 | if factor == 1: 25 | return x 26 | s = x.size() 27 | x = x.view(-1, s[1], s[2], 1, s[3], 1) 28 | x = x.expand(-1, s[1], s[2], factor, s[3], factor) 29 | x = x.contiguous().view(-1, s[1], s[2] * factor, s[3] * factor) 30 | return x 31 | 32 | 33 | def getLayerNormalizationFactor(x): 34 | r""" 35 | Get He's constant for the given layer 36 | https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf 37 | """ 38 | size = x.weight.size() 39 | fan_in = prod(size[1:]) 40 | 41 | return math.sqrt(2.0 / fan_in) 42 | 43 | 44 | class ConstrainedLayer(nn.Module): 45 | r""" 46 | A handy refactor that allows the user to: 47 | - initialize one layer's bias to zero 48 | - apply He's initialization at runtime 49 | """ 50 | 51 | def __init__(self, 52 | module, 53 | equalized=True, 54 | lrMul=1.0, 55 | initBiasToZero=True): 56 | r""" 57 | equalized (bool): if true, the layer's weight should evolve within 58 | the range (-1, 1) 59 | initBiasToZero (bool): if true, bias will be initialized to zero 60 | """ 61 | 62 | super(ConstrainedLayer, self).__init__() 63 | 64 | self.module = module 65 | self.equalized = equalized 66 | 67 | if initBiasToZero and module.bias is not None: 68 | self.module.bias.data.fill_(0) 69 | if self.equalized: 70 | self.module.weight.data.normal_(0, 1) 71 | self.weight = getLayerNormalizationFactor(self.module) * lrMul 72 | 73 | def forward(self, x): 74 | 75 | x = self.module(x) 76 | if self.equalized: 77 | x *= self.weight 78 | return x 79 | 80 | 81 | class EqualizedConv1d(ConstrainedLayer): 82 | 83 | def __init__(self, 84 | nChannelsPrevious, 85 | nChannels, 86 | kernelSize, 87 | padding=0, 88 | bias=True, 89 | stride=1, 90 | **kwargs): 91 | r""" 92 | A nn.Conv1d module with specific constraints 93 | Args: 94 | nChannelsPrevious (int): number of channels in the previous layer 95 | nChannels (int): number of channels of the current layer 96 | kernelSize (int): size of the convolutional kernel 97 | padding (int): convolution's padding 98 | bias (bool): with bias ? 99 | """ 100 | 101 | ConstrainedLayer.__init__(self, 102 | nn.Conv1d(nChannelsPrevious, nChannels, 103 | kernelSize, padding=padding, 104 | bias=bias, stride=stride), 105 | **kwargs) 106 | 107 | 108 | class EqualizedConv2d(ConstrainedLayer): 109 | 110 | def __init__(self, 111 | nChannelsPrevious, 112 | nChannels, 113 | kernelSize, 114 | padding=0, 115 | bias=True, 116 | **kwargs): 117 | r""" 118 | A nn.Conv2d module with specific constraints 119 | Args: 120 | nChannelsPrevious (int): number of channels in the previous layer 121 | nChannels (int): number of channels of the current layer 122 | kernelSize (int): size of the convolutional kernel 123 | padding (int): convolution's padding 124 | bias (bool): with bias ? 
125 | """ 126 | 127 | ConstrainedLayer.__init__(self, 128 | nn.Conv2d(nChannelsPrevious, nChannels, 129 | kernelSize, padding=padding, 130 | bias=bias), 131 | **kwargs) 132 | 133 | 134 | class EqualizedLinear(ConstrainedLayer): 135 | 136 | def __init__(self, 137 | nChannelsPrevious, 138 | nChannels, 139 | bias=True, 140 | **kwargs): 141 | r""" 142 | A nn.Linear module with specific constraints 143 | Args: 144 | nChannelsPrevious (int): number of channels in the previous layer 145 | nChannels (int): number of channels of the current layer 146 | bias (bool): with bias ? 147 | """ 148 | 149 | ConstrainedLayer.__init__(self, 150 | nn.Linear(nChannelsPrevious, nChannels, 151 | bias=bias), **kwargs) 152 | -------------------------------------------------------------------------------- /cpc/criterion/seq_alignment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import progressbar 6 | import torch 7 | from multiprocessing import Lock, Manager, Process 8 | from copy import deepcopy 9 | 10 | 11 | def beam_search(score_preds, nKeep, blankLabel): 12 | 13 | T, P = score_preds.shape 14 | beams = set(['']) 15 | pb_t_1 = {"": 1} 16 | pnb_t_1 = {"": 0} 17 | 18 | def getLastNumber(b): 19 | return int(b.split(',')[-1]) 20 | 21 | for t in range(T): 22 | 23 | nextBeams = set() 24 | pb_t = {} 25 | pnb_t = {} 26 | for i_beam, b in enumerate(beams): 27 | if b not in pb_t: 28 | pb_t[b] = 0 29 | pnb_t[b] = 0 30 | 31 | if len(b) > 0: 32 | pnb_t[b] += pnb_t_1[b] * score_preds[t, getLastNumber(b)] 33 | pb_t[b] = (pnb_t_1[b] + pb_t_1[b]) * score_preds[t, blankLabel] 34 | nextBeams.add(b) 35 | 36 | for c in range(P): 37 | if c == blankLabel: 38 | continue 39 | 40 | b_ = b + "," + str(c) 41 | if b_ not in pb_t: 42 | pb_t[b_] = 0 43 | pnb_t[b_] = 0 44 | 45 | if b != "" and getLastNumber(b) == c: 46 | pnb_t[b_] += pb_t_1[b] * score_preds[t, c] 47 | else: 48 | pnb_t[b_] += (pb_t_1[b] + pnb_t_1[b]) * score_preds[t, c] 49 | nextBeams.add(b_) 50 | 51 | allPreds = [(pb_t[b] + pnb_t[b], b) for b in nextBeams] 52 | allPreds.sort(reverse=True) 53 | 54 | beams = [x[1] for x in allPreds[:nKeep]] 55 | pb_t_1 = deepcopy(pb_t) 56 | pnb_t_1 = deepcopy(pnb_t) 57 | 58 | output = [] 59 | for score, x in allPreds[:nKeep]: 60 | output.append((score, [int(y) for y in x.split(',') if len(y) > 0])) 61 | return output 62 | 63 | 64 | def collapseLabelChain(inputLabels): 65 | 66 | # Shape N,T 67 | N, T = inputLabels.size() 68 | outSizes = torch.zeros(N, device=inputLabels.device, dtype=torch.int64) 69 | output = [] 70 | for l in range(N): 71 | status = inputLabels[l, :-1] - inputLabels[l, 1:] 72 | status = torch.cat([torch.ones(1, device=status.device, 73 | dtype=status.dtype), 74 | status], dim=0) 75 | outSizes[l] = (status != 0).sum() 76 | output.append(inputLabels[l][status != 0]) 77 | maxSize = int(outSizes.max().item()) 78 | paddedOutput = torch.zeros(N, maxSize, 79 | device=inputLabels.device, 80 | dtype=torch.int64) 81 | 82 | for l in range(N): 83 | S = int(outSizes[l]) 84 | paddedOutput[l, :S] = output[l] 85 | 86 | return paddedOutput, outSizes 87 | 88 | 89 | def NeedlemanWunschAlignScore(seq1, seq2, d, m, r, normalize=True): 90 | 91 | N1, N2 = len(seq1), len(seq2) 92 | 93 | # Fill up the errors 94 | tmpRes_ = [[None for x in range(N2 + 1)] for y in range(N1 + 1)] 95 | for i in range(N1 + 1): 96 | 
tmpRes_[i][0] = i * d 97 | for j in range(N2 + 1): 98 | tmpRes_[0][j] = j * d 99 | 100 | for i in range(N1): 101 | for j in range(N2): 102 | 103 | match = r if seq1[i] == seq2[j] else m 104 | v1 = tmpRes_[i][j] + match 105 | v2 = tmpRes_[i + 1][j] + d 106 | v3 = tmpRes_[i][j + 1] + d 107 | tmpRes_[i + 1][j + 1] = max(v1, max(v2, v3)) 108 | 109 | i = j = 0 110 | res = -tmpRes_[N1][N2] 111 | if normalize: 112 | res /= float(N1) 113 | return res 114 | 115 | 116 | def get_seq_PER(seqLabels, detectedLabels): 117 | return NeedlemanWunschAlignScore(seqLabels, detectedLabels, -1, -1, 0, 118 | normalize=True) 119 | 120 | 121 | def getPER(dataLoader, featureMaker, blankLabel): 122 | 123 | bar = progressbar.ProgressBar(len(dataLoader)) 124 | bar.start() 125 | 126 | out = 0 127 | n_items = 0 128 | n_keep_beam_search = 100 129 | for index, data in enumerate(dataLoader): 130 | 131 | bar.update(index) 132 | with torch.no_grad(): 133 | output = featureMaker(data).cpu().numpy() 134 | labels = data[1] 135 | labels, targetSize = collapseLabelChain(labels) 136 | lock = Lock() 137 | 138 | def per(rank, outScore): 139 | S = int(targetSize[rank]) 140 | seqLabels = labels[rank, :S] 141 | preds = beam_search(output[rank], 142 | n_keep_beam_search, blankLabel)[0][1] 143 | value = get_seq_PER(seqLabels, preds) 144 | with lock: 145 | outScore.value += value 146 | 147 | manager = Manager() 148 | outScore = manager.Value('f', 0.) 149 | 150 | N, S, D = output.shape 151 | processes = [] 152 | for rank in range(N): 153 | p = Process( 154 | target=per, args=(rank, outScore)) 155 | p.start() 156 | processes.append(p) 157 | for p in processes: 158 | p.join() 159 | 160 | out += outScore.value 161 | n_items += N 162 | 163 | bar.finish() 164 | return (out / n_items) 165 | -------------------------------------------------------------------------------- /cpc/eval/ABX.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | import argparse 6 | import sys 7 | import torch 8 | import json 9 | from pathlib import Path 10 | import ABX.abx_group_computation as abx_g 11 | import ABX.abx_iterators as abx_it 12 | from cpc.dataset import findAllSeqs 13 | from cpc.feature_loader import buildFeature, FeatureModule, loadModel 14 | 15 | 16 | def reduce_sparse_data(quotient, divisor): 17 | return quotient / (1e-08 * (divisor == 0) + divisor) 18 | 19 | 20 | def ABX(feature_function, 21 | path_item_file, 22 | seq_list, 23 | distance_mode, 24 | step_feature, 25 | modes, 26 | seq_norm=True, 27 | cuda=False, 28 | max_x_across=5, 29 | max_size_group=30): 30 | 31 | # ABX dataset 32 | ABXDataset = abx_it.ABXFeatureLoader(path_item_file, seq_list, 33 | feature_function, step_feature, True) 34 | 35 | if cuda: 36 | ABXDataset.cuda() 37 | 38 | # Distance function 39 | distance_function = abx_g.get_distance_function_from_name(distance_mode) 40 | 41 | # Output 42 | scores = {} 43 | 44 | # ABX within 45 | if 'within' in modes: 46 | print("Computing ABX within speakers...") 47 | ABXIterator = ABXDataset.get_iterator('within', max_size_group) 48 | group_confusion = abx_g.get_abx_scores_dtw_on_group(ABXIterator, 49 | distance_function, 50 | ABXIterator.symmetric) 51 | n_data = group_confusion._values().size(0) 52 | index_ = torch.sparse.LongTensor(group_confusion._indices(), 53 | torch.ones((n_data), 54 | dtype=torch.float), 55 | group_confusion.size()) 56 | divisor_context = torch.sparse.sum(index_, dim=3).to_dense() 57 | group_confusion = torch.sparse.sum(group_confusion, dim=3).to_dense() 58 | group_confusion = reduce_sparse_data(group_confusion, divisor_context) 59 | S, p1, p2 = group_confusion.size() 60 | 61 | index_speaker = divisor_context > 0 62 | divisor_speaker = index_speaker.sum(dim=0) 63 | phone_confusion = reduce_sparse_data(group_confusion.sum(dim=0), 64 | divisor_speaker) 65 | 66 | scores['within'] = (phone_confusion.sum() / 67 | (divisor_speaker > 0).sum()).item() 68 | print(f"...done. ABX within : {scores['within']}") 69 | 70 | # ABX across 71 | if 'across' in modes: 72 | print("Computing ABX across speakers...") 73 | ABXIterator = ABXDataset.get_iterator('across', max_size_group) 74 | ABXIterator.max_x = max_x_across 75 | group_confusion = abx_g.get_abx_scores_dtw_on_group(ABXIterator, 76 | distance_function, 77 | ABXIterator.symmetric) 78 | n_data = group_confusion._values().size(0) 79 | index_ = torch.sparse.LongTensor(group_confusion._indices(), 80 | torch.ones((n_data), 81 | dtype=torch.float), 82 | group_confusion.size()) 83 | divisor_context = torch.sparse.sum(index_, dim=[3, 4]).to_dense() 84 | group_confusion = torch.sparse.sum( 85 | group_confusion, dim=[3, 4]).to_dense() 86 | group_confusion = reduce_sparse_data(group_confusion, divisor_context) 87 | S, p1, p2 = group_confusion.size() 88 | 89 | index_speaker = divisor_context > 0 90 | divisor_speaker = index_speaker.sum(dim=0) 91 | phone_confusion = reduce_sparse_data(group_confusion.sum(dim=0), 92 | divisor_speaker) 93 | scores['across'] = (phone_confusion.sum() / 94 | (divisor_speaker > 0).sum()).item() 95 | print(f"...done. 
ABX across : {scores['across']}") 96 | 97 | return scores 98 | 99 | 100 | def update_base_parser(parser): 101 | parser.add_argument('--debug', action='store_true') 102 | parser.add_argument('--feature_size', type=float, default=0.01, 103 | help="Size (in s) of one feature") 104 | parser.add_argument('--cuda', action='store_true', 105 | help="Use the GPU to compute distances") 106 | parser.add_argument('--mode', type=str, default='all', 107 | choices=['all', 'within', 'across'], 108 | help="Type of ABX score to compute") 109 | parser.add_argument("--max_size_group", type=int, default=10, 110 | help="Max size of a group while computing the " 111 | "ABX score") 112 | parser.add_argument("--max_x_across", type=int, default=5, 113 | help="When computing the ABX across score, maximum " 114 | "number of speakers X to sample per couple A,B") 115 | parser.add_argument("--out", type=str, default=None, 116 | help="Path where the results should be saved") 117 | 118 | 119 | def parse_args(argv): 120 | 121 | base_parser = argparse.ArgumentParser(description='ABX metric') 122 | 123 | subparsers = base_parser.add_subparsers(dest='load') 124 | parser_checkpoint = subparsers.add_parser('from_checkpoint') 125 | update_base_parser(parser_checkpoint) 126 | parser_checkpoint.add_argument('path_checkpoint', type=str, 127 | help="Path to the model's checkpoint") 128 | parser_checkpoint.add_argument('path_item_file', type=str, 129 | help="Path to the ABX .item file containing " 130 | "the triplet labels") 131 | parser_checkpoint.add_argument('path_dataset', type=str, 132 | help="Path to the dataset") 133 | parser_checkpoint.add_argument('--seq_norm', action='store_true', 134 | help='If activated, normalize each batch ' 135 | 'of feature across the time channel before ' 136 | 'computing ABX.') 137 | parser_checkpoint.add_argument('--max_size_seq', default=64000, type=int, 138 | help='Maximal number of frames to consider ' 139 | 'when computing a batch of features.') 140 | parser_checkpoint.add_argument('--strict', action='store_true', 141 | help='If activated, each batch of feature ' 142 | 'will contain exactly max_size_seq frames.') 143 | parser_checkpoint.add_argument('--file_extension', type=str, 144 | default='.wav', 145 | help='Extension of each audio file in the ' 146 | 'dataset.') 147 | parser_checkpoint.add_argument('--get_encoded', action='store_true', 148 | help='If activated, compute the ABX score ' 149 | 'using the output of the encoder network.') 150 | 151 | parser_db = subparsers.add_parser('from_pre_computed') 152 | update_base_parser(parser_db) 153 | parser_db.add_argument('path_features', type=str, 154 | help="Path to pre-computed torch features (.pt)") 155 | parser_db.add_argument('--file_extension', type=str, 156 | default='.pt', help='Extension of each feature ' 157 | 'in the dataset') 158 | 159 | # multi-gpu / multi-node 160 | return base_parser.parse_args(argv) 161 | 162 | 163 | def main(argv): 164 | 165 | args = parse_args(argv) 166 | 167 | if args.load == 'from_checkpoint': 168 | # Checkpoint 169 | model = loadModel([args.path_checkpoint])[0] 170 | model.gAR.keepHidden = True 171 | # Feature maker 172 | feature_maker = FeatureModule(model, args.get_encoded).cuda().eval() 173 | 174 | def feature_function(x): return buildFeature(feature_maker, x, 175 | seqNorm=args.seq_norm, 176 | strict=args.strict, 177 | maxSizeSeq=args.max_size_seq) 178 | elif args.load == 'from_pre_computed': 179 | def feature_function(x): return torch.load(x, 'cpu') 180 | 181 | # Modes 182 | if args.mode == 'all': 183 | modes 
= ["within", "across"] 184 | else: 185 | modes = [args.mode] 186 | 187 | distance_mode = 'cosine' 188 | 189 | step_feature = 1 / args.feature_size 190 | 191 | # Get the list of sequences 192 | seq_list, _ = findAllSeqs(args.path_dataset, extension=args.file_extension) 193 | seq_list = [(str(Path(x).stem), str(Path(args.path_dataset) / x)) 194 | for (_, x) in seq_list] 195 | 196 | if args.debug: 197 | seq_list = seq_list[:1000] 198 | 199 | scores = ABX(feature_function, args.path_item_file, 200 | seq_list, distance_mode, 201 | step_feature, modes, 202 | cuda=args.cuda, 203 | seq_norm=args.seq_norm, 204 | max_x_across=args.max_x_across, 205 | max_size_group=args.max_size_group) 206 | 207 | out_dir = Path(args.path_checkpoint).parent if args.out is None \ 208 | else Path(args.out) 209 | out_dir.mkdir(exist_ok=True) 210 | 211 | path_score = out_dir / 'ABX_scores.json' 212 | with open(path_score, 'w') as file: 213 | json.dump(scores, file, indent=2) 214 | 215 | path_args = out_dir / 'ABX_args.json' 216 | with open(path_args, 'w') as file: 217 | json.dump(vars(args), file, indent=2) 218 | 219 | 220 | if __name__ == "__main__": 221 | args = sys.argv[1:] 222 | main(args) 223 | -------------------------------------------------------------------------------- /cpc/eval/ABX/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /cpc/eval/ABX/abx_group_computation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch 6 | import math 7 | from . 
import dtw 8 | import progressbar 9 | 10 | 11 | def get_distance_function_from_name(name_str): 12 | if name_str == 'euclidian': 13 | return get_euclidian_distance_batch 14 | if name_str == 'cosine': 15 | return get_cosine_distance_batch 16 | raise ValueError(f"Invalid distance mode") 17 | 18 | 19 | def check_dtw_group_validity(a, b, x): 20 | assert(len(a.size()) == len(b.size())) 21 | assert(len(a.size()) == len(x.size())) 22 | assert(a.size(2) == x.size(2)) 23 | assert(a.size(2) == b.size(2)) 24 | 25 | 26 | def get_cosine_distance_batch(a1, a2, epsilon=1e-8): 27 | r""" a1 and a2 must be normalized""" 28 | N1, S1, D = a1.size() # Batch x Seq x Channel 29 | N2, S2, D = a2.size() # Batch x Seq x Channel 30 | 31 | prod = (a1.view(N1, 1, S1, 1, D)) * (a2.view(1, N2, 1, S2, D)) 32 | # Sum accross the channel dimension 33 | prod = torch.clamp(prod.sum(dim=4), -1, 1).acos() / math.pi 34 | 35 | return prod 36 | 37 | 38 | def get_euclidian_distance_batch(a1, a2): 39 | N1, S1, D = a1.size() 40 | N2, S2, D = a2.size() 41 | diff = a1.view(N1, 1, S1, 1, D) - a2.view(1, N2, 1, S2, D) 42 | return torch.sqrt((diff**2).sum(dim=4)) 43 | 44 | 45 | def get_distance_group_dtw(a1, a2, size1, size2, 46 | ignore_diag=False, symmetric=False, 47 | distance_function=get_cosine_distance_batch): 48 | 49 | N1, S1, D = a1.size() 50 | N2, S2, D = a2.size() 51 | if size1.size(0) != N1: 52 | print(a1.size(), size1.size()) 53 | print(a2.size(), size2.size()) 54 | assert(size1.size(0) == N1) 55 | assert(size2.size(0) == N2) 56 | 57 | distance_mat = distance_function(a1, a2).detach().cpu().numpy() 58 | return dtw.dtw_batch(a1, a2, size1, size2, 59 | distance_mat, 60 | ignore_diag, symmetric) 61 | 62 | 63 | def get_theta_group_dtw(a, b, x, sa, sb, sx, distance_function, symmetric): 64 | 65 | check_dtw_group_validity(a, b, x) 66 | 67 | dxb = get_distance_group_dtw( 68 | x, b, sx, sb, distance_function=distance_function) 69 | dxa = get_distance_group_dtw(x, a, sx, sa, ignore_diag=symmetric, 70 | symmetric=symmetric, 71 | distance_function=distance_function) 72 | 73 | Nx, Na = dxa.size() 74 | Nx, Nb = dxb.size() 75 | 76 | if symmetric: 77 | n_pos = Na * (Na - 1) 78 | max_val = dxb.max().item() 79 | for i in range(Na): 80 | dxa[i, i] = max_val + 1 81 | else: 82 | n_pos = Na * Nx 83 | 84 | dxb = dxb.view(Nx, 1, Nb).expand(Nx, Na, Nb) 85 | dxa = dxa.view(Nx, Na, 1).expand(Nx, Na, Nb) 86 | 87 | sc = (dxa < dxb).sum() + 0.5 * (dxa == dxb).sum() 88 | sc /= (n_pos * Nb) 89 | 90 | return sc.item() 91 | 92 | 93 | def loc_dtw(data, distance_function, symmetric): 94 | coords, group_a, group_b, group_x = data 95 | group_a_data, group_a_size = group_a 96 | group_b_data, group_b_size = group_b 97 | group_x_data, group_x_size = group_x 98 | theta = get_theta_group_dtw(group_a_data, 99 | group_b_data, 100 | group_x_data, 101 | group_a_size, 102 | group_b_size, 103 | group_x_size, 104 | distance_function, 105 | symmetric) 106 | 107 | return (coords, 1 - theta) 108 | 109 | 110 | def get_abx_scores_dtw_on_group(group_iterator, 111 | distance_function, 112 | symmetric): 113 | 114 | data_list = [] 115 | coords_list = [] 116 | bar = progressbar.ProgressBar(maxval=len(group_iterator)) 117 | bar.start() 118 | 119 | with torch.no_grad(): 120 | for index, group in enumerate(group_iterator): 121 | bar.update(index) 122 | coords, abx = loc_dtw(group, distance_function, symmetric) 123 | data_list.append(abx) 124 | coords_list.append(coords) 125 | bar.finish() 126 | 127 | return torch.sparse.FloatTensor(torch.LongTensor(coords_list).t(), 128 | 
torch.FloatTensor(data_list), 129 | group_iterator.get_board_size()) 130 | -------------------------------------------------------------------------------- /cpc/eval/ABX/dtw.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch 6 | import numpy as np 7 | cimport numpy as np 8 | cimport cython 9 | from cpython cimport bool 10 | ctypedef np.float32_t CTYPE_t # cost type 11 | ctypedef np.intp_t IND_t # array index type 12 | CTYPE = np.float32 # cost type 13 | 14 | 15 | 16 | def dtw_batch(x,y, sx, sy, dist_mat, ignore_diag=False, symetric=False): 17 | 18 | Nx = dist_mat.shape[0] 19 | Ny = dist_mat.shape[1] 20 | 21 | out = torch.zeros((Nx, Ny)) 22 | 23 | for i in range(Nx): 24 | start_index = i if symetric else 0 25 | i_sx = sx[i] 26 | for j in range(start_index, Ny): 27 | 28 | j_sy = sy[j] 29 | if ignore_diag and i == j: 30 | continue 31 | distance = _dtw(i_sx, j_sy, dist_mat[i, j, :i_sx, :j_sy], True) 32 | out[i][j] = distance 33 | if symetric and i != j: 34 | out[j][i] = out[i][j] 35 | 36 | return out 37 | 38 | 39 | 40 | cpdef _dtw(IND_t N, IND_t M, CTYPE_t[:,:] dist_array, bool normalized): 41 | cdef IND_t i, j 42 | cdef CTYPE_t[:,:] cost = np.empty((N, M), dtype=CTYPE) 43 | cdef CTYPE_t final_cost, c_diag, c_left, c_up 44 | # initialization 45 | cost[0,0] = dist_array[0,0] 46 | for i in range(1,N): 47 | cost[i,0] = dist_array[i,0] + cost[i-1,0] 48 | for j in range(1,M): 49 | cost[0,j] = dist_array[0,j] + cost[0,j-1] 50 | # the dynamic programming loop 51 | for i in range(1,N): 52 | for j in range(1,M): 53 | cost[i, j] = dist_array[i, j] + min(cost[i-1,j], cost[i - 1, j - 1], cost[i, j - 1]) 54 | 55 | final_cost = cost[N-1, M-1] 56 | if normalized: 57 | path_len = 1 58 | i = N-1 59 | j = M-1 60 | while i > 0 and j > 0: 61 | c_up = cost[i - 1, j] 62 | c_left = cost[i, j-1] 63 | c_diag = cost[i-1, j-1] 64 | if c_diag <= c_left and c_diag <= c_up: 65 | i -= 1 66 | j -= 1 67 | elif c_left <= c_up: 68 | j -= 1 69 | else: 70 | i -= 1 71 | path_len += 1 72 | if i == 0: 73 | path_len += j 74 | if j == 0: 75 | path_len += i 76 | final_cost /= path_len 77 | return final_cost 78 | -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/2107.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/eval/ABX/test_data/2107.npy -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/23.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/eval/ABX/test_data/23.npy -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/407.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/eval/ABX/test_data/407.npy -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/42.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/eval/ABX/test_data/42.npy -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/dummy_item_file.item: -------------------------------------------------------------------------------- 1 | #file onset offset #phone prev-phone next-phone speaker 2 | 2107 0.3225 0.5225 n ae d 8193 3 | 2107 0.4225 0.5925 d n l 2222 4 | 42 0.4525 0.6525 d n l 2222 5 | 42 0.5225 0.7325 ih l n 8193 6 | 42 0.5925 0.8725 n ih s 8193 7 | 23 0.6525 1.1025 s n ax 8193 8 | 23 0.7325 1.1925 s n ax 2222 9 | 407 0.8725 1.2425 s ax dh 2222 10 | 2107 1.1025 1.2925 dh s ax 12 11 | -------------------------------------------------------------------------------- /cpc/eval/ABX/test_data/dummy_item_within.item: -------------------------------------------------------------------------------- 1 | #file onset offset #phone prev-phone next-phone speaker 2 | 2107 0. 0.2 n p d 8193 3 | 2107 0.3225 0.5225 n ae d 8193 4 | 2107 0.6 0.75 n ae d 8193 5 | 2107 0.4225 0.5925 d n l 2222 6 | 42 0.4525 0.6525 d n l 2222 7 | 42 0.1301 0.2501 q n l 2222 8 | 42 0.5225 0.7325 d n l 8193 9 | 42 0.0025 0.3561 d p l 2222 10 | 42 0.5925 0.8725 d p l 8193 11 | -------------------------------------------------------------------------------- /cpc/eval/ABX/unit_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import unittest 6 | import torch 7 | from nose.tools import eq_, ok_ 8 | from . import abx_group_computation 9 | from . import abx_iterators 10 | from pathlib import Path 11 | import numpy as np 12 | import math 13 | 14 | 15 | class TestDistancesDTW(unittest.TestCase): 16 | 17 | def testDTWFunction(self): 18 | X = torch.tensor([[[0, 1], [0, 0], [1, 1], [42, 42]], 19 | [[0, 2], [0, 1], [1, 1], [-1, 0]], 20 | [[0, 0], [0, 1], [0, 0], [21, 211]]], 21 | dtype=torch.float) 22 | 23 | X_size = torch.tensor([3, 4, 2]) 24 | 25 | Y = torch.tensor([[[0, 1], [1, 2], [0, 0]]], dtype=torch.float) 26 | Y_size = torch.tensor([3]) 27 | 28 | distance_mode = abx_group_computation.get_euclidian_distance_batch 29 | dist = abx_group_computation.get_distance_group_dtw(X, Y, 30 | X_size, Y_size, 31 | distance_function=distance_mode) 32 | eq_(dist.size(), (3, 1)) 33 | expected_dist = [[(math.sqrt(2)) / 2], [3 / 4], 34 | [(2 + math.sqrt(2)) / 3]] 35 | for i in range(3): 36 | ok_(abs(expected_dist[i][0] - dist[i].item()) < 1e-4) 37 | 38 | def testThetaDTWFunctionSymetric(self): 39 | A = torch.tensor([[[0, 1], [0, 0], [1, 1], [42, 42]], 40 | [[0, 2], [0, 1], [1, 1], [-1, 0]], 41 | [[0, 0], [0, 1], [0, 0], [21, 211]]], 42 | dtype=torch.float) 43 | A_size = torch.tensor([3, 4, 2]) 44 | B = torch.tensor([[[0, 1], [1, 2], [0, 0]]], dtype=torch.float) 45 | B_size = torch.tensor([3]) 46 | 47 | distance_mode = abx_group_computation.get_euclidian_distance_batch 48 | symetric = True 49 | theta = abx_group_computation.get_theta_group_dtw(A, B, A, A_size, 50 | B_size, A_size, 51 | distance_mode, 52 | symetric) 53 | eq_(theta, 0.5) 54 | 55 | 56 | class testSingularityNormalization(unittest.TestCase): 57 | 58 | def testCosineNormalized(self): 59 | x = torch.tensor([[[1., 0., 0., 0.], [0., 0., 0., 0.]], 60 | [[0., 0., -1., 0.], [0.5, -0.5, 0.5, -0.5]]]) 61 | y = torch.tensor( 62 | [[[-0.5, -0.5, -0.5, 0.5], 
[0., 0., 0., 0.], [0., 1., 0., 0.]]]) 63 | norm_x = abx_iterators.normalize_with_singularity(x) 64 | norm_y = abx_iterators.normalize_with_singularity(y) 65 | dist = abx_group_computation.get_cosine_distance_batch(norm_x, norm_y) 66 | 67 | eq_(dist.size(), (2, 1, 2, 3)) 68 | ok_(abs(dist[0, 0, 0, 0] - 0.6667) < 1e-4) 69 | ok_(abs(dist[0, 0, 0, 1] - 1.) < 1e-4) 70 | ok_(abs(dist[0, 0, 0, 2] - 0.5) < 1e-4) 71 | 72 | ok_(abs(dist[0, 0, 1, 0] - 1) < 1e-4) 73 | ok_(abs(dist[0, 0, 1, 1]) < 1e-4) 74 | ok_(abs(dist[0, 0, 1, 2] - 1) < 1e-4) 75 | 76 | ok_(abs(dist[1, 0, 0, 0] - 0.3333) < 1e-4) 77 | ok_(abs(dist[1, 0, 0, 1] - 1.) < 1e-4) 78 | ok_(abs(dist[1, 0, 0, 2] - 0.5) < 1e-4) 79 | 80 | ok_(abs(dist[1, 0, 1, 0]-0.6667) < 1e-4) 81 | ok_(abs(dist[1, 0, 1, 1] - 1.) < 1e-4) 82 | ok_(abs(dist[1, 0, 1, 2] - 0.6667) < 1e-4) 83 | 84 | 85 | class testGroupMaker(unittest.TestCase): 86 | 87 | def test1DGroupMaker(self): 88 | 89 | data = [[0], [1], [2], [3], [4], [2], [2], [2]] 90 | order = [0] 91 | out_index, out_data = abx_iterators.get_features_group(data, order) 92 | 93 | expected_index = [0, 1, 2, 5, 6, 7, 3, 4] 94 | eq_(out_index, expected_index) 95 | 96 | expected_output = [(0, 1), (1, 2), (2, 6), (6, 7), (7, 8)] 97 | eq_(out_data, expected_output) 98 | 99 | def test2DGroupMaker(self): 100 | 101 | data = [[0, 1], [1, 2], [2, 3], [3, 3], 102 | [4, 0], [2, 2], [4, 2], [2, 2], [0, 3]] 103 | 104 | order = [1, 0] 105 | out_index, out_data = abx_iterators.get_features_group(data, order) 106 | expected_index = [4, 0, 1, 5, 7, 6, 8, 2, 3] 107 | eq_(out_index, expected_index) 108 | expected_output = [[(0, 1)], 109 | [(1, 2)], 110 | [(2, 3), (3, 5), (5, 6)], 111 | [(6, 7), (7, 8), (8, 9)]] 112 | eq_(out_data, expected_output) 113 | 114 | def test3DGroupMaker(self): 115 | 116 | data = [[0, 0, 0, 1], 117 | [41, 1, 0, 2], 118 | [-23, 0, 3, 1], 119 | [220, 1, -2, 3], 120 | [40, 2, 1, 0], 121 | [200, 0, 0, 1]] 122 | 123 | order = [1, 3, 2] 124 | out_index, out_data = abx_iterators.get_features_group(data, order) 125 | expected_index = [0, 5, 2, 1, 3, 4] 126 | eq_(out_index, expected_index) 127 | 128 | expected_output = [[[(0, 2), (2, 3)]], [ 129 | [(3, 4)], [(4, 5)]], [[(5, 6)]]] 130 | eq_(out_data, expected_output) 131 | 132 | 133 | class testItemLoader(unittest.TestCase): 134 | 135 | def setUp(self): 136 | self.test_data_dir = Path(__file__).parent / 'test_data' 137 | 138 | def testLoadItemFile(self): 139 | path_item_file = self.test_data_dir / "dummy_item_file.item" 140 | out, context_match, phone_match, speaker_match = \ 141 | abx_iterators.load_item_file(path_item_file) 142 | 143 | eq_(len(out), 4) 144 | eq_(len(phone_match), 5) 145 | eq_(len(speaker_match), 3) 146 | 147 | expected_phones = {'n': 0, 'd': 1, 'ih': 2, 148 | 's': 3, 'dh': 4} 149 | eq_(phone_match, expected_phones) 150 | 151 | expected_speakers = {'8193': 0, '2222': 1, '12': 2} 152 | eq_(speaker_match, expected_speakers) 153 | 154 | expected_context = {'ae+d': 0, 'n+l': 1, 'l+n': 2, 'ih+s': 3, 155 | 'n+ax': 4, 'ax+dh': 5, 's+ax': 6} 156 | eq_(context_match, expected_context) 157 | 158 | expected_output = {'2107': [[0.3225, 0.5225, 0, 0, 0], 159 | [0.4225, 0.5925, 1, 1, 1], 160 | [1.1025, 1.2925, 6, 4, 2]], 161 | '42': [[0.4525, 0.6525, 1, 1, 1], 162 | [0.5225, 0.7325, 2, 2, 0], 163 | [0.5925, 0.8725, 3, 0, 0]], 164 | '23': [[0.6525, 1.1025, 4, 3, 0], 165 | [0.7325, 1.1925, 4, 3, 1]], 166 | '407': [[0.8725, 1.2425, 5, 3, 1]]} 167 | 168 | eq_(expected_output, out) 169 | 170 | def testLoadWithinItemFile(self): 171 | path_item_file = 
self.test_data_dir / "dummy_item_within.item" 172 | out, context_match, phone_match, speaker_match = \ 173 | abx_iterators.load_item_file(path_item_file) 174 | 175 | expected_output = {'2107': [[0., 0.2, 0, 0, 0], 176 | [0.3225, 0.5225, 1, 0, 0], 177 | [0.6, 0.75, 1, 0, 0], 178 | [0.4225, 0.5925, 2, 1, 1]], 179 | '42': [[0.4525, 0.6525, 2, 1, 1], 180 | [0.1301, 0.2501, 2, 2, 1], 181 | [0.5225, 0.7325, 2, 1, 0], 182 | [0.0025, 0.3561, 3, 1, 1], 183 | [0.5925, 0.8725, 3, 1, 0]]} 184 | eq_(expected_output, out) 185 | 186 | 187 | class testABXFeatureLoader(unittest.TestCase): 188 | 189 | def setUp(self): 190 | self.stepFeature = 10 191 | self.test_data_dir = Path(__file__).parent / 'test_data' 192 | 193 | def dummy_feature_maker(path_file, *args): 194 | data = torch.tensor(np.load(path_file)) 195 | assert(len(data.size()) == 1) 196 | return data.view(1, -1, 1) 197 | 198 | def testBaseLoader(self): 199 | seqList = [('2107', self.test_data_dir / '2107.npy'), 200 | ('42', self.test_data_dir / '42.npy'), 201 | ('23', self.test_data_dir / '23.npy'), 202 | ('407', self.test_data_dir / '407.npy')] 203 | 204 | dataset = abx_iterators.ABXFeatureLoader(self.test_data_dir / "dummy_item_file.item", 205 | seqList, 206 | testABXFeatureLoader.dummy_feature_maker, 207 | self.stepFeature, 208 | False) 209 | print(dataset.features) 210 | eq_(dataset.feature_dim, 1) 211 | eq_(len(dataset), 9) 212 | eq_(len(dataset.data.size()), 2) 213 | eq_(len(dataset.data), 16) 214 | data, size, coords = dataset[0] 215 | eq_(size, 1) 216 | eq_(coords, (0, 0, 0)) 217 | eq_(data.tolist(), [[3]]) 218 | 219 | data, size, coords = dataset[3] 220 | eq_(size, 1) 221 | eq_(coords, (1, 1, 1)) 222 | eq_(data.tolist(), [[5]]) 223 | 224 | def testWithinIterator(self): 225 | seqList = [('2107', self.test_data_dir / '2107.npy'), 226 | ('42', self.test_data_dir / '42.npy')] 227 | dataset = abx_iterators.ABXFeatureLoader(self.test_data_dir / "dummy_item_within.item", 228 | seqList, 229 | testABXFeatureLoader.dummy_feature_maker, 230 | self.stepFeature, 231 | False) 232 | iterator = dataset.get_iterator('within', 40) 233 | eq_(iterator.index_csp, [0, 1, 2, 6, 3, 4, 5, 8, 7]) 234 | eq_(iterator.groups_csp, [[[(0, 1)]], [[(1, 3)]], [ 235 | [(3, 4)], [(4, 6), (6, 7)]], [[(7, 8)], [(8, 9)]]]) 236 | eq_(len(iterator), 1) 237 | 238 | it = iter(iterator) 239 | c1, a_01, b_01, x_01 = next(it) 240 | eq_(c1, (1, 1, 2, 2)) 241 | a_1, s_a = a_01 242 | eq_(s_a.tolist(), [1, 1]) 243 | eq_(a_1.tolist(), [[[4.]], [[5.]]]) 244 | eq_(x_01[0].tolist(), a_1.tolist()) 245 | eq_(x_01[1].tolist(), s_a.tolist()) 246 | eq_(b_01[0].tolist(), [[[1.]]]) 247 | eq_(b_01[1].item(), 1) 248 | 249 | eq_(next(it, False), False) 250 | eq_(iterator.get_board_size(), (2, 3, 3, 4)) 251 | -------------------------------------------------------------------------------- /cpc/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /cpc/eval/build_zeroSpeech_features.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | import os 6 | import json 7 | import torch 8 | import progressbar 9 | import argparse 10 | import numpy as np 11 | 12 | from cpc.dataset import findAllSeqs 13 | from cpc.feature_loader import buildFeature, FeatureModule, \ 14 | ModelPhoneCombined, loadSupervisedCriterion, loadModel 15 | 16 | 17 | def getArgs(pathCheckpoints): 18 | pathArgs = os.path.join(os.path.dirname(pathCheckpoints), 19 | "checkpoint_args.json") 20 | with open(pathArgs, 'rb') as file: 21 | return json.load(file) 22 | 23 | 24 | def buildAllFeature(featureMaker, pathDB, pathOut, 25 | seqList, stepSize=0.01, strict=False, 26 | maxSizeSeq=64000, format='fea', 27 | seqNorm=False): 28 | 29 | totSeqs = len(seqList) 30 | startStep = stepSize / 2 31 | bar = progressbar.ProgressBar(maxval=totSeqs) 32 | bar.start() 33 | for nseq, seqPath in enumerate(seqList): 34 | bar.update(nseq) 35 | feature = buildFeature(featureMaker, 36 | os.path.join(pathDB, seqPath), 37 | strict=strict or seqNorm, 38 | maxSizeSeq=maxSizeSeq, 39 | seqNorm=seqNorm) 40 | 41 | _, nSteps, hiddenSize = feature.size() 42 | outName = os.path.basename(os.path.splitext(seqPath)[0]) + f'.{format}' 43 | fname = os.path.join(pathOut, outName) 44 | 45 | if format == 'npz': 46 | time = [startStep + step * stepSize for step in range(nSteps)] 47 | values = feature.squeeze(0).float().cpu().numpy() 48 | totTime = np.array([stepSize * nSteps], dtype=np.float32) 49 | with open(fname, 'wb') as f: 50 | np.savez(f, time=time, features=values, totTime=totTime) 51 | elif format == 'npy': 52 | time = [startStep + step * stepSize for step in range(nSteps)] 53 | values = feature.squeeze(0).float().cpu().numpy() 54 | with open(fname, 'wb') as f: 55 | np.save(f, values) 56 | elif format == 'af': 57 | import arrayfire as af 58 | time = [startStep + step * stepSize for step in range(nSteps)] 59 | values = feature.squeeze(0).float().cpu().numpy() 60 | totTime = np.array([stepSize * nSteps], dtype=np.float32) 61 | af.save_array("time", af.Array(time, dtype=af.Dtype.f32), fname) 62 | af.save_array("totTime", af.interop.from_ndarray(totTime), 63 | fname, append=True) 64 | af.save_array("features", af.interop.from_ndarray(values), 65 | fname, append=True) 66 | else: 67 | with open(fname, 'w') as f: 68 | _, nSteps, hiddenSize = feature.size() 69 | for step in range(nSteps): 70 | line = [startStep + step * stepSize] + \ 71 | feature[0, step, :].tolist() 72 | line = [str(x) for x in line] 73 | linestr = ' '.join(line) + '\n' 74 | f.write(linestr) 75 | bar.finish() 76 | 77 | 78 | if __name__ == "__main__": 79 | 80 | parser = argparse.ArgumentParser('Build features for zerospeech \ 81 | Track1 evaluation') 82 | parser.add_argument('pathDB', help='Path to the reference dataset') 83 | parser.add_argument('pathOut', help='Path to the output features') 84 | parser.add_argument('pathCheckpoint', help='Checkpoint to load') 85 | parser.add_argument('--extension', type=str, default='.wav') 86 | parser.add_argument('--addCriterion', action='store_true') 87 | parser.add_argument('--oneHot', action='store_true') 88 | parser.add_argument('--maxSizeSeq', default=64000, type=int) 89 | parser.add_argument('--train_mode', action='store_true') 90 | parser.add_argument('--format', default='fea', type=str, 91 | choices=['npz', 'fea', 'npy', 'af']) 92 | parser.add_argument('--strict', action='store_true') 93 | parser.add_argument('--dimReduction', type=str, default=None) 94 | parser.add_argument('--centroidLimits', type=int, nargs=2, default=None) 95 | parser.add_argument('--getEncoded', action='store_true') 96 | 
parser.add_argument('--clusters', type=str, default=None) 97 | parser.add_argument('--seqNorm', action='store_true') 98 | 99 | args = parser.parse_args() 100 | 101 | if not os.path.isdir(args.pathOut): 102 | os.mkdir(args.pathOut) 103 | 104 | with open(os.path.join(os.path.dirname(args.pathOut), 105 | f"{os.path.basename(args.pathOut)}.json"), 'w') \ 106 | as file: 107 | json.dump(vars(args), file, indent=2) 108 | 109 | outData = [x[1] for x in 110 | findAllSeqs(args.pathDB, extension=args.extension, 111 | loadCache=False)[0]] 112 | 113 | featureMaker = loadModel([args.pathCheckpoint])[0] 114 | stepSize = featureMaker.gEncoder.DOWNSAMPLING / 16000 115 | print(f"stepSize : {stepSize}") 116 | featureMaker = FeatureModule(featureMaker, args.getEncoded) 117 | featureMaker.collapse = False 118 | 119 | if args.addCriterion: 120 | criterion, nPhones = loadSupervisedCriterion(args.pathCheckpoint) 121 | featureMaker = ModelPhoneCombined(featureMaker, criterion, 122 | nPhones, args.oneHot) 123 | featureMaker = featureMaker.cuda(device=0) 124 | 125 | if not args.train_mode: 126 | featureMaker.eval() 127 | 128 | buildAllFeature(featureMaker, args.pathDB, args.pathOut, outData, 129 | stepSize=stepSize, strict=args.strict, 130 | maxSizeSeq=args.maxSizeSeq, 131 | format=args.format, 132 | seqNorm=args.seqNorm) 133 | -------------------------------------------------------------------------------- /cpc/eval/utils/adjust_sample_rate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import argparse 6 | import torchaudio 7 | import progressbar 8 | import os 9 | import sys 10 | from pathlib import Path 11 | 12 | 13 | def adjust_sample_rate(path_db, file_list, path_db_out, 14 | target_sr): 15 | bar = progressbar.ProgressBar(maxval=len(file_list)) 16 | bar.start() 17 | 18 | for index, item in enumerate(file_list): 19 | path_in = os.path.join(path_db, item) 20 | path_out = os.path.join(path_db_out, item) 21 | 22 | bar.update(index) 23 | data, sr = torchaudio.load(path_in) 24 | transform = torchaudio.transforms.Resample(orig_freq=sr, 25 | new_freq=target_sr, 26 | resampling_method='sinc_interpolation') 27 | data = transform(data) 28 | torchaudio.save(path_out, data, target_sr, 29 | precision=16, channels_first=True) 30 | 31 | bar.finish() 32 | 33 | 34 | def get_names_list(path_tsv_file): 35 | 36 | with open(path_tsv_file, 'r') as file: 37 | data = file.readlines() 38 | 39 | return [x.split()[0] for x in data] 40 | 41 | 42 | def parse_args(argv): 43 | 44 | parser = argparse.ArgumentParser(description='Adjust the sample rate of ' 45 | 'a given group of audio files') 46 | 47 | parser.add_argument('path_db', type=str, 48 | help='Path to the directory containing the audio ' 49 | 'files') 50 | parser.add_argument("path_phone_files", type=str, 51 | help='Path to the .txt file containing the list of ' 52 | 'the files with a phone transcription') 53 | parser.add_argument("path_out", type=str, 54 | help='Path to the output directory') 55 | parser.add_argument("--out_sample_rate", type=int, default=16000, 56 | help="Sample rate of the output audio files " 57 | "(default is 16000)") 58 | parser.add_argument('--file_extension', type=str, default='.mp3') 59 | 60 | return parser.parse_args(argv) 61 | 62 | 63 | def main(argv): 64 | 65 | args = parse_args(argv) 66 | 67 | file_list_db = [f for f in 
os.listdir(args.path_db) 68 | if Path(f).suffix == args.file_extension] 69 | 70 | print(f"Found {len(file_list_db)} in the dataset") 71 | file_list_phone = get_names_list(args.path_phone_files) 72 | print(f"Found {len(file_list_phone)} with a phone transcription") 73 | 74 | file_list_db.sort() 75 | file_list_phone.sort() 76 | out_list = [] 77 | index_phone = 0 78 | for file_name in file_list_db: 79 | while Path(file_name).stem > file_list_phone[index_phone]: 80 | index_phone += 1 81 | if index_phone >= len(file_list_phone): 82 | break 83 | if Path(file_name).stem == file_list_phone[index_phone]: 84 | out_list.append(file_name) 85 | 86 | print(f"Converting {len(out_list)} files") 87 | 88 | Path(args.path_out).mkdir(parents=True, exist_ok=True) 89 | adjust_sample_rate(args.path_db, out_list, 90 | args.path_out, args.out_sample_rate) 91 | 92 | 93 | if __name__ == '__main__': 94 | main(sys.argv[1:]) 95 | -------------------------------------------------------------------------------- /cpc/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torchaudio 8 | 9 | import torch 10 | 11 | ########################################### 12 | # Networks 13 | ########################################### 14 | 15 | 16 | class IDModule(nn.Module): 17 | 18 | def __init__(self, *args, **kwargs): 19 | super(IDModule, self).__init__() 20 | 21 | def forward(self, x): 22 | return x 23 | 24 | 25 | class ChannelNorm(nn.Module): 26 | 27 | def __init__(self, 28 | numFeatures, 29 | epsilon=1e-05, 30 | affine=True): 31 | 32 | super(ChannelNorm, self).__init__() 33 | if affine: 34 | self.weight = nn.parameter.Parameter(torch.Tensor(1, 35 | numFeatures, 1)) 36 | self.bias = nn.parameter.Parameter(torch.Tensor(1, numFeatures, 1)) 37 | else: 38 | self.weight = None 39 | self.bias = None 40 | self.epsilon = epsilon 41 | self.p = 0 42 | self.affine = affine 43 | self.reset_parameters() 44 | 45 | def reset_parameters(self): 46 | if self.affine: 47 | torch.nn.init.ones_(self.weight) 48 | torch.nn.init.zeros_(self.bias) 49 | 50 | def forward(self, x): 51 | 52 | cumMean = x.mean(dim=1, keepdim=True) 53 | cumVar = x.var(dim=1, keepdim=True) 54 | x = (x - cumMean)*torch.rsqrt(cumVar + self.epsilon) 55 | 56 | if self.weight is not None: 57 | x = x * self.weight + self.bias 58 | return x 59 | 60 | 61 | class CPCEncoder(nn.Module): 62 | 63 | def __init__(self, 64 | sizeHidden=512, 65 | normMode="layerNorm"): 66 | 67 | super(CPCEncoder, self).__init__() 68 | 69 | validModes = ["batchNorm", "instanceNorm", "ID", "layerNorm"] 70 | if normMode not in validModes: 71 | raise ValueError(f"Norm mode must be in {validModes}") 72 | 73 | if normMode == "instanceNorm": 74 | def normLayer(x): return nn.InstanceNorm1d(x, affine=True) 75 | elif normMode == "ID": 76 | normLayer = IDModule 77 | elif normMode == "layerNorm": 78 | normLayer = ChannelNorm 79 | else: 80 | normLayer = nn.BatchNorm1d 81 | 82 | self.dimEncoded = sizeHidden 83 | self.conv0 = nn.Conv1d(1, sizeHidden, 10, stride=5, padding=3) 84 | self.batchNorm0 = normLayer(sizeHidden) 85 | self.conv1 = nn.Conv1d(sizeHidden, sizeHidden, 8, stride=4, padding=2) 86 | self.batchNorm1 = normLayer(sizeHidden) 87 | self.conv2 = nn.Conv1d(sizeHidden, sizeHidden, 4, 88 | stride=2, padding=1) 89 | 
self.batchNorm2 = normLayer(sizeHidden) 90 | self.conv3 = nn.Conv1d(sizeHidden, sizeHidden, 4, stride=2, padding=1) 91 | self.batchNorm3 = normLayer(sizeHidden) 92 | self.conv4 = nn.Conv1d(sizeHidden, sizeHidden, 4, stride=2, padding=1) 93 | self.batchNorm4 = normLayer(sizeHidden) 94 | self.DOWNSAMPLING = 160 95 | 96 | def getDimOutput(self): 97 | return self.conv4.out_channels 98 | 99 | def forward(self, x): 100 | x = F.relu(self.batchNorm0(self.conv0(x))) 101 | x = F.relu(self.batchNorm1(self.conv1(x))) 102 | x = F.relu(self.batchNorm2(self.conv2(x))) 103 | x = F.relu(self.batchNorm3(self.conv3(x))) 104 | x = F.relu(self.batchNorm4(self.conv4(x))) 105 | return x 106 | 107 | 108 | class MFCCEncoder(nn.Module): 109 | 110 | def __init__(self, 111 | dimEncoded): 112 | 113 | super(MFCCEncoder, self).__init__() 114 | melkwargs = {"n_mels": max(128, dimEncoded), "n_fft": 321} 115 | self.dimEncoded = dimEncoded 116 | self.MFCC = torchaudio.transforms.MFCC(n_mfcc=dimEncoded, 117 | melkwargs=melkwargs) 118 | 119 | def forward(self, x): 120 | x = x.view(x.size(0), -1) 121 | x = self.MFCC(x) 122 | return x.permute(0, 2, 1) 123 | 124 | 125 | class LFBEnconder(nn.Module): 126 | 127 | def __init__(self, dimEncoded, normalize=True): 128 | 129 | super(LFBEnconder, self).__init__() 130 | self.dimEncoded = dimEncoded 131 | self.conv = nn.Conv1d(1, 2 * dimEncoded, 132 | 400, stride=1) 133 | self.register_buffer('han', torch.hann_window(400).view(1, 1, 400)) 134 | self.instancenorm = nn.InstanceNorm1d(dimEncoded, momentum=1) \ 135 | if normalize else None 136 | 137 | def forward(self, x): 138 | 139 | N, C, L = x.size() 140 | x = self.conv(x) 141 | x = x.view(N, self.dimEncoded, 2, -1) 142 | x = x[:, :, 0, :]**2 + x[:, :, 1, :]**2 143 | x = x.view(N * self.dimEncoded, 1, -1) 144 | x = torch.nn.functional.conv1d(x, self.han, bias=None, 145 | stride=160, padding=350) 146 | x = x.view(N, self.dimEncoded, -1) 147 | x = torch.log(1 + torch.abs(x)) 148 | 149 | # Normalization 150 | if self.instancenorm is not None: 151 | x = self.instancenorm(x) 152 | return x 153 | 154 | 155 | class CPCAR(nn.Module): 156 | 157 | def __init__(self, 158 | dimEncoded, 159 | dimOutput, 160 | keepHidden, 161 | nLevelsGRU, 162 | mode="GRU", 163 | reverse=False): 164 | 165 | super(CPCAR, self).__init__() 166 | self.RESIDUAL_STD = 0.1 167 | 168 | if mode == "LSTM": 169 | self.baseNet = nn.LSTM(dimEncoded, dimOutput, 170 | num_layers=nLevelsGRU, batch_first=True) 171 | elif mode == "RNN": 172 | self.baseNet = nn.RNN(dimEncoded, dimOutput, 173 | num_layers=nLevelsGRU, batch_first=True) 174 | else: 175 | self.baseNet = nn.GRU(dimEncoded, dimOutput, 176 | num_layers=nLevelsGRU, batch_first=True) 177 | 178 | self.hidden = None 179 | self.keepHidden = keepHidden 180 | self.reverse = reverse 181 | 182 | def getDimOutput(self): 183 | return self.baseNet.hidden_size 184 | 185 | def forward(self, x): 186 | 187 | if self.reverse: 188 | x = torch.flip(x, [1]) 189 | try: 190 | self.baseNet.flatten_parameters() 191 | except RuntimeError: 192 | pass 193 | x, h = self.baseNet(x, self.hidden) 194 | if self.keepHidden: 195 | if isinstance(h, tuple): 196 | self.hidden = tuple(x.detach() for x in h) 197 | else: 198 | self.hidden = h.detach() 199 | 200 | # For better modularity, a sequence's order should be preserved 201 | # by each module 202 | if self.reverse: 203 | x = torch.flip(x, [1]) 204 | return x 205 | 206 | 207 | class NoAr(nn.Module): 208 | 209 | def __init__(self, *args): 210 | super(NoAr, self).__init__() 211 | 212 | def forward(self, x): 213 | 
return x 214 | 215 | 216 | class BiDIRARTangled(nn.Module): 217 | r""" 218 | Research: bidirectionnal model for BERT training. 219 | """ 220 | def __init__(self, 221 | dimEncoded, 222 | dimOutput, 223 | nLevelsGRU): 224 | 225 | super(BiDIRARTangled, self).__init__() 226 | assert(dimOutput % 2 == 0) 227 | 228 | self.ARNet = nn.GRU(dimEncoded, dimOutput // 2, 229 | num_layers=nLevelsGRU, batch_first=True, 230 | bidirectional=True) 231 | 232 | def getDimOutput(self): 233 | return self.ARNet.hidden_size * 2 234 | 235 | def forward(self, x): 236 | 237 | self.ARNet.flatten_parameters() 238 | xf, _ = self.ARNet(x) 239 | return xf 240 | 241 | 242 | class BiDIRAR(nn.Module): 243 | r""" 244 | Research: bidirectionnal model for BERT training. 245 | """ 246 | def __init__(self, 247 | dimEncoded, 248 | dimOutput, 249 | nLevelsGRU): 250 | 251 | super(BiDIRAR, self).__init__() 252 | assert(dimOutput % 2 == 0) 253 | 254 | self.netForward = nn.GRU(dimEncoded, dimOutput // 2, 255 | num_layers=nLevelsGRU, batch_first=True) 256 | self.netBackward = nn.GRU(dimEncoded, dimOutput // 2, 257 | num_layers=nLevelsGRU, batch_first=True) 258 | 259 | def getDimOutput(self): 260 | return self.netForward.hidden_size * 2 261 | 262 | def forward(self, x): 263 | 264 | self.netForward.flatten_parameters() 265 | self.netBackward.flatten_parameters() 266 | xf, _ = self.netForward(x) 267 | xb, _ = self.netBackward(torch.flip(x, [1])) 268 | return torch.cat([xf, torch.flip(xb, [1])], dim=2) 269 | 270 | 271 | ########################################### 272 | # Model 273 | ########################################### 274 | 275 | 276 | class CPCModel(nn.Module): 277 | 278 | def __init__(self, 279 | encoder, 280 | AR): 281 | 282 | super(CPCModel, self).__init__() 283 | self.gEncoder = encoder 284 | self.gAR = AR 285 | 286 | def forward(self, batchData, label): 287 | encodedData = self.gEncoder(batchData).permute(0, 2, 1) 288 | cFeature = self.gAR(encodedData) 289 | return cFeature, encodedData, label 290 | 291 | class CPCModelNullspace(nn.Module): 292 | 293 | def __init__(self, 294 | cpc, 295 | nullspace): 296 | 297 | super(CPCModelNullspace, self).__init__() 298 | self.cpc = cpc 299 | self.nullspace = nn.Linear(nullspace.shape[0], nullspace.shape[1], bias=False) 300 | self.nullspace.weight = nn.Parameter(nullspace.T) 301 | self.gEncoder = self.cpc.gEncoder 302 | 303 | 304 | def forward(self, batchData, label): 305 | cFeature, encodedData, label = self.cpc(batchData, label) 306 | cFeature = self.nullspace(cFeature) 307 | encodedData = self.nullspace(encodedData) 308 | return cFeature, encodedData, label 309 | 310 | 311 | class ConcatenatedModel(nn.Module): 312 | 313 | def __init__(self, model_list): 314 | 315 | super(ConcatenatedModel, self).__init__() 316 | self.models = torch.nn.ModuleList(model_list) 317 | 318 | def forward(self, batchData, label): 319 | 320 | outFeatures = [] 321 | outEncoded = [] 322 | for model in self.models: 323 | cFeature, encodedData, label = model(batchData, label) 324 | outFeatures.append(cFeature) 325 | outEncoded.append(encodedData) 326 | return torch.cat(outFeatures, dim=2), \ 327 | torch.cat(outEncoded, dim=2), label 328 | -------------------------------------------------------------------------------- /cpc/stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/stats/__init__.py 
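--------------------------------------------------------------------------------
Usage sketch for cpc/model.py above: CPCModel.forward chains the convolutional encoder and the autoregressive network and passes the label through unchanged. A minimal sketch, not part of the repository, assuming 20480 samples of mono 16 kHz audio (128 encoded frames at DOWNSAMPLING = 160) and an arbitrary context dimension of 256:

import torch
from cpc.model import CPCEncoder, CPCAR, CPCModel

encoder = CPCEncoder(sizeHidden=512, normMode="layerNorm")
ar = CPCAR(dimEncoded=512, dimOutput=256, keepHidden=False, nLevelsGRU=2)
model = CPCModel(encoder, ar)

audio = torch.randn(8, 1, 20480)                  # (batch, channel, samples)
cFeature, encodedData, label = model(audio, None)
print(cFeature.shape)     # torch.Size([8, 128, 256]) - context features from the GRU
print(encodedData.shape)  # torch.Size([8, 128, 512]) - frame-level encoder features

CPCModelNullspace and ConcatenatedModel above expose the same (cFeature, encodedData, label) interface, so the sketch applies to them unchanged.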
-------------------------------------------------------------------------------- /cpc/stats/empty_stat.py: -------------------------------------------------------------------------------- 1 | 2 | # this is a "template" for stat class, plus each stat must extend this 3 | class Stat: 4 | 5 | def computeForBatch(self, batch): 6 | raise Exception("computeForBatch not implemented") 7 | 8 | def mergeStatResults(self, prev, current): 9 | raise Exception("mergeStatResults not implemented") 10 | 11 | def logStat(self, statValue, epochNr): 12 | raise Exception("logStat not implemented") 13 | # should return it's values in dict format, can also log somewhere 14 | # e.g. subclass can take 'where to log' as additional its-state arg 15 | 16 | def getStatName(self): 17 | raise Exception("getStatName not implemented") 18 | # has to differ if want to compute both at a time; 19 | # can be e.g. framewise_ctx_euclid_diff, framewise_ctx_cosine_diff dep. on Stat settings/state -------------------------------------------------------------------------------- /cpc/stats/repr_diff_stat.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import cpc.stats.empty_stat as statTempl 4 | import torch 5 | import math 6 | import os 7 | from copy import deepcopy 8 | import matplotlib.pyplot as plt 9 | 10 | def euclideanDist(vecs1, vecs2): 11 | return torch.sqrt(torch.square(vecs1).sum(1) + torch.square(vecs2).sum(1) - (2*vecs1*vecs2).sum(1)) 12 | 13 | def euclideanDistSq(vecs1, vecs2): 14 | return torch.square(vecs1).sum(1) + torch.square(vecs2).sum(1) - (2*vecs1*vecs2).sum(1) 15 | 16 | def cosineDist(vecs1, vecs2): 17 | cosSim = (vecs1*vecs2).sum(1) / (torch.sqrt(torch.square(vecs1).sum(1)) * torch.sqrt(torch.square(vecs2).sum(1))) 18 | return -cosSim + 1. 
19 | 20 | def cosineCorr(vecs1, vecs2): 21 | cosSim = (vecs1*vecs2).sum(1) / (torch.sqrt(torch.square(vecs1).sum(1)) * torch.sqrt(torch.square(vecs2).sum(1))) 22 | return torch.abs(cosSim) 23 | 24 | class ReprDiffStat(statTempl.Stat): 25 | 26 | def __init__(self, metric, reprType, stepSize, histDir): 27 | super().__init__() 28 | assert metric in ('cosine', 'euclid', 'euclidsq', 'coscorr') 29 | assert reprType in ('conv_repr', 'ctx_repr') 30 | self.metric = metric 31 | self.reprType = reprType 32 | self.stepSize = stepSize 33 | self.histDir = histDir 34 | if not os.path.exists(self.histDir): 35 | os.makedirs(self.histDir) 36 | 37 | @staticmethod 38 | def convertArgsFromStrings(metric, reprType, stepSize, histDir): 39 | return (metric, reprType, float(stepSize), histDir) 40 | 41 | def computeForBatch(self, batch): 42 | reprData = batch[self.reprType] 43 | reprData1 = reprData[:,1:].contiguous().view(-1, reprData.shape[2]) 44 | reprData2 = reprData[:,:-1].contiguous().view(-1, reprData.shape[2]) 45 | if self.metric == 'euclid': 46 | distances = euclideanDist(reprData1, reprData2) 47 | elif self.metric == 'euclidsq': 48 | distances = euclideanDistSq(reprData1, reprData2) 49 | elif self.metric == 'cosine': 50 | distances = cosineDist(reprData1, reprData2) 51 | elif self.metric == 'coscorr': 52 | distances = cosineCorr(reprData1, reprData2) 53 | distances = torch.div(distances, self.stepSize) #, rounding_mode='floor') 54 | occurences = {} 55 | l = 0 56 | for d in distances: 57 | if math.isnan(d): 58 | continue 59 | l += 1 60 | df = math.floor(d) * self.stepSize 61 | if df in occurences: 62 | occurences[df] = occurences[df] + 1 63 | else: 64 | occurences[df] = 1 65 | return { 66 | 'hist': occurences, 67 | 'sum': l 68 | } 69 | 70 | def mergeStatResults(self, prev, current): 71 | merged = {} 72 | merged['sum'] = prev['sum'] + current['sum'] 73 | currentHist = current['hist'] 74 | mergedHist = deepcopy(prev['hist']) 75 | for step in currentHist: 76 | if step in mergedHist: 77 | mergedHist[step] = mergedHist[step] + currentHist[step] 78 | else: 79 | mergedHist[step] = currentHist[step] 80 | merged['hist'] = mergedHist 81 | return merged 82 | 83 | def logStat(self, statValue, epochNr): 84 | histValues = statValue['hist'] 85 | histKeys = sorted(list(histValues.keys())) 86 | histHeights = [histValues[k] for k in histKeys] 87 | plt.figure() 88 | plt.bar(histKeys, histHeights, width=self.stepSize) 89 | plt.savefig(os.path.join(self.histDir, self.getStatName() + "_" + str(epochNr) + ".png")) 90 | return { 91 | 'mean': sum([a*b for a,b in zip (histKeys, histHeights)]) / sum(histHeights) 92 | } 93 | 94 | def getStatName(self): 95 | return "reprDiff_" + self.reprType + "_" + self.metric + "_by" + str(self.stepSize) -------------------------------------------------------------------------------- /cpc/stats/stat_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import cpc.stats.stats_collector as sc 3 | import cpc.stats.repr_diff_stat as repr_diff 4 | 5 | # --valSetStats stat1:a,b,c_stat2 6 | # --captureSetStats stat1:_stat2:p1,p2_stat3:p1 7 | 8 | def getStatFromSpec(spec): 9 | specSplit = spec.split(":") 10 | statName, statArgs = specSplit[0], specSplit[1] 11 | statArgs = statArgs.split(",") 12 | assert statName in ("reprDiff",) 13 | if statName == "reprDiff": 14 | statArgs = repr_diff.ReprDiffStat.convertArgsFromStrings(*statArgs) 15 | return repr_diff.ReprDiffStat(*statArgs) 16 | 17 | def constructStatCollectorFromSpecs(specs): 18 | specList = specs.split('_')
19 | collector = sc.StatsCollector() 20 | for spec in specList: 21 | collector.registerStat(getStatFromSpec(spec)) 22 | return collector 23 | -------------------------------------------------------------------------------- /cpc/stats/stats_collector.py: -------------------------------------------------------------------------------- 1 | 2 | import cpc.stats.empty_stat as statTempl 3 | 4 | class StatsCollector: 5 | 6 | def __init__(self): 7 | self.stats = [] 8 | self.statValues = [] 9 | self.zeroed = True 10 | self.statNames = set() 11 | 12 | def registerStat(self, stat): 13 | assert issubclass(type(stat), statTempl.Stat) 14 | assert stat.getStatName not in self.statNames 15 | self.statNames.add(stat.getStatName) 16 | self.stats.append(stat) 17 | 18 | def zeroStats(self): 19 | self.zeroed = True 20 | 21 | def batchUpdate(self, batch): 22 | if self.zeroed: 23 | self.statValues = [stat.computeForBatch(batch) for stat in self.stats] 24 | self.zeroed = False 25 | else: 26 | oldValues = self.statValues 27 | newValues = [stat.computeForBatch(batch) for stat in self.stats] 28 | self.statValues = [stat.mergeStatResults(prev, current) for stat, (prev, current) 29 | in zip (self.stats, zip(oldValues, newValues))] 30 | 31 | def dataLoaderUpdate(self, dataLoader): 32 | for batch in dataLoader: 33 | self.batchUpdate(batch) 34 | 35 | def logStats(self, epochNr): 36 | statLogs = {} 37 | for stat, statValue in zip(self.stats, self.statValues): 38 | statLogs.update({ stat.getStatName() + "_" + k: v for k, v in stat.logStat(statValue, epochNr).items()}) 39 | return statLogs 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /cpc/test_data/seq_list.txt: -------------------------------------------------------------------------------- 1 | 6476-57446-0019 2 | 5678-43303-0032 3 | 5678-43303-0024 4 | 5678-43301-0021 5 | 5393-19218-0024 6 | 4397-15668-0007 7 | 4397-15668-0003 8 | -------------------------------------------------------------------------------- /cpc/test_data/test_db/2911/12359/2911-12359-0007.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/2911/12359/2911-12359-0007.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/4051/11218/4051-11218-0044.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/4051/11218/4051-11218-0044.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/4397/15668/4397-15668-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/4397/15668/4397-15668-0003.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/4397/15668/4397-15668-0007.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/4397/15668/4397-15668-0007.flac -------------------------------------------------------------------------------- 
/cpc/test_data/test_db/5393/19218/5393-19218-0024.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/5393/19218/5393-19218-0024.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/5678/43301/5678-43301-0021.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/5678/43301/5678-43301-0021.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/5678/43303/5678-43303-0024.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/5678/43303/5678-43303-0024.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/5678/43303/5678-43303-0032.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/5678/43303/5678-43303-0032.flac -------------------------------------------------------------------------------- /cpc/test_data/test_db/6476/57446/6476-57446-0019.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chorowski-lab/CPC_audio/777513a96c498e1db8fe7e6a0b29dffc826459fd/cpc/test_data/test_db/6476/57446/6476-57446-0019.flac -------------------------------------------------------------------------------- /cpc/transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import torch 6 | import torch.nn as nn 7 | import math 8 | 9 | 10 | class ScaledDotProductAttention(nn.Module): 11 | def __init__(self, 12 | sizeSeq, # Size of the input sequence 13 | dk, # Dimension of the input sequence 14 | dropout, # Dropout parameter 15 | relpos=False): # Do we retrieve positional information ? 16 | super(ScaledDotProductAttention, self).__init__() 17 | 18 | self.drop = nn.Dropout(dropout) 19 | self.softmax = nn.Softmax(dim=2) 20 | self.relpos = relpos 21 | self.sizeSeq = sizeSeq 22 | 23 | if relpos: 24 | self.Krelpos = nn.Parameter(torch.Tensor(dk, sizeSeq)) 25 | self.initmat_(self.Krelpos) 26 | self.register_buffer('z', torch.zeros(1, sizeSeq, 1)) 27 | 28 | # A mask is set so that a node never queries data in the future 29 | mask = torch.tril(torch.ones(sizeSeq, sizeSeq), diagonal=0) 30 | mask = 1 - mask 31 | mask[mask == 1] = -float('inf') 32 | self.register_buffer('mask', mask.unsqueeze(0)) 33 | 34 | def initmat_(self, mat, dim=0): 35 | stdv = 1. 
/ math.sqrt(mat.size(dim)) 36 | mat.data.uniform_(-stdv, stdv) 37 | 38 | def forward(self, Q, K, V): 39 | # Input dim : N x sizeSeq x dk 40 | QK = torch.bmm(Q, K.transpose(-2, -1)) 41 | 42 | if self.relpos: 43 | bsz = Q.size(0) 44 | QP = Q.matmul(self.Krelpos) 45 | # This trick with z fills QP's diagonal with zeros 46 | QP = torch.cat((self.z.expand(bsz, -1, -1), QP), 2) 47 | QK += QP.view(bsz, self.sizeSeq + 1, self.sizeSeq)[:, 1:, :] 48 | A = self.softmax(QK / math.sqrt(K.size(-1)) + self.mask) 49 | return torch.bmm(self.drop(A), V) 50 | 51 | 52 | class MultiHeadAttention(nn.Module): 53 | def __init__(self, 54 | sizeSeq, # Size of a sequence 55 | dropout, # Dropout parameter 56 | dmodel, # Model's dimension 57 | nheads, # Number of heads in the model 58 | abspos): # Is positional information encoded in the input ? 59 | super(MultiHeadAttention, self).__init__() 60 | self.Wo = nn.Linear(dmodel, dmodel, bias=False) 61 | self.Wk = nn.Linear(dmodel, dmodel, bias=False) 62 | self.Wq = nn.Linear(dmodel, dmodel, bias=False) 63 | self.Wv = nn.Linear(dmodel, dmodel, bias=False) 64 | self.nheads = nheads 65 | self.dk = dmodel // nheads 66 | self.Att = ScaledDotProductAttention(sizeSeq, self.dk, 67 | dropout, not abspos) 68 | 69 | def trans_(self, x): 70 | bsz, bptt, h, dk = x.size(0), x.size(1), self.nheads, self.dk 71 | return x.view(bsz, bptt, h, dk).transpose(1, 2).contiguous().view(bsz * h, bptt, dk) 72 | 73 | def reverse_trans_(self, x): 74 | bsz, bptt, h, dk = x.size( 75 | 0) // self.nheads, x.size(1), self.nheads, self.dk 76 | return x.view(bsz, h, bptt, dk).transpose(1, 2).contiguous().view(bsz, bptt, h * dk) 77 | 78 | def forward(self, Q, K, V): 79 | q = self.trans_(self.Wq(Q)) 80 | k = self.trans_(self.Wk(K)) 81 | v = self.trans_(self.Wv(V)) 82 | y = self.reverse_trans_(self.Att(q, k, v)) 83 | return self.Wo(y) 84 | 85 | 86 | class FFNetwork(nn.Module): 87 | def __init__(self, din, dout, dff, dropout): 88 | super(FFNetwork, self).__init__() 89 | self.lin1 = nn.Linear(din, dff, bias=True) 90 | self.lin2 = nn.Linear(dff, dout, bias=True) 91 | self.relu = nn.ReLU() 92 | self.drop = nn.Dropout(dropout) 93 | 94 | def forward(self, x): 95 | return self.lin2(self.drop(self.relu(self.lin1(x)))) 96 | 97 | 98 | class TransformerLayer(nn.Module): 99 | def __init__(self, sizeSeq=32, dmodel=512, dff=2048, 100 | dropout=0.1, nheads=8, 101 | abspos=False): 102 | super(TransformerLayer, self).__init__() 103 | self.multihead = MultiHeadAttention(sizeSeq, dropout, 104 | dmodel, nheads, abspos) 105 | self.ln_multihead = nn.LayerNorm(dmodel) 106 | self.ffnetwork = FFNetwork(dmodel, dmodel, dff, dropout) 107 | self.ln_ffnetwork = nn.LayerNorm(dmodel) 108 | 109 | def forward(self, x): 110 | y = self.ln_multihead(x + self.multihead(Q=x, K=x, V=x)) 111 | return self.ln_ffnetwork(y + self.ffnetwork(y)) 112 | 113 | 114 | class StaticPositionEmbedding(nn.Module): 115 | def __init__(self, seqlen, dmodel): 116 | super(StaticPositionEmbedding, self).__init__() 117 | pos = torch.arange(0., seqlen).unsqueeze(1).repeat(1, dmodel) 118 | dim = torch.arange(0., dmodel).unsqueeze(0).repeat(seqlen, 1) 119 | div = torch.exp(- math.log(10000) * (2*(dim//2)/dmodel)) 120 | pos *= div 121 | pos[:, 0::2] = torch.sin(pos[:, 0::2]) 122 | pos[:, 1::2] = torch.cos(pos[:, 1::2]) 123 | self.register_buffer('pe', pos.unsqueeze(0)) 124 | 125 | def forward(self, x): 126 | return x + self.pe[:, :x.size(1), :] 127 | 128 | 129 | def buildTransformerAR(dimEncoded, # Output dimension of the encoder 130 | nLayers, # Number of transformer layers 
131 | sizeSeq, # Expected size of the input sequence 132 | abspos): 133 | layerSequence = [] 134 | if abspos: 135 | layerSequence += [StaticPositionEmbedding(sizeSeq, dimEncoded)] 136 | layerSequence += [TransformerLayer(sizeSeq=sizeSeq, 137 | dmodel=dimEncoded, abspos=abspos) 138 | for i in range(nLayers)] 139 | return nn.Sequential(*layerSequence) 140 | -------------------------------------------------------------------------------- /cpc/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | -------------------------------------------------------------------------------- /cpc/utils/capture_loader.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import re 4 | import torch 5 | 6 | # reads from torch files 7 | class CaptureLoader: 8 | 9 | def __init__(self, rootDir, onlyReadThose=None): 10 | self.rootDir = rootDir 11 | self.onlyReadThose = onlyReadThose 12 | 13 | self.prepare() 14 | 15 | def prepare(self): 16 | self.batchData = {} 17 | for p,sd,f in sorted(os.walk(self.rootDir)): 18 | for name in sorted(f): 19 | #print(p,sd,f,name) 20 | capturedThing = '_'.join(name.split('_')[:-1]) 21 | if self.onlyReadThose and capturedThing not in self.onlyReadThose: 22 | continue 23 | batchDescr = name.split('_')[-1].split('.')[0] 24 | batchNums = list(map(int, re.findall(r'\d+', batchDescr))) 25 | batchBegin, batchEnd = batchNums[0], batchNums[1] 26 | if (batchBegin, batchEnd) in self.batchData: 27 | self.batchData[(batchBegin, batchEnd)][capturedThing] = os.path.join(p, name) 28 | else: 29 | self.batchData[(batchBegin, batchEnd)] = {capturedThing: os.path.join(p, name)} 30 | #tensor = torch.load(os.path.join(p, name)) 31 | self.batchesNamesInOrder = sorted(self.batchData.keys()) 32 | 33 | def __len__(self): 34 | return len(self.batchesNamesInOrder) 35 | 36 | def __getitem__(self, idx): 37 | paths = self.batchData[self.batchesNamesInOrder[idx]] 38 | return {whatCaptured: torch.load(tensorPath) for whatCaptured, tensorPath in paths.items()} 39 | 40 | def __iter__(self): 41 | for i in range(len(self)): 42 | yield self[i] 43 | 44 | 45 | if __name__ == '__main__': 46 | 47 | cl = CaptureLoader("/pio/scratch/1/i283340/MGR/zs/capture/try20/8") 48 | 49 | for data in cl: 50 | print(data.keys(), [t.shape for t in data.values()]) 51 | 52 | cl2 = CaptureLoader("/pio/scratch/1/i283340/MGR/zs/capture/try20/8", ('ctx', 'cpcctc_align', 'phone_align')) 53 | 54 | for data in cl2: 55 | print(data.keys(), [t.shape for t in data.values()]) -------------------------------------------------------------------------------- /cpc/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
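# The helpers below compute squared Euclidean distances through the expansion
# ||v - c||^2 = ||v||^2 + ||c||^2 - 2 * (v . c), broadcast over a B x N batch of vectors
# and k centroids; pushToClosestForBatch then moves each vector a fraction `deg` of the way
# towards its nearest centroid (the "center push" operation used by the centerpush scripts).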
5 | import json 6 | import numpy as np 7 | import random 8 | import torch 9 | import sys 10 | import psutil 11 | from copy import deepcopy 12 | from bisect import bisect_left 13 | 14 | def seDistancesToCentroids(vecs, centroids, doNorm=False): 15 | 16 | if len(vecs.shape) == 2: 17 | vecs = vecs.view(1, *(vecs.shape)) 18 | 19 | B = vecs.shape[0] 20 | N = vecs.shape[1] 21 | k = centroids.shape[0] 22 | 23 | # vecs: B x L x Dim 24 | # centroids: k x Dim 25 | 26 | if doNorm: 27 | vecLengths = torch.sqrt((vecs*vecs).sum(-1)) 28 | vecs = vecs / vecLengths.view(B, N, 1) 29 | centrLengths = torch.sqrt((centroids*centroids).sum(-1)) 30 | centroids = centroids / centrLengths.view(k, 1) 31 | 32 | return torch.square(centroids).sum(1).view(1, 1, -1) + torch.square(vecs).sum(-1).view(B, N, 1) \ 33 | - 2*(vecs.view(B, N, 1, -1) * centroids.view(1, 1, k, -1)).sum(-1) #torch.matmul(vecs, centroids.T) 34 | 35 | 36 | def pushToClosestForBatch(points, centers, deg=0.5, doNorm=False, doNormForPush=False): 37 | 38 | B = points.shape[0] 39 | N = points.shape[1] 40 | k = centers.shape[0] 41 | 42 | if doNormForPush: 43 | pointsLengths = torch.sqrt((points*points).sum(-1)) 44 | points = points / pointsLengths.view(B, N, 1) 45 | centrLengths = torch.sqrt((centers*centers).sum(-1)) 46 | centers = centers / centrLengths.view(k, 1) 47 | 48 | distsSq = seDistancesToCentroids(points, centers, doNorm=doNorm) 49 | dists = torch.sqrt(distsSq) 50 | 51 | closest = dists.argmin(-1) 52 | diffs = centers[closest].view(B, N, -1) - points 53 | res = deg * diffs + points 54 | 55 | return res 56 | 57 | 58 | def untensor(d): 59 | if isinstance(d, list): 60 | return [untensor(v) for v in d] 61 | if isinstance(d, dict): 62 | return dict((k, untensor(v)) for k, v in d.items()) 63 | if hasattr(d, 'tolist'): 64 | return d.tolist() 65 | return d 66 | 67 | 68 | def save_logs(data, pathLogs): 69 | with open(pathLogs, 'w') as file: 70 | json.dump(data, file, indent=2) 71 | 72 | 73 | def update_logs(logs, logStep, prevlogs=None): 74 | out = {} 75 | for key in logs: 76 | out[key] = deepcopy(logs[key]) 77 | 78 | if prevlogs is not None: 79 | out[key] -= prevlogs[key] 80 | out[key] /= logStep 81 | return out 82 | 83 | 84 | def show_logs(text, logs): 85 | print("") 86 | print('-'*50) 87 | print(text) 88 | 89 | for key in logs: 90 | 91 | if key == "iter": 92 | continue 93 | 94 | nPredicts = logs[key].shape[0] 95 | 96 | strSteps = ['Step'] + [str(s) for s in range(1, nPredicts + 1)] 97 | formatCommand = ' '.join(['{:>16}' for x in range(nPredicts + 1)]) 98 | print(formatCommand.format(*strSteps)) 99 | 100 | strLog = [key] + ["{:10.6f}".format(s) for s in logs[key]] 101 | print(formatCommand.format(*strLog)) 102 | 103 | print('-'*50) 104 | 105 | 106 | def set_seed(seed): 107 | random.seed(seed) 108 | torch.manual_seed(seed) 109 | np.random.seed(seed) 110 | if torch.cuda.is_available(): 111 | torch.cuda.manual_seed_all(seed) 112 | 113 | 114 | def cpu_stats(): 115 | print(sys.version) 116 | print(psutil.cpu_percent()) 117 | print(psutil.virtual_memory()) 118 | 119 | 120 | def ramp_scheduling_function(n_epoch_ramp, epoch): 121 | if epoch >= n_epoch_ramp: 122 | return 1 123 | else: 124 | return (epoch + 1) / n_epoch_ramp 125 | 126 | 127 | class SchedulerCombiner: 128 | r""" 129 | An object which applies a list of learning rate schedulers sequentially. 
130 | """ 131 | 132 | def __init__(self, scheduler_list, activation_step, curr_step=0): 133 | r""" 134 | Args: 135 | - scheduler_list (list): a list of learning rate schedulers 136 | - activation_step (list): a list of int. activation_step[i] 137 | indicates at which step scheduler_list[i] should be activated 138 | - curr_step (int): the starting step. Must be lower than 139 | activation_step[0] 140 | """ 141 | 142 | if len(scheduler_list) != len(activation_step): 143 | raise ValueError("The number of scheduler must be the same as " 144 | "the number of activation step") 145 | if activation_step[0] > curr_step: 146 | raise ValueError("The first activation step cannot be higher than " 147 | "the current step.") 148 | self.scheduler_list = scheduler_list 149 | self.activation_step = deepcopy(activation_step) 150 | self.curr_step = curr_step 151 | 152 | def step(self): 153 | self.curr_step += 1 154 | index = bisect_left(self.activation_step, self.curr_step) - 1 155 | for i in reversed(range(index, len(self.scheduler_list))): 156 | self.scheduler_list[i].step() 157 | 158 | def __str__(self): 159 | out = "SchedulerCombiner \n" 160 | out += "(\n" 161 | for index, scheduler in enumerate(self.scheduler_list): 162 | out += f"({index}) {scheduler.__str__()} \n" 163 | out += ")\n" 164 | return out 165 | -------------------------------------------------------------------------------- /cpc/utils/unit_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | import unittest 6 | import torch 7 | import os 8 | from nose.tools import eq_, ok_ 9 | 10 | from .misc import SchedulerCombiner, ramp_scheduling_function 11 | 12 | 13 | class TestCombineSchedulers(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.baseLR = 1 17 | self.module = torch.nn.Linear(1, 1) 18 | self.optimizer = torch.optim.SGD( 19 | list(self.module.parameters()), lr=self.baseLR) 20 | 21 | def testCombineRamp(self): 22 | scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, 23 | lr_lambda=lambda epoch: ramp_scheduling_function( 24 | 3, epoch)) 25 | self.optimizer.step() 26 | eq_(self.optimizer.param_groups[0]['lr'], self.baseLR / 3) 27 | scheduler.step() 28 | eq_(self.optimizer.param_groups[0]['lr'], 2 * self.baseLR / 3) 29 | scheduler.step() 30 | eq_(self.optimizer.param_groups[0]['lr'], 1) 31 | 32 | for i in range(12): 33 | scheduler.step() 34 | eq_(self.optimizer.param_groups[0]['lr'], 1) 35 | 36 | def testCombineRampStep(self): 37 | scheduler_step = torch.optim.lr_scheduler.StepLR( 38 | self.optimizer, 6, gamma=0.5) 39 | scheduler_ramp = torch.optim.lr_scheduler.LambdaLR(self.optimizer, 40 | lr_lambda=lambda epoch: ramp_scheduling_function( 41 | 3, epoch)) 42 | 43 | scheduler = SchedulerCombiner([scheduler_ramp, scheduler_step], [0, 3]) 44 | self.optimizer.step() 45 | # Epoch 0 46 | eq_(self.optimizer.param_groups[0]['lr'], self.baseLR / 3) 47 | scheduler.step() 48 | # Epoch 1 49 | eq_(self.optimizer.param_groups[0]['lr'], 2 * self.baseLR / 3) 50 | scheduler.step() 51 | # Epoch 2 52 | eq_(self.optimizer.param_groups[0]['lr'], 1) 53 | scheduler.step() 54 | 55 | # Epoch 3, 4, 5 56 | for i in range(3): 57 | eq_(self.optimizer.param_groups[0]['lr'], 1) 58 | scheduler.step() 59 | 60 | # Epoch 6 61 | eq_(self.optimizer.param_groups[0]['lr'], 0.5) 62 | 
-------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: cpc37 2 | channels: 3 | - pytorch 4 | - anaconda 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - pytorch 9 | - torchvision 10 | - cudatoolkit=9.2 11 | - psutil 12 | - pip 13 | - openblas-devel 14 | - tqdm 15 | - nose 16 | - cython 17 | - pysoundfile 18 | - pip: 19 | - progressbar2 20 | - matplotlib 21 | - torchaudio 22 | -------------------------------------------------------------------------------- /experiments/train_pro_1gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script for the Prometheus slurm cluster 4 | 5 | set -x 6 | 7 | RVERB="" # =-v 8 | 9 | REMOTE_USER=plgjch 10 | REMOTE_HOST=pro.cyfronet.pl 11 | 12 | # location of the main repository (contains data/) 13 | CPC_DIR=/pio/scratch/2/jch/wav2vec/CPC_audio #"$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 14 | REMOTE_CPC_DIR=/net/people/plgjch/scratch/CPC_audio 15 | REMOTE_MINICONDA_DIR=/net/archive/groups/plggneurony/os/miniconda3 16 | 17 | # top-level directory for experiments 18 | REMOTE_EXPERIMENT_RUNDIR=/net/scratch/people/plgjch/cpc/ 19 | 20 | # adjust the main loop 21 | # (it can go over .yaml files, over hyperparameters, etc. 22 | for DUMMY in aa \ 23 | ; do 24 | 25 | # low-level directory for experiments 26 | EXP_TAG=remote_pro 27 | NAME=baseline_1gpu 28 | DIR=$EXP_TAG/$NAME 29 | EXP_DIR=$REMOTE_EXPERIMENT_RUNDIR/$DIR 30 | 31 | ssh -q $REMOTE_USER@$REMOTE_HOST mkdir -p $EXP_DIR 32 | 33 | TMP_DIR=`mktemp -d` 34 | mkdir $TMP_DIR/code 35 | # symlink the data from the main dir 36 | 37 | cat > $TMP_DIR/exp_train.sh < $TMP_DIR/exp_train.sh < $TMP_DIR/exp_train.sh <&1 | tee -ai $EXP_DIR/lineval_\${CP}/out.txt 131 | EOF 132 | 133 | # Transmit the startup script 134 | rsync $RVERB -lrpt -e "ssh -q" $TMP_DIR/ $REMOTE_USER@$REMOTE_HOST:$EXP_DIR/ 135 | 136 | # Transmit the rest 137 | rsync --exclude '.*' \ 138 | --exclude data \ 139 | --exclude pretrained_models \ 140 | --exclude '__pycache__' \ 141 | --exclude '*runs*' \ 142 | --exclude '*.pyc' \ 143 | --exclude '*.ipynb' \ 144 | --filter=':- .gitignore' \ 145 | $RVERB -lrpt -e "ssh -q" $CPC_DIR/ $REMOTE_USER@$REMOTE_HOST:$EXP_DIR/code/ 146 | 147 | ssh -q $REMOTE_USER@$REMOTE_HOST sbatch \ 148 | `#--gres="" --time=00:10:00 -p plgrid-testing` \ 149 | $EXP_DIR/exp_train.sh 150 | 151 | rm -Rf $TMP_DIR 152 | 153 | done 154 | 155 | echo "Queue status" 156 | ssh -q $REMOTE_USER@$REMOTE_HOST squeue 157 | -------------------------------------------------------------------------------- /finetune_nullspace.sh: -------------------------------------------------------------------------------- 1 | SAVE_DIR="/pio/scratch/1/i273233/linear_separability/cpc/gru_level2/cpc_official" 2 | SPEAKERS="speakers_factorized" 3 | PHONEMES="phonemes_nullspace" 4 | SPEAKERS_NULLSPACE="speakers_nullspace" 5 | 6 | DIM_INTER=$1 7 | FROM_STEP=$SPEAKERS 8 | if [[ $# -ge 2 ]]; then 9 | FROM_STEP=$2 10 | fi 11 | 12 | case $FROM_STEP in 13 | $SPEAKERS) 14 | echo $SPEAKERS 15 | mkdir -p ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER} --mode $SPEAKERS --max_size_loaded 40000000 
--n_process_loader 2 --model cpc --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/log.txt 16 | ;& 17 | $PHONEMES) 18 | echo $PHONEMES 19 | mkdir -p ${SAVE_DIR}_${PHONEMES}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${PHONEMES}_${DIM_INTER} --mode $PHONEMES --max_size_loaded 40000000 --n_process_loader 2 --model cpc --pathPhone $zd/LibriSpeech/alignments2/converted_aligned_phones.txt --path_speakers_factorized ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/checkpoint_9.pt --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${PHONEMES}_${DIM_INTER}/log.txt 20 | ;& 21 | $SPEAKERS_NULLSPACE) 22 | echo $SPEAKERS_NULLSPACE 23 | mkdir -p ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER} && python cpc/eval/linear_separability.py $zd/LibriSpeech/train-clean-100/ $zd/LibriSpeech/labels_split/train_split_100.txt $zd/LibriSpeech/labels_split/test_split_100.txt $zd/checkpoints/CPC-big-kmeans50/cpc_ll6k/checkpoint_32.pt --pathCheckpoint ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER} --mode $SPEAKERS_NULLSPACE --max_size_loaded 40000000 --n_process_loader 2 --model cpc --path_speakers_factorized ${SAVE_DIR}_${SPEAKERS}_${DIM_INTER}/checkpoint_9.pt --dim_inter $DIM_INTER --gru_level 2 | tee ${SAVE_DIR}_${SPEAKERS_NULLSPACE}_${DIM_INTER}/log.txt 24 | ;; 25 | *) 26 | echo "Invalid from step: ${FROM_STEP} while it should be either ${SPEAKERS}, ${PHONEMES} or ${SPEAKERS_NULLSPACE}" 27 | ;; 28 | esac 29 | 30 | exit 0 -------------------------------------------------------------------------------- /hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
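# Usage sketch (illustrative; the repository path passed to torch.hub.load is an assumption --
# any GitHub repo or local checkout containing this hubconf.py works):
#   model = torch.hub.load('facebookresearch/CPC_audio', 'CPC_audio', pretrained=True)
# returns the CPC model built by the CPC_audio entry point below, loading the
# libri-light 60k checkpoint when pretrained=True.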
5 | import argparse 6 | import torch 7 | from cpc.model import CPCModel as cpcmodel 8 | from cpc.cpc_default_config import get_default_cpc_config 9 | from cpc.feature_loader import getEncoder, getAR, loadArgs 10 | dependencies = ['torch', 'torchaudio'] 11 | 12 | 13 | def CPC_audio(pretrained=False, 14 | **kwargs): 15 | """ 16 | Contrast predictive learning model for audio data 17 | pretrained: if True, load a model trained on libri-light 60k 18 | (https://arxiv.org/abs/1912.07875) 19 | **kwargs : see cpc/cpc_default_config to get the list of possible arguments 20 | """ 21 | locArgs = get_default_cpc_config() 22 | if pretrained: 23 | checkpoint_url = 'https://dl.fbaipublicfiles.com/librilight/CPC_checkpoints/60k_epoch4-d0f474de.pt' 24 | checkpoint = torch.hub.load_state_dict_from_url(checkpoint_url, 25 | progress=False) 26 | loadArgs(locArgs, argparse.Namespace(**checkpoint["config"])) 27 | else: 28 | args = argparse.Namespace(**kwargs) 29 | loadArgs(locArgs, args) 30 | encoderNet = getEncoder(locArgs) 31 | arNet = getAR(locArgs) 32 | model = cpcmodel(encoderNet, arNet) 33 | if pretrained: 34 | model.load_state_dict(checkpoint["weights"], strict=False) 35 | return model 36 | -------------------------------------------------------------------------------- /jch_experiments: -------------------------------------------------------------------------------- 1 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl/ --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .flac --n_process_loader 8 --max_size_loaded 400000000 --batchSizeGPU 64 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 12 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl.log 2 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl_16/ --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .flac --n_process_loader 8 --max_size_loaded 400000000 --batchSizeGPU 64 --nPredicts 8 --CPCCTC --CPCCTCNumMatched 16 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl_16.log 3 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl_16_12/ --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .flac --n_process_loader 8 --max_size_loaded 400000000 --batchSizeGPU 48 --nPredicts 12 --CPCCTC --CPCCTCNumMatched 16 --CPCCTCSelfLoop --CPCCTCSkipBeg 1 --CPCCTCSkipEnd 2 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_sl_16_12.log 4 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_largebatch/ --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .flac --n_process_loader 8 --max_size_loaded 400000000 --batchSizeGPU 
64 --nPredicts 12 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_largebatch.log 5 | 6 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_likealan --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .wav --normMode layerNorm --dropout --rnnMode transformer --n_process_loader 1 --max_size_loaded 4000000000 --batchSizeGPU 32 --limitNegsInBatch 8 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_likealan_stdout.txt 7 | 8 | # should be?? 9 | python -u cpc/train.py --pathDB /pio/data/zerospeech2021/LibriSpeech/train-clean-100 --pathCheckpoint /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_likealan --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt --file_extension .wav --normMode layerNorm --dropout --rnnMode transformer --n_process_loader 1 --max_size_loaded 4000000000 --nLevelsGRU 2 --batchSizeGPU 32 --limitNegsInBatch 8 --schedulerRamp 10 2>&1 | tee -ai /pio/scratch/2/jch/wav2vec/runs/cpc/cpc_ctc_try_baseline_likealan_stdout.txt 10 | -------------------------------------------------------------------------------- /lineval_ls100.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | 6 | RVERB="-v --dry-run" 7 | RVERB="" 8 | CPC_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 9 | SAVE_DIR="$( 10 | python - "$@" << END 11 | if 1: 12 | import argparse 13 | import os.path 14 | parser = argparse.ArgumentParser(description='Process some integers.') 15 | parser.add_argument('load', type=str, 16 | help="Path to the checkpoint to evaluate.") 17 | parser.add_argument('--pathCheckpoint') 18 | parser.add_argument('--CTC', action='store_true') 19 | args, _ = parser.parse_known_args() 20 | checkpoint_dir = os.path.dirname(args.load) 21 | checkpoint_no = args.load.split('_')[-1][:-3] 22 | eval_ctc = "" 23 | if args.CTC: 24 | eval_ctc = "_ctc" 25 | print(f"{checkpoint_dir}/lineval{eval_ctc}_{checkpoint_no}") 26 | END 27 | )" 28 | 29 | mkdir -p ${SAVE_DIR}/code 30 | rsync --exclude '.*' \ 31 | --exclude data \ 32 | --exclude pretrained_models \ 33 | --exclude '__pycache__' \ 34 | --exclude '*runs*' \ 35 | --exclude '*.pyc' \ 36 | --exclude '*.ipynb' \ 37 | --filter=':- .gitignore' \ 38 | $RVERB -lrpt $CPC_DIR/ ${SAVE_DIR}/code/ 39 | 40 | echo $0 "$@" >> ${SAVE_DIR}/out.txt 41 | exec python -u cpc/eval/linear_separability.py \ 42 | /pio/data/zerospeech2021/LibriSpeech-wav/train-clean-100 \ 43 | /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt \ 44 | /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt \ 45 | "$@" \ 46 | --pathPhone /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/converted_aligned_phones.txt \ 47 | --file_extension .wav \ 48 | --pathCheckpoint $SAVE_DIR \ 49 | 2>&1 | tee -ai ${SAVE_DIR}/out.txt 50 | -------------------------------------------------------------------------------- /run_clustering.sh: -------------------------------------------------------------------------------- 1 | NULLSPACE_SIZE=$1 2 | BATCH_SIZE_GPU=$2 3 | MAX_ITER=$3 4 | 5 | python cpc/criterion/clustering/clustering_script.py --pathDB $zd/LibriSpeech/train-clean-100/ --recursionLevel 1 --nClusters 
50 --MAX_ITER $MAX_ITER --level_gru 2 --save --load --batchSizeGPU $BATCH_SIZE_GPU --max_size_loaded 40000000 --n_process_loader 2 --nullspace ../linear_separability/cpc/gru_level2/cpc_official_phonemes_nullspace_$NULLSPACE_SIZE/checkpoint_9.pt checkpoints/clustering_CPC_big_kmeans50_nullspace_$NULLSPACE_SIZE/clustering_CPC_big_kmeans50_nullspace_$NULLSPACE_SIZE.pt 6 | for directory in dev-clean dev-other test-clean test-other train-clean-100 train-full-960 7 | do 8 | python ./scripts/quantize_audio.py $cpc/checkpoints/clustering_CPC_big_kmeans50_nullspace_$NULLSPACE_SIZE/clustering_CPC_big_kmeans50_nullspace_$NULLSPACE_SIZE.pt $zd/LibriSpeech/$directory/ /pio/gluster/i273233/quantized/nullspace_$NULLSPACE_SIZE/LibriSpeech/$directory --file_extension flac --nobatch --nullspace 9 | done -------------------------------------------------------------------------------- /scripts/build_1hot_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from time import time 8 | import numpy as np 9 | 10 | from utils.utils_functions import writeArgs 11 | 12 | def parseArgs(argv): 13 | # Run parameters 14 | parser = argparse.ArgumentParser(description='Export 1-hot features from quantized units of audio files.') 15 | parser.add_argument('pathQuantizedUnits', type=str, 16 | help='Path to the quantized units. Each line of the input file must be' 17 | 'of the form file_name[tab]pseudo_units (ex. hat 1,1,2,3,4,4)') 18 | parser.add_argument('pathOutputDir', type=str, 19 | help='Path to the output directory.') 20 | parser.add_argument('--n_units', type=int, default=50, 21 | help='Number of discrete units (default: 50). If a dictionary is given,' 22 | 'this is automatically set as vocab size.') 23 | parser.add_argument('--dict', type=str, 24 | help='Path to the dictionary file containing vocab of the pseudo units on the dataset' 25 | '(this is required if the quantized units are not digits, i.e. 
multi-group case).') 26 | parser.add_argument('--debug', action='store_true', 27 | help="Load only a very small amount of files for " 28 | "debugging purposes.") 29 | return parser.parse_args(argv) 30 | 31 | def main(argv): 32 | # Args parser 33 | args = parseArgs(argv) 34 | 35 | print("=============================================================") 36 | print(f"Building 1-hot features from {args.pathQuantizedUnits}") 37 | print("=============================================================") 38 | 39 | # Load input file 40 | print("") 41 | print(f"Reading input file from {args.pathQuantizedUnits}") 42 | seqNames = [] 43 | seqInputs = [] 44 | with open(args.pathQuantizedUnits, 'r') as f: 45 | for line in f: 46 | file_name, file_seq = line.strip().split("\t") 47 | # Convert sequence to the desired input form 48 | file_seq = file_seq.replace(",", " ") 49 | # Add to lists 50 | seqNames.append(file_name) 51 | seqInputs.append(file_seq) 52 | print(f"Found {len(seqNames)} sequences!") 53 | 54 | # Verify the output directory 55 | if os.path.exists(args.pathOutputDir): 56 | existing_files = set([os.path.splitext(os.path.basename(x))[0] 57 | for x in os.listdir(args.pathOutputDir) if x[-4:]==".npy"]) 58 | seqNames = [s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0] not in existing_files] 59 | print(f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!") 60 | else: 61 | print("") 62 | print(f"Creating the output directory at {args.pathOutputDir}") 63 | Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True) 64 | writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args) 65 | 66 | # Debug mode 67 | if args.debug: 68 | nsamples=20 69 | print("") 70 | print(f"Debug mode activated, only load {nsamples} samples!") 71 | # shuffle(seqNames) 72 | seqNames = seqNames[:nsamples] 73 | seqInputs = seqInputs[:nsamples] 74 | 75 | # Load 1hot dictionary in case we use it 76 | if seqInputs and not seqInputs[0].split()[0].isdigit(): #multi-group ie. 65-241 77 | assert args.dict is not None, \ 78 | "A dictionary must be given when the quantized outputs is not digits (multi-group case)!" 
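# For reference (hypothetical example lines): single-group quantized units look like
#   file_0001<tab>12,12,31,7,7
# while multi-group units look like
#   file_0001<tab>65-241,65-241,12-30
# and the latter require --dict so that each group pair can be mapped to a one-hot index.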
79 | if args.dict: 80 | print("") 81 | print(f"Loading onehot dictionary from {args.dict}...") 82 | with open(args.dict, "r") as f: 83 | lines = f.read().split("\n") 84 | pair2idx={word.split()[0]: i for i, word in enumerate(lines) if word and not word.startwith("madeupword")} 85 | args.n_units = len(pair2idx) 86 | 87 | # Define onehot_feature_function 88 | def onehot_feature_function(input_sequence): 89 | if args.dict: 90 | indexes_sequence = np.array([pair2idx[item] for item in input_sequence.split()]) 91 | else: 92 | indexes_sequence = np.array([int(item) for item in input_sequence.split()]) 93 | 94 | onehotFeatures = np.eye(args.n_units)[indexes_sequence] 95 | 96 | return onehotFeatures 97 | 98 | # Building features 99 | print("") 100 | print(f"Building 1-hot features and saving outputs to {args.pathOutputDir}...") 101 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 102 | bar.start() 103 | start_time = time() 104 | for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)): 105 | bar.update(index) 106 | 107 | # Computing features 108 | onehot_features = onehot_feature_function(input_seq) 109 | 110 | # Save the outputs 111 | file_name = os.path.splitext(name_seq)[0] + ".txt" 112 | file_out = os.path.join(args.pathOutputDir, file_name) 113 | np.savetxt(file_out, onehot_features) 114 | bar.finish() 115 | print(f"...done {len(seqNames)} files in {time()-start_time} seconds.") 116 | 117 | if __name__ == "__main__": 118 | args = sys.argv[1:] 119 | main(args) 120 | -------------------------------------------------------------------------------- /scripts/build_BERT_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from time import time 8 | import numpy as np 9 | 10 | import torch 11 | 12 | from utils.utils_functions import writeArgs, loadRobertaCheckpoint 13 | 14 | def parseArgs(argv): 15 | # Run parameters 16 | parser = argparse.ArgumentParser(description='Export BERT features from quantized units of audio files.') 17 | parser.add_argument('pathQuantizedUnits', type=str, 18 | help='Path to the quantized units. Each line of the input file must be' 19 | 'of the form file_name[tab]pseudo_units (ex. 
hat 1,1,2,3,4,4)') 20 | parser.add_argument('pathOutputDir', type=str, 21 | help='Path to the output directory.') 22 | parser.add_argument('pathBERTCheckpoint', type=str, 23 | help='Path to the trained fairseq BERT(RoBERTa) model.') 24 | parser.add_argument('--dict', type=str, 25 | help='Path to the dictionary file (dict.txt) used to train the BERT model' 26 | '(if not speficied, look for dict.txt in the model directory)') 27 | parser.add_argument('--hidden_level', type=int, default=-1, 28 | help="Hidden layer of BERT to extract features from (default: -1, last layer).") 29 | parser.add_argument('--debug', action='store_true', 30 | help="Load only a very small amount of files for " 31 | "debugging purposes.") 32 | parser.add_argument('--cpu', action='store_true', 33 | help="Run on a cpu machine.") 34 | return parser.parse_args(argv) 35 | 36 | def main(argv): 37 | # Args parser 38 | args = parseArgs(argv) 39 | 40 | print("=============================================================") 41 | print(f"Building BERT features from {args.pathQuantizedUnits}") 42 | print("=============================================================") 43 | 44 | # Load input file 45 | print("") 46 | print(f"Reading input file from {args.pathQuantizedUnits}") 47 | seqNames = [] 48 | seqInputs = [] 49 | with open(args.pathQuantizedUnits, 'r') as f: 50 | for line in f: 51 | file_name, file_seq = line.strip().split("\t") 52 | # Convert sequence to the desired input form 53 | file_seq = file_seq.replace(",", " ") 54 | # Add to lists 55 | seqNames.append(file_name) 56 | seqInputs.append(file_seq) 57 | print(f"Found {len(seqNames)} sequences!") 58 | 59 | # Verify the output directory 60 | if os.path.exists(args.pathOutputDir): 61 | existing_files = set([os.path.splitext(os.path.basename(x))[0] 62 | for x in os.listdir(args.pathOutputDir) if x[-4:]==".npy"]) 63 | seqNames = [s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0] not in existing_files] 64 | print(f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!") 65 | else: 66 | print("") 67 | print(f"Creating the output directory at {args.pathOutputDir}") 68 | Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True) 69 | writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args) 70 | 71 | # Debug mode 72 | if args.debug: 73 | nsamples=20 74 | print("") 75 | print(f"Debug mode activated, only load {nsamples} samples!") 76 | # shuffle(seqNames) 77 | seqNames = seqNames[:nsamples] 78 | seqInputs = seqInputs[:nsamples] 79 | 80 | # Load BERT model 81 | if args.dict is None: 82 | pathData = os.path.dirname(args.pathBERTCheckpoint) 83 | else: 84 | pathData = os.path.dirname(args.dict) 85 | assert os.path.exists(os.path.join(pathData, "dict.txt")), \ 86 | f"Dictionary file (dict.txt) not found in {pathData}" 87 | print("") 88 | print(f"Loading RoBERTa model from {args.pathBERTCheckpoint}...") 89 | print(f"Path data {pathData}") 90 | roberta = loadRobertaCheckpoint( 91 | args.pathBERTCheckpoint, 92 | pathData, 93 | from_pretrained=False) 94 | roberta.eval() # disable dropout (or leave in train mode to finetune) 95 | if not args.cpu: 96 | roberta.cuda() 97 | print("Model loaded !") 98 | 99 | # Define BERT_feature_function 100 | def BERT_feature_function(input_sequence, n_hidden=-1): 101 | sentence_tokens = roberta.task.source_dictionary.encode_line( 102 | " " + input_sequence, 103 | append_eos=True, 104 | add_if_not_exist=False).type(torch.LongTensor) 105 | if not args.cpu: 106 | 
sentence_tokens = sentence_tokens.cuda() 107 | 108 | with torch.no_grad(): 109 | outputs = roberta.extract_features(sentence_tokens, return_all_hiddens=True) 110 | 111 | return outputs[n_hidden].squeeze(0).float().cpu().numpy() 112 | 113 | # Building features 114 | print("") 115 | print(f"Building BERT features and saving outputs to {args.pathOutputDir}...") 116 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 117 | bar.start() 118 | start_time = time() 119 | for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)): 120 | bar.update(index) 121 | 122 | # Computing features 123 | BERT_features = BERT_feature_function(input_seq, n_hidden=args.hidden_level) 124 | 125 | # Save the outputs 126 | file_name = os.path.splitext(name_seq)[0] + ".txt" 127 | file_out = os.path.join(args.pathOutputDir, file_name) 128 | np.savetxt(file_out, BERT_features) 129 | bar.finish() 130 | print(f"...done {len(seqNames)} files in {time()-start_time} seconds.") 131 | 132 | if __name__ == "__main__": 133 | args = sys.argv[1:] 134 | main(args) 135 | -------------------------------------------------------------------------------- /scripts/build_CPC_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from time import time 8 | import numpy as np 9 | 10 | from cpc.dataset import findAllSeqs 11 | from cpc.feature_loader import buildFeature, FeatureModule, loadModel 12 | 13 | from utils.utils_functions import writeArgs, loadCPCFeatureMaker 14 | 15 | def parseArgs(argv): 16 | # Run parameters 17 | parser = argparse.ArgumentParser(description='Export CPC features from audio files.') 18 | parser.add_argument('pathCPCCheckpoint', type=str, 19 | help='Path to the CPC checkpoint.') 20 | parser.add_argument('pathDB', type=str, 21 | help='Path to the dataset that we want to quantize.') 22 | parser.add_argument('pathOutputDir', type=str, 23 | help='Path to the output directory.') 24 | parser.add_argument('--file_extension', type=str, default="wav", 25 | help="Extension of the audio files in the dataset (default: wav).") 26 | parser.add_argument('--get_encoded', type=bool, default=False, 27 | help='If True, get the outputs of the encoder layer only (default: False).') 28 | parser.add_argument('--gru_level', type=int, default=-1, 29 | help='Hidden level of the LSTM autoregressive model to be taken' 30 | '(default: -1, last layer).') 31 | parser.add_argument('--max_size_seq', type=int, default=64000, 32 | help='Maximal number of frames to consider in each chunk' 33 | 'when computing CPC features (defaut: 64000).') 34 | parser.add_argument('--seq_norm', type=bool, default=False, 35 | help='If True, normalize the output along the time' 36 | 'dimension to get chunks of mean zero and var 1 (default: False).') 37 | parser.add_argument('--strict', type=bool, default=True, 38 | help='If True, each batch of feature ' 39 | 'will contain exactly max_size_seq frames (defaut: True).') 40 | parser.add_argument('--debug', action='store_true', 41 | help="Load only a very small amount of files for " 42 | "debugging purposes.") 43 | parser.add_argument('--cpu', action='store_true', 44 | help="Run on a cpu machine.") 45 | return parser.parse_args(argv) 46 | 47 | def main(argv): 48 | # Args parser 49 | args = parseArgs(argv) 50 | 51 | print("=============================================================") 52 | print(f"Building CPC features from {args.pathDB}") 53 | 
print("=============================================================") 54 | 55 | # Find all sequences 56 | print("") 57 | print(f"Looking for all {args.file_extension} files in {args.pathDB}") 58 | seqNames, _ = findAllSeqs(args.pathDB, 59 | speaker_level=1, 60 | extension=args.file_extension, 61 | loadCache=True) 62 | if len(seqNames) == 0 or not os.path.splitext(seqNames[0][-1])[1].endswith(args.file_extension): 63 | print(f"Seems like the _seq_cache.txt does not contain the correct extension, reload the file list") 64 | seqNames, _ = findAllSeqs(args.pathDB, 65 | speaker_level=1, 66 | extension=args.file_extension, 67 | loadCache=False) 68 | print(f"Done! Found {len(seqNames)} files!") 69 | 70 | # Verify the output directory 71 | if os.path.exists(args.pathOutputDir): 72 | existing_files = set([os.path.splitext(os.path.basename(x))[0] 73 | for x in os.listdir(args.pathOutputDir) if x[-4:]==".npy"]) 74 | seqNames = [s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0] not in existing_files] 75 | print(f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!") 76 | else: 77 | print("") 78 | print(f"Creating the output directory at {args.pathOutputDir}") 79 | Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True) 80 | writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args) 81 | 82 | # Debug mode 83 | if args.debug: 84 | nsamples=20 85 | print("") 86 | print(f"Debug mode activated, only load {nsamples} samples!") 87 | # shuffle(seqNames) 88 | seqNames = seqNames[:nsamples] 89 | 90 | # Load CPC feature maker 91 | print("") 92 | print(f"Loading CPC featureMaker from {args.pathCPCCheckpoint}") 93 | featureMaker = loadCPCFeatureMaker( 94 | args.pathCPCCheckpoint, 95 | gru_level = args.gru_level, 96 | get_encoded = args.get_encoded, 97 | keep_hidden = True) 98 | featureMaker.eval() 99 | if not args.cpu: 100 | featureMaker.cuda() 101 | print("CPC FeatureMaker loaded!") 102 | 103 | # Define CPC_feature_function 104 | def CPC_feature_function(x): 105 | CPC_features = buildFeature(featureMaker, x, 106 | seqNorm=args.seq_norm, 107 | strict=args.strict, 108 | maxSizeSeq=args.max_size_seq) 109 | return CPC_features.squeeze(0).float().cpu().numpy() 110 | 111 | # Building features 112 | print("") 113 | print(f"Building CPC features and saving outputs to {args.pathOutputDir}...") 114 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 115 | bar.start() 116 | start_time = time() 117 | for index, vals in enumerate(seqNames): 118 | bar.update(index) 119 | 120 | file_path = vals[1] 121 | file_path = os.path.join(args.pathDB, file_path) 122 | 123 | # Computing features 124 | CPC_features = CPC_feature_function(file_path) 125 | 126 | # Save the outputs 127 | file_name = os.path.splitext(os.path.basename(file_path))[0] + ".txt" 128 | file_out = os.path.join(args.pathOutputDir, file_name) 129 | np.savetxt(file_out, CPC_features) 130 | bar.finish() 131 | print(f"...done {len(seqNames)} files in {time()-start_time} seconds.") 132 | 133 | if __name__ == "__main__": 134 | args = sys.argv[1:] 135 | main(args) -------------------------------------------------------------------------------- /scripts/build_LSTM_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import progressbar 6 | from pathlib import Path 7 | from time import time 8 | import numpy as np 9 | from copy import deepcopy 10 | 11 | import torch 12 | 
13 | from utils.utils_functions import writeArgs, loadLSTMLMCheckpoint 14 | 15 | def parseArgs(argv): 16 | # Run parameters 17 | parser = argparse.ArgumentParser(description='Export LSTM features from quantized units of audio files.') 18 | parser.add_argument('pathQuantizedUnits', type=str, 19 | help='Path to the quantized units. Each line of the input file must be' 20 | 'of the form file_name[tab]pseudo_units (ex. hat 1,1,2,3,4,4)') 21 | parser.add_argument('pathOutputDir', type=str, 22 | help='Path to the output directory.') 23 | parser.add_argument('pathLSTMCheckpoint', type=str, 24 | help='Path to the trained fairseq lstm_lm model.') 25 | parser.add_argument('--dict', type=str, 26 | help='Path to the dictionary file (dict.txt) used to train the LSTM LM model' 27 | '(if not speficied, look for dict.txt in the model directory)') 28 | parser.add_argument('--hidden_level', type=int, default=-1, 29 | help="Hidden layer of BERT to extract features from (default: -1, last layer).") 30 | parser.add_argument('--debug', action='store_true', 31 | help="Load only a very small amount of files for " 32 | "debugging purposes.") 33 | parser.add_argument('--cpu', action='store_true', 34 | help="Run on a cpu machine.") 35 | return parser.parse_args(argv) 36 | 37 | def main(argv): 38 | # Args parser 39 | args = parseArgs(argv) 40 | 41 | print("=============================================================") 42 | print(f"Building BERT features from {args.pathQuantizedUnits}") 43 | print("=============================================================") 44 | 45 | # Load input file 46 | print("") 47 | print(f"Reading input file from {args.pathQuantizedUnits}") 48 | seqNames = [] 49 | seqInputs = [] 50 | with open(args.pathQuantizedUnits, 'r') as f: 51 | for line in f: 52 | file_name, file_seq = line.strip().split("\t") 53 | # Convert sequence to the desired input form 54 | file_seq = file_seq.replace(",", " ") 55 | # Add to lists 56 | seqNames.append(file_name) 57 | seqInputs.append(file_seq) 58 | print(f"Found {len(seqNames)} sequences!") 59 | 60 | # Verify the output directory 61 | if os.path.exists(args.pathOutputDir): 62 | existing_files = set([os.path.splitext(os.path.basename(x))[0] 63 | for x in os.listdir(args.pathOutputDir) if x[-4:]==".npy"]) 64 | seqNames = [s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0] not in existing_files] 65 | print(f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!") 66 | else: 67 | print("") 68 | print(f"Creating the output directory at {args.pathOutputDir}") 69 | Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True) 70 | writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args) 71 | 72 | # Debug mode 73 | if args.debug: 74 | nsamples=20 75 | print("") 76 | print(f"Debug mode activated, only load {nsamples} samples!") 77 | # shuffle(seqNames) 78 | seqNames = seqNames[:nsamples] 79 | seqInputs = seqInputs[:nsamples] 80 | 81 | # Load LSTM model 82 | if args.dict is None: 83 | pathData = os.path.dirname(args.pathLSTMCheckpoint) 84 | else: 85 | pathData = os.path.dirname(args.dict) 86 | assert os.path.exists(os.path.join(pathData, "dict.txt")), \ 87 | f"Dictionary file (dict.txt) not found in {pathData}" 88 | print("") 89 | print(f"Loading LSTM model from {args.pathLSTMCheckpoint}...") 90 | print(f"Path data {pathData}") 91 | model, task = loadLSTMLMCheckpoint( 92 | args.pathLSTMCheckpoint, 93 | pathData) 94 | model.eval() # disable dropout (or leave in train mode to 
finetune) 95 | if not args.cpu: 96 | model.cuda() 97 | print("Model loaded !") 98 | 99 | # Define LSTM_feature_function 100 | def LSTM_feature_function(input_sequence, n_hidden=-1): 101 | # Get the number of layers 102 | num_layers = len(model.decoder.layers) 103 | assert abs(n_hidden) <= num_layers, \ 104 | "absolute value of n_hidden must be less than or equal to the number of hidden layers = {}".format(num_layers) 105 | 106 | if n_hidden < 0: 107 | n_hidden = num_layers + 1 + n_hidden 108 | 109 | # Get input tensor 110 | input_tensor = task.source_dictionary.encode_line( 111 | " " + input_sequence, 112 | append_eos=True, 113 | add_if_not_exist=False).type(torch.LongTensor).unsqueeze(0) 114 | if not args.cpu: 115 | input_tensor = input_tensor.cuda() 116 | 117 | # Get the output 118 | if n_hidden == 0: # Take the embedding layer 119 | with torch.no_grad(): 120 | output_tensor = model.decoder.embed_tokens(input_tensor) 121 | 122 | else: 123 | decoder_clone = deepcopy(model.decoder) 124 | 125 | # We don't take the final fc features 126 | decoder_clone.fc_out = torch.nn.Identity() 127 | decoder_clone.additional_fc = torch.nn.Identity() 128 | 129 | # Restrict the number of hiddden layers to n_hidden 130 | decoder_clone.layers = decoder_clone.layers[:n_hidden] 131 | 132 | with torch.no_grad(): 133 | output_tensor = decoder_clone(input_tensor)[0] 134 | 135 | return output_tensor[0].data.cpu().numpy() 136 | 137 | # Building features 138 | print("") 139 | print(f"Building LSTM features and saving outputs to {args.pathOutputDir}...") 140 | bar = progressbar.ProgressBar(maxval=len(seqNames)) 141 | bar.start() 142 | start_time = time() 143 | for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)): 144 | bar.update(index) 145 | 146 | # Computing features 147 | LSTM_features = LSTM_feature_function(input_seq, n_hidden=args.hidden_level) 148 | 149 | # Save the outputs 150 | file_name = os.path.splitext(name_seq)[0] + ".txt" 151 | file_out = os.path.join(args.pathOutputDir, file_name) 152 | np.savetxt(file_out, LSTM_features) 153 | bar.finish() 154 | print(f"...done {len(seqNames)} files in {time()-start_time} seconds.") 155 | 156 | if __name__ == "__main__": 157 | args = sys.argv[1:] 158 | main(args) 159 | -------------------------------------------------------------------------------- /scripts/compute_proba_BERT.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from os.path import exists, join, basename, dirname, abspath 3 | import sys 4 | import argparse 5 | 6 | from utils.utils_functions import loadRobertaCheckpoint 7 | from utils.lm_scoring import compute_proba_BERT_mlm_span 8 | 9 | def parseArgs(argv): 10 | # Run parameters 11 | parser = argparse.ArgumentParser(description='Compute pseudo log-probabilities of quantized units with a trained BERT model.') 12 | parser.add_argument('pathQuantizedUnits', type=str, 13 | help='Path to the quantized units. Each line of the input file must be' 14 | 'of the form file_name[tab]pseudo_units (ex. 
hat 1,1,2,3,4,4)') 15 | parser.add_argument('pathOutputFile', type=str, 16 | help='Path to the output file containing scores.') 17 | parser.add_argument('pathBERTCheckpoint', type=str, 18 | help='Path to the trained fairseq BERT(RoBERTa) model.') 19 | parser.add_argument('--dict', type=str, 20 | help='Path to the dictionary file (dict.txt) used to train the BERT model' 21 | '(if not speficied, look for dict.txt in the model directory)') 22 | parser.add_argument('--decoding_span_size', type=int, default=15, 23 | help='The decoding span size (M_d) parameter used to compute' 24 | 'the pseudo-probability (default: 15).') 25 | parser.add_argument('--temporal_sliding_size', type=int, default=5, 26 | help='The temporal sliding size (Delta_t) parameter used to' 27 | 'compute the pseudo-probability (defaut: 5).') 28 | parser.add_argument('--no_overlap', action="store_true", 29 | help='If specified, not overlap the masking spans when computing the' 30 | 'pseudo-probability (temporal_sliding_size is set to decoding_span_size)') 31 | parser.add_argument('--batchsen_size', type=int, default=32, 32 | help='The number of sentences to be considered in each outer batch' 33 | '(batch of sentences) (defaut: 32). Decrease this for longer sentences (BLIMP).') 34 | parser.add_argument('--inner_batch_size', type=int, default=128, 35 | help='For each sentence, the model has to compute the outputs of many different' 36 | 'masked sequences. This parameter controls the size of the inner batches for' 37 | 'each outer batch (defaut: 128). Decrease this for longer sentences (BLIMP).') 38 | parser.add_argument('--cpu', action='store_true', 39 | help="Run on a cpu machine.") 40 | parser.add_argument('--resume', action='store_true', 41 | help="Continue to compute score if the output file already exists.") 42 | return parser.parse_args(argv) 43 | 44 | def main(argv): 45 | # Args parser 46 | args = parseArgs(argv) 47 | 48 | # Convert to absolute paths to get rid of exceptions 49 | args.pathQuantizedUnits = abspath(args.pathQuantizedUnits) 50 | args.pathOutputFile = abspath(args.pathOutputFile) 51 | args.pathBERTCheckpoint = abspath(args.pathBERTCheckpoint) 52 | if args.dict is not None: 53 | args.dict = abspath(args.dict) 54 | 55 | # Load input file 56 | print("") 57 | print(f"Reading input file from {args.pathQuantizedUnits}") 58 | input_file_names = [] 59 | intput_file_seqs = [] 60 | with open(args.pathQuantizedUnits, 'r') as f: 61 | for line in f: 62 | file_name, file_seq = line.strip().split("\t") 63 | # Convert sequence to the desired input form 64 | file_seq = file_seq.replace(",", " ") 65 | # Add to lists 66 | input_file_names.append(file_name) 67 | intput_file_seqs.append(file_seq) 68 | print(f"Found {len(input_file_names)} sequences!") 69 | 70 | # Check if directory exists 71 | pathOutputDir = dirname(args.pathOutputFile) 72 | if pathOutputDir and not exists(pathOutputDir): 73 | print("") 74 | print(f"Creating the output directory at {pathOutputDir}") 75 | Path(pathOutputDir).mkdir(parents=True, exist_ok=True) 76 | # writeArgs(join(pathOutputDir, "_info_args.json"), args) 77 | 78 | # Continue 79 | if args.resume: 80 | if exists(args.pathOutputFile): 81 | existing_file_names = [] 82 | with open(args.pathOutputFile, 'r') as f: 83 | lines = [line for line in f] 84 | for line in lines: 85 | file_name, score = line.strip().split() 86 | existing_file_names.append(file_name) 87 | assert input_file_names[:len(existing_file_names)] == existing_file_names, \ 88 | "The file names in the existing output file do not match the 
89 |             input_file_names = input_file_names[len(existing_file_names):]
90 |             input_file_seqs = input_file_seqs[len(existing_file_names):]
91 |             print(f"Found existing output file, continue to compute scores of {len(input_file_seqs)} sequences left!")
92 |     else:
93 |         assert not exists(args.pathOutputFile), \
94 |             f"Output file {args.pathOutputFile} already exists!!! If you want to continue computing scores, please check the --resume option."
95 | 
96 |     assert len(input_file_seqs) > 0, \
97 |         "No file to compute probability!"
98 | 
99 |     # Load BERT model
100 |     if args.dict is None:
101 |         pathData = dirname(args.pathBERTCheckpoint)
102 |     else:
103 |         pathData = dirname(args.dict)
104 |     assert exists(join(pathData, "dict.txt")), \
105 |         f"Dictionary file (dict.txt) not found in {pathData}"
106 |     print("")
107 |     print(f"Loading RoBERTa model from {args.pathBERTCheckpoint}...")
108 |     print(f"Path data {pathData}")
109 |     roberta = loadRobertaCheckpoint(
110 |         args.pathBERTCheckpoint,
111 |         pathData,
112 |         from_pretrained=False)
113 |     roberta.eval()  # disable dropout (or leave in train mode to finetune)
114 |     print("Model loaded!")
115 | 
116 |     # Run and save outputs
117 |     print("")
118 |     print(f"Computing log-probabilities and saving results to {args.pathOutputFile}...")
119 |     _ = compute_proba_BERT_mlm_span(
120 |         input_file_seqs, roberta, tokenized=True,
121 |         decoding_span_size=args.decoding_span_size, temporal_sliding_size=args.temporal_sliding_size,
122 |         span_overlap=not args.no_overlap,
123 |         batchsen_size=args.batchsen_size, inner_batch_size=args.inner_batch_size,
124 |         gpu=not args.cpu, print_tokens=False, verbose=False, print_shape_statistics=False,
125 |         save_to=args.pathOutputFile, file_names=input_file_names)
126 | 
127 | if __name__ == "__main__":
128 |     args = sys.argv[1:]
129 |     main(args)
130 | 
--------------------------------------------------------------------------------
/scripts/compute_proba_LSTM.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from os.path import exists, join, basename, dirname, abspath
3 | import sys
4 | import argparse
5 | 
6 | from utils.utils_functions import loadLSTMLMCheckpoint
7 | from utils.lm_scoring import compute_proba_LSTM
8 | 
9 | def parseArgs(argv):
10 |     # Run parameters
11 |     parser = argparse.ArgumentParser(description='Compute pseudo log-probabilities of quantized units with a trained LSTM model.')
12 |     parser.add_argument('pathQuantizedUnits', type=str,
13 |                         help='Path to the quantized units. Each line of the input file must be '
14 |                         'of the form file_name[tab]pseudo_units (ex. hat 1,1,2,3,4,4)')
15 |     parser.add_argument('pathOutputFile', type=str,
16 |                         help='Path to the output file containing scores.')
17 |     parser.add_argument('pathLSTMCheckpoint', type=str,
18 |                         help='Path to the trained fairseq LSTM model.')
19 |     parser.add_argument('--dict', type=str,
20 |                         help='Path to the dictionary file (dict.txt) used to train the LSTM model '
21 |                         '(if not specified, look for dict.txt in the model directory)')
22 |     parser.add_argument('--batchSize', type=int, default=128,
23 |                         help='The number of sentences to be in each batch (default: 128)')
24 |     parser.add_argument('--cpu', action='store_true',
25 |                         help="Run on a cpu machine.")
26 |     parser.add_argument('--resume', action='store_true',
27 |                         help="Continue to compute score if the output file already exists.")
28 |     return parser.parse_args(argv)
29 | 
30 | def main(argv):
31 |     # Args parser
32 |     args = parseArgs(argv)
33 | 
34 |     # Convert to absolute paths to get rid of exceptions
35 |     args.pathQuantizedUnits = abspath(args.pathQuantizedUnits)
36 |     args.pathOutputFile = abspath(args.pathOutputFile)
37 |     args.pathLSTMCheckpoint = abspath(args.pathLSTMCheckpoint)
38 |     if args.dict is not None:
39 |         args.dict = abspath(args.dict)
40 | 
41 |     # Load input file
42 |     print("")
43 |     print(f"Reading input file from {args.pathQuantizedUnits}")
44 |     input_file_names = []
45 |     input_file_seqs = []
46 |     with open(args.pathQuantizedUnits, 'r') as f:
47 |         for line in f:
48 |             file_name, file_seq = line.strip().split("\t")
49 |             # Convert sequence to the desired input form
50 |             file_seq = file_seq.replace(",", " ")
51 |             # Add to lists
52 |             input_file_names.append(file_name)
53 |             input_file_seqs.append(file_seq)
54 |     print(f"Found {len(input_file_names)} sequences!")
55 | 
56 |     # Check if directory exists
57 |     pathOutputDir = dirname(args.pathOutputFile)
58 |     if pathOutputDir and not exists(pathOutputDir):
59 |         print("")
60 |         print(f"Creating the output directory at {pathOutputDir}")
61 |         Path(pathOutputDir).mkdir(parents=True, exist_ok=True)
62 |     # writeArgs(join(pathOutputDir, "_info_args.json"), args)
63 | 
64 |     # Continue
65 |     if args.resume:
66 |         if exists(args.pathOutputFile):
67 |             existing_file_names = []
68 |             with open(args.pathOutputFile, 'r') as f:
69 |                 lines = [line for line in f]
70 |             for line in lines:
71 |                 file_name, score = line.strip().split()
72 |                 existing_file_names.append(file_name)
73 |             assert input_file_names[:len(existing_file_names)] == existing_file_names, \
74 |                 "The file names in the existing output file do not match the input file!!"
75 |             input_file_names = input_file_names[len(existing_file_names):]
76 |             input_file_seqs = input_file_seqs[len(existing_file_names):]
77 |             print(f"Found existing output file, continue to compute scores of {len(input_file_seqs)} sequences left!")
78 |     else:
79 |         assert not exists(args.pathOutputFile), \
80 |             f"Output file {args.pathOutputFile} already exists!!! If you want to continue computing scores, please check the --resume option."
81 | 
82 |     # Load LSTM model
83 |     if args.dict is None:
84 |         pathData = dirname(args.pathLSTMCheckpoint)
85 |     else:
86 |         pathData = dirname(args.dict)
87 |     assert exists(join(pathData, "dict.txt")), \
88 |         f"Dictionary file (dict.txt) not found in {pathData}"
89 |     print("")
90 |     print(f"Loading LSTM model from {args.pathLSTMCheckpoint}...")
91 |     print(f"Path data {pathData}")
92 |     model, task = loadLSTMLMCheckpoint(args.pathLSTMCheckpoint, pathData)
93 |     model.eval()
94 |     print("Model loaded!")
95 | 
96 |     # Run and save outputs
97 |     print("")
98 |     print(f"Computing log-probabilities and saving results to {args.pathOutputFile}...")
99 |     _ = compute_proba_LSTM(
100 |         input_file_seqs, model, task,
101 |         batch_size=args.batchSize, gpu=not args.cpu,
102 |         verbose=False, print_tokens=False,
103 |         save_to=args.pathOutputFile, file_names=input_file_names)
104 | 
105 | if __name__ == "__main__":
106 |     args = sys.argv[1:]
107 |     main(args)
--------------------------------------------------------------------------------
/scripts/utils/utils_functions.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 | 
4 | import torch
5 | from cpc.feature_loader import FeatureModule, loadModel
6 | from cpc.criterion.clustering import kMeanCluster
7 | 
8 | #from fairseq import tasks, checkpoint_utils
9 | #from fairseq.models.roberta import RobertaModel, RobertaHubInterface
10 | 
11 | def readArgs(pathArgs):
12 |     print(f"Loading args from {pathArgs}")
13 |     with open(pathArgs, 'r') as file:
14 |         args = argparse.Namespace(**json.load(file))
15 |     return args
16 | 
17 | def writeArgs(pathArgs, args):
18 |     print(f"Writing args to {pathArgs}")
19 |     with open(pathArgs, 'w') as file:
20 |         json.dump(vars(args), file, indent=2)
21 | 
22 | def loadCPCFeatureMaker(pathCheckpoint, gru_level=-1, get_encoded=False, keep_hidden=True, load_nullspace=False):
23 |     """
24 |     Load CPC Feature Maker from CPC checkpoint file.
25 |     """
26 |     # Set LSTM level
27 |     if gru_level is not None and gru_level > 0:
28 |         updateConfig = argparse.Namespace(nLevelsGRU=gru_level)
29 |     else:
30 |         updateConfig = None
31 | 
32 |     # Load CPC model
33 |     model, nHiddenGar, nHiddenEncoder = loadModel([pathCheckpoint], updateConfig=updateConfig, load_nullspace=load_nullspace)
34 | 
35 |     # Keep hidden units at LSTM layers on sequential batches
36 |     if load_nullspace:
37 |         model.cpc.gAR.keepHidden = keep_hidden
38 |     else:
39 |         model.gAR.keepHidden = keep_hidden
40 | 
41 |     # Build CPC Feature Maker from CPC model
42 |     featureMaker = FeatureModule(model, get_encoded=get_encoded)
43 | 
44 |     return featureMaker
45 | 
46 | def loadClusterModule(pathCheckpoint, norm_vec_len=False):
47 |     """
48 |     Load CPC Clustering Module from Clustering checkpoint file.
49 |     """
50 |     state_dict = torch.load(pathCheckpoint, map_location=torch.device('cpu'))
51 |     clusterModule = kMeanCluster(torch.zeros(1, state_dict["n_clusters"], state_dict["dim"]), norm_vec_len=norm_vec_len)
52 |     clusterModule.load_state_dict(state_dict["state_dict"])
53 |     return clusterModule
54 | 
55 | #def loadRobertaCheckpoint(pathBERTCheckpoint, pathData, from_pretrained=False):
56 | #    """
57 | #    Load Roberta model from checkpoint.
58 | #    If loading a pretrained model from fairseq, set from_pretrained=True.
59 | #    """
60 | #    if from_pretrained: # Require connection to download bpe, possible errors for trained checkpoint that contains cfg
61 | #        roberta = RobertaModel.from_pretrained(dirname(pathBERTCheckpoint), basename(pathBERTCheckpoint), pathData)
62 | #    else:
63 | #        # Set up the args Namespace
64 | #        model_args = argparse.Namespace(
65 | #            task='masked_lm',
66 | #            seed=-1,
67 | #            output_dictionary_size=-1,
68 | #            data=pathData,
69 | #            path=pathBERTCheckpoint
70 | #            )
71 | #
72 | #        # Setup task
73 | #        task = tasks.setup_task(model_args)
74 | #
75 | #        # Load model
76 | #        models, _model_args = checkpoint_utils.load_model_ensemble([model_args.path], task=task)
77 | #        model = models[0]
78 | #
79 | #        # Wrap-up to RobertaHubInterface (to be consistent with RobertaModel.from_pretrained)
80 | #        roberta = RobertaHubInterface(_model_args, task, model)
81 | #
82 | #    return roberta
83 | 
84 | #def loadLSTMLMCheckpoint(pathLSTMCheckpoint, pathData):
85 | #    """
86 | #    Load lstm_lm model from checkpoint.
87 | #    """
88 | #    # Set up the args Namespace
89 | #    model_args = argparse.Namespace(
90 | #        task='language_modeling',
91 | #        output_dictionary_size=-1,
92 | #        data=pathData,
93 | #        path=pathLSTMCheckpoint
94 | #    )
95 | #
96 | #    # Setup task
97 | #    task = tasks.setup_task(model_args)
98 | #
99 | #    # Load model
100 | #    models, _model_args = checkpoint_utils.load_model_ensemble([model_args.path], task=task)
101 | #    model = models[0]
102 | #
103 | #    return model, task
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | from setuptools import setup, find_packages
6 | from setuptools.extension import Extension
7 | from Cython.Build import cythonize
8 | import numpy
9 | 
10 | extensions = [
11 |     Extension(
12 |         "cpc.eval.ABX.dtw",
13 |         ["cpc/eval/ABX/dtw.pyx"],
14 |         include_dirs=[numpy.get_include()],
15 |     ),
16 | ]
17 | 
18 | setup(
19 |     name='CPC_audio',
20 |     version='1.0',
21 |     description='An implementation of the contrastive predictive coding (CPC) '
22 |                 'training method for audio data.',
23 |     author='Facebook AI Research',
24 |     packages=find_packages(),
25 |     classifiers=["License :: OSI Approved :: MIT License",
26 |                  "Intended Audience :: Science/Research",
27 |                  "Topic :: Scientific/Engineering",
28 |                  "Programming Language :: Python"],
29 |     ext_modules=cythonize(extensions, language_level="3")
30 | )
31 | 
--------------------------------------------------------------------------------
/train_ls100.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e
4 | set -x
5 | 
6 | RVERB="-v --dry-run"
7 | RVERB=""
8 | CPC_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
9 | SAVE_DIR="$(
10 | python - "$@" << END
11 | if 1:
12 |     import argparse
13 |     parser = argparse.ArgumentParser(description='Extract the --pathCheckpoint argument.')
14 |     parser.add_argument('--pathCheckpoint')
15 |     args, _ = parser.parse_known_args()
16 |     print(args.pathCheckpoint)
17 | END
18 | )"
19 | 
20 | mkdir -p ${SAVE_DIR}/code
21 | rsync --exclude '.*' \
22 |     --exclude data \
23 |     --exclude pretrained_models \
24 |     --exclude '__pycache__' \
25 |     --exclude '*runs*' \
26 |     --exclude '*.pyc' \
27 |     --exclude '*.ipynb' \
28 |     --filter=':- .gitignore' \
29 |     $RVERB -lrpt $CPC_DIR/ ${SAVE_DIR}/code/
30 | 
31 | echo $0 "$@" >> ${SAVE_DIR}/out.txt
32 | exec python -u cpc/train.py \
33 |     --pathDB /pio/data/zerospeech2021/LibriSpeech-wav/train-clean-100 \
34 |     --pathTrain /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/train_split.txt \
35 |     --pathVal /pio/scratch/2/jch/wav2vec/LibriSpeech100_labels_split/test_split.txt \
36 |     --file_extension .wav \
37 |     "$@" 2>&1 | tee -ai ${SAVE_DIR}/out.txt
--------------------------------------------------------------------------------
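Usage sketch (illustrative only, not part of the repository files above; all paths are hypothetical placeholders): train_ls100.sh parses only --pathCheckpoint, snapshots the code tree and log there, and forwards every argument to cpc/train.py, while scripts/compute_proba_LSTM.py expects a quantized-units file, an output path, and a fairseq LSTM checkpoint, so minimal invocations might look like the commands below. Note that the scoring scripts import loadRobertaCheckpoint / loadLSTMLMCheckpoint, which are commented out in scripts/utils/utils_functions.py, so fairseq must be installed and those helpers re-enabled before the second command will run.

    # Hypothetical paths; any extra cpc/train.py flags can be appended after --pathCheckpoint.
    bash train_ls100.sh --pathCheckpoint checkpoints/cpc_ls100

    # Hypothetical paths; scores one pseudo-units file with a trained LSTM language model.
    python scripts/compute_proba_LSTM.py quantized/dev.txt scores/dev_lstm.txt \
        checkpoints/lstm_lm/checkpoint_best.pt --dict checkpoints/lstm_lm/dict.txt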