├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── clpcnet ├── __init__.py ├── __main__.py ├── assets │ ├── checkpoints │ │ ├── model.h5 │ │ └── original.h5 │ ├── partition │ │ ├── daps-segmented.json │ │ ├── ravdess-hifi.json │ │ ├── ravdess-variable.json │ │ └── vctk.json │ └── text │ │ └── ravdess │ │ ├── 01.txt │ │ └── 02.txt ├── config.py ├── convert.py ├── core.py ├── data.py ├── evaluate │ ├── __init__.py │ ├── dtw.py │ ├── duration.py │ ├── gather.py │ ├── objective │ │ ├── __init__.py │ │ ├── constant.py │ │ └── variable.py │ ├── pitch.py │ ├── plot.py │ ├── prosody.py │ └── subjective │ │ ├── __init__.py │ │ ├── constant.py │ │ └── variable.py ├── load.py ├── loudness.py ├── model.py ├── mp3.py ├── partition.py ├── pitch.py ├── preprocess │ ├── __init__.py │ ├── __main__.py │ ├── augment.py │ └── core.py ├── session.py ├── train.py └── world.py ├── data └── .gitkeep ├── requirements.txt ├── runs ├── cache │ └── .gitkeep ├── checkpoints │ └── .gitkeep ├── eval │ └── .gitkeep └── log │ └── .gitkeep ├── setup.py └── src ├── _kiss_fft_guts.h ├── arch.h ├── celt_lpc.c ├── celt_lpc.h ├── common.h ├── freq.c ├── freq.h ├── kiss_fft.c ├── kiss_fft.h ├── lpcnet.h ├── lpcnet_enc.c ├── lpcnet_private.h ├── opus_types.h ├── pitch.c ├── pitch.h └── preprocess.c /.dockerignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | .git/ 3 | bin/ 4 | data/ 5 | runs/ 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Directories 2 | *.egg-info/ 3 | __pycache__/ 4 | .ipynb_checkpoints/ 5 | .vscode/ 6 | data/ 7 | !data/.gitkeep 8 | htk/ 9 | notebooks/ 10 | runs/ 11 | !runs/*/.gitkeep 12 | bin/ 13 | packages/ 14 | !clpcnet/assets/ 15 | 16 | # Metadata 17 | .DS_Store 18 | ._.DS_Store 19 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04 2 | 3 | # Apt dependencies 4 | RUN apt-get update && \ 5 | apt-get install -y \ 6 | ffmpeg \ 7 | gcc-multilib \ 8 | libsndfile1 \ 9 | make \ 10 | sox \ 11 | wget 12 | 13 | # Conda setup (from continuumio/miniconda3 image) 14 | ENV PATH /opt/conda/bin:$PATH 15 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ 16 | mkdir -p /opt && \ 17 | sh miniconda.sh -b -p /opt/conda && \ 18 | rm miniconda.sh && \ 19 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 20 | echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 21 | find /opt/conda/ -follow -type f -name '*.a' -delete && \ 22 | find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ 23 | /opt/conda/bin/conda clean -afy && \ 24 | /opt/conda/bin/conda create -n clpcnet python=3.7 -y && \ 25 | echo "conda activate clpcnet" >> ~/.bashrc 26 | 27 | # Make RUN commands use the new environment 28 | SHELL ["conda", "run", "-n", "clpcnet", "/bin/bash", "-c"] 29 | 30 | # Conda environment setup 31 | RUN conda install -c anaconda cudatoolkit=10.0 cudnn=7.6 -y 32 | 33 | # Allow users to specify a directory for HTK 34 | ARG HTK=htk 35 | 36 | # Setup htk 37 | COPY $HTK /htk 38 | WORKDIR /htk 39 | RUN ./configure --disable-hslab && make all && make install 40 | 41 | # Copy python setup files 42 | COPY requirements.txt /clpcnet/requirements.txt 43 | 44 | # Install python dependencies 45 | WORKDIR /clpcnet 46 | RUN pip install -r requirements.txt 47 | 48 | # Copy C preprocessing code 49 | COPY Makefile /clpcnet/Makefile 50 | COPY src /clpcnet/src 51 | 52 | # Build C preprocessing code 53 | RUN make 54 | 55 | # Copy module 56 | COPY README.md /clpcnet/README.md 57 | COPY setup.py /clpcnet/setup.py 58 | COPY clpcnet /clpcnet/clpcnet 59 | 60 | # Install module 61 | RUN pip install -e . 62 | 63 | # Start bash shell when run 64 | CMD ["bash"] 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ADOBE RESEARCH LICENSE 2 | 3 | This license (the "License") between Adobe Inc., having a place of business at 345 Park Avenue, San Jose, California 95110-2704 ("Adobe"), and you, the individual or entity exercising rights under this License ("you" or "your"), sets forth the terms for your use of certain research materials that are owned by Adobe (the "Licensed Materials"). By exercising rights under this License, you accept and agree to be bound by its terms. If you are exercising rights under this license on behalf of an entity, then "you" means you and such entity, and you (personally) represent and warrant that you (personally) have all necessary authority to bind that entity to the terms of this License. 4 | 5 | 1. GRANT OF LICENSE. 6 | 7 | 1.1 Adobe grants you a nonexclusive, worldwide, royalty-free, fully paid license to (A) reproduce, use, modify, and publicly display and perform the Licensed Materials for noncommercial research purposes only; and (B) redistribute the Licensed Materials, and modifications or derivative works thereof, for noncommercial research purposes only, provided that you give recipients a copy of this License. 8 | 9 | 1.2 You may add your own copyright statement to your modifications and may provide additional or different license terms for use, reproduction, modification, public display and performance, and redistribution of your modifications and derivative works, provided that such license terms limit the use, reproduction, modification, public display and performance, and redistribution of such modifications and derivative works to noncommercial research purposes only. 10 | 11 | 1.3 For purposes of this License, noncommercial research purposes include academic research, teaching, and testing, but do not include commercial licensing or distribution, development of commercial products, or any other activity which results in commercial gain. 12 | 13 | 2. OWNERSHIP AND ATTRIBUTION. Adobe and its licensors own all right, title, and interest in the Licensed Materials. 
You must keep intact any copyright or other notices or disclaimers in the Licensed Materials. 14 | 15 | 3. DISCLAIMER OF WARRANTIES. THE LICENSED MATERIALS ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. THE ENTIRE RISK AS TO THE RESULTS AND PERFORMANCE OF THE LICENSED MATERIALS IS ASSUMED BY YOU. ADOBE DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED OR STATUTORY, WITH REGARD TO ANY LICENSED MATERIALS PROVIDED UNDER THIS LICENSE, INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT OF THIRD-PARTY RIGHTS. 16 | 17 | 4. LIMITATION OF LIABILITY. IN NO EVENT WILL ADOBE BE LIABLE FOR ANY ACTUAL, INCIDENTAL, SPECIAL OR CONSEQUENTIAL DAMAGES OF ANY NATURE WHATSOEVER, INCLUDING WITHOUT LIMITATION, LOSS OF PROFITS OR OTHER COMMERCIAL LOSS, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ANY LICENSED MATERIALS PROVIDED UNDER THIS LICENSE, EVEN IF ADOBE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 18 | 19 | 5. TERM AND TERMINATION. 20 | 21 | 5.1 The License is effective upon acceptance by you and will remain in effect unless terminated earlier as permitted under this License. 22 | 23 | 5.2 If you breach any material provision of this License, then your rights will terminate immediately. 24 | 25 | 5.3 All clauses which by their nature should survive the termination of this License will survive such termination. In addition, and without limiting the generality of the preceding sentence, Sections 2 (Ownership and Attribution), 3 (Disclaimer of Warranties), 4 (Limitation of Liability) will survive termination of this License. 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | files = src/celt_lpc.c src/freq.c src/kiss_fft.c src/lpcnet_enc.c src/pitch.c src/preprocess.c 2 | 3 | all: preprocess 4 | 5 | preprocess: 6 | mkdir -p bin/ 7 | gcc -Wall -W -O3 -g -I src/ $(files) -o bin/preprocess -lm 8 | 9 | clean: 10 | rm -rf bin/preprocess 11 | -------------------------------------------------------------------------------- /clpcnet/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # FATAL 5 | logging.getLogger('tensorflow').setLevel(logging.FATAL) 6 | 7 | from .config import * 8 | from .core import * 9 | from .model import DualDense, model 10 | from .session import Session 11 | from . import convert 12 | from . import data 13 | from . import evaluate 14 | from . import load 15 | from . import loudness 16 | from . import mp3 17 | from . import partition 18 | from . import pitch 19 | from . import preprocess 20 | from . 
import train 21 | -------------------------------------------------------------------------------- /clpcnet/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import clpcnet 5 | 6 | 7 | ############################################################################### 8 | # Entry point 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser() 15 | 16 | # Audio I/O 17 | parser.add_argument( 18 | '--audio_files', 19 | type=Path, 20 | nargs='+', 21 | help='The audio files to process') 22 | parser.add_argument( 23 | '--output_files', 24 | type=Path, 25 | nargs='+', 26 | required=True, 27 | help='The files to write the output audio') 28 | 29 | # Time-stretching 30 | parser.add_argument( 31 | '--source_alignment_files', 32 | type=Path, 33 | nargs='+', 34 | help='The original alignments on disk. Used only for time-stretching.') 35 | parser.add_argument( 36 | '--target_alignment_files', 37 | type=Path, 38 | nargs='+', 39 | help='The target alignments on disk. Used only for time-stretching.') 40 | parser.add_argument( 41 | '--constant_stretch', 42 | type=float, 43 | help='A constant value for time-stretching') 44 | 45 | # Pitch shifting 46 | parser.add_argument( 47 | '--source_pitch_files', 48 | type=Path, 49 | nargs='+', 50 | help='The file containing the original pitch contours') 51 | parser.add_argument( 52 | '--source_periodicity_files', 53 | type=Path, 54 | nargs='+', 55 | help='The file containing the original periodicities') 56 | parser.add_argument( 57 | '--target_pitch_files', 58 | type=Path, 59 | nargs='+', 60 | help='The files containing the desired pitch contours') 61 | parser.add_argument( 62 | '--constant_shift', 63 | type=float, 64 | help='A constant value for pitch-shifting') 65 | 66 | # Model checkpoint 67 | parser.add_argument( 68 | '--checkpoint_file', 69 | type=Path, 70 | default=clpcnet.DEFAULT_CHECKPOINT, 71 | help='The checkpoint file to load') 72 | 73 | return parser.parse_args() 74 | 75 | 76 | if __name__ == '__main__': 77 | clpcnet.from_files_to_files(**vars(parse_args())) 78 | -------------------------------------------------------------------------------- /clpcnet/assets/checkpoints/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/clpcnet/assets/checkpoints/model.h5 -------------------------------------------------------------------------------- /clpcnet/assets/checkpoints/original.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/clpcnet/assets/checkpoints/original.h5 -------------------------------------------------------------------------------- /clpcnet/assets/partition/daps-segmented.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": [ 3 | "f1_script1_clean-00007", 4 | "f1_script1_clean-00003", 5 | "f1_script1_clean-00025", 6 | "f1_script1_clean-00019", 7 | "f1_script1_clean-00006", 8 | "f1_script1_clean-00014", 9 | "f1_script1_clean-00033", 10 | "f1_script1_clean-00009", 11 | "f1_script1_clean-00013", 12 | "f1_script1_clean-00012", 13 | "f3_script1_clean-00031", 14 | "f3_script1_clean-00034", 15 | 
"f3_script1_clean-00025", 16 | "f3_script1_clean-00028", 17 | "f3_script1_clean-00032", 18 | "f3_script1_clean-00006", 19 | "f3_script1_clean-00019", 20 | "f3_script1_clean-00004", 21 | "f3_script1_clean-00021", 22 | "f3_script1_clean-00014", 23 | "f4_script1_clean-00033", 24 | "f4_script1_clean-00005", 25 | "f4_script1_clean-00016", 26 | "f4_script1_clean-00010", 27 | "f4_script1_clean-00003", 28 | "f4_script1_clean-00029", 29 | "f4_script1_clean-00034", 30 | "f4_script1_clean-00011", 31 | "f4_script1_clean-00000", 32 | "f4_script1_clean-00019", 33 | "f5_script1_clean-00003", 34 | "f5_script1_clean-00021", 35 | "f5_script1_clean-00014", 36 | "f5_script1_clean-00015", 37 | "f5_script1_clean-00007", 38 | "f5_script1_clean-00013", 39 | "f5_script1_clean-00020", 40 | "f5_script1_clean-00032", 41 | "f5_script1_clean-00022", 42 | "f5_script1_clean-00000", 43 | "f6_script1_clean-00027", 44 | "f6_script1_clean-00031", 45 | "f6_script1_clean-00007", 46 | "f6_script1_clean-00005", 47 | "f6_script1_clean-00001", 48 | "f6_script1_clean-00016", 49 | "f6_script1_clean-00008", 50 | "f6_script1_clean-00012", 51 | "f6_script1_clean-00011", 52 | "f6_script1_clean-00020", 53 | "m1_script1_clean-00010", 54 | "m1_script1_clean-00018", 55 | "m1_script1_clean-00026", 56 | "m1_script1_clean-00017", 57 | "m1_script1_clean-00032", 58 | "m1_script1_clean-00016", 59 | "m1_script1_clean-00029", 60 | "m1_script1_clean-00033", 61 | "m1_script1_clean-00006", 62 | "m1_script1_clean-00001", 63 | "m3_script1_clean-00028", 64 | "m3_script1_clean-00007", 65 | "m3_script1_clean-00010", 66 | "m3_script1_clean-00002", 67 | "m3_script1_clean-00025", 68 | "m3_script1_clean-00024", 69 | "m3_script1_clean-00032", 70 | "m3_script1_clean-00000", 71 | "m3_script1_clean-00019", 72 | "m3_script1_clean-00033", 73 | "m4_script1_clean-00029", 74 | "m4_script1_clean-00018", 75 | "m4_script1_clean-00006", 76 | "m4_script1_clean-00021", 77 | "m4_script1_clean-00034", 78 | "m4_script1_clean-00004", 79 | "m4_script1_clean-00008", 80 | "m4_script1_clean-00028", 81 | "m4_script1_clean-00022", 82 | "m4_script1_clean-00005", 83 | "m5_script1_clean-00014", 84 | "m5_script1_clean-00003", 85 | "m5_script1_clean-00002", 86 | "m5_script1_clean-00027", 87 | "m5_script1_clean-00015", 88 | "m5_script1_clean-00022", 89 | "m5_script1_clean-00021", 90 | "m5_script1_clean-00016", 91 | "m5_script1_clean-00013", 92 | "m5_script1_clean-00017", 93 | "m6_script1_clean-00031", 94 | "m6_script1_clean-00033", 95 | "m6_script1_clean-00009", 96 | "m6_script1_clean-00010", 97 | "m6_script1_clean-00022", 98 | "m6_script1_clean-00016", 99 | "m6_script1_clean-00029", 100 | "m6_script1_clean-00012", 101 | "m6_script1_clean-00018", 102 | "m6_script1_clean-00023" 103 | ] 104 | } -------------------------------------------------------------------------------- /clpcnet/assets/partition/ravdess-hifi.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": [ 3 | "03-01-01-01-02-02-06", 4 | "03-01-08-01-02-02-13", 5 | "03-01-06-01-02-02-08", 6 | "03-01-06-01-01-01-07", 7 | "03-01-07-01-02-01-15", 8 | "03-01-04-01-02-01-19", 9 | "03-01-06-01-01-02-21", 10 | "03-01-05-01-01-02-22", 11 | "03-01-08-01-01-02-02", 12 | "03-01-02-01-02-01-07", 13 | "03-01-08-01-01-02-19", 14 | "03-01-04-01-02-02-12", 15 | "03-01-02-01-01-01-17", 16 | "03-01-03-01-02-02-16", 17 | "03-01-05-01-02-02-16", 18 | "03-01-04-01-02-01-02", 19 | "03-01-04-01-01-02-15", 20 | "03-01-02-01-02-02-15", 21 | "03-01-03-01-02-02-24", 22 | "03-01-02-01-01-02-01", 23 | 
"03-01-06-01-02-01-08", 24 | "03-01-04-01-02-01-03", 25 | "03-01-02-01-02-01-17", 26 | "03-01-08-01-01-01-06", 27 | "03-01-01-01-01-02-01", 28 | "03-01-01-01-02-01-11", 29 | "03-01-05-01-02-01-07", 30 | "03-01-04-01-02-02-08", 31 | "03-01-06-01-01-01-01", 32 | "03-01-03-01-01-01-17", 33 | "03-01-02-01-02-02-17", 34 | "03-01-02-01-01-02-15", 35 | "03-01-06-01-01-01-17", 36 | "03-01-05-01-01-02-05", 37 | "03-01-05-01-02-01-01", 38 | "03-01-02-01-02-01-13", 39 | "03-01-05-01-02-01-19", 40 | "03-01-03-01-01-01-10", 41 | "03-01-08-01-02-02-03", 42 | "03-01-05-01-01-02-11", 43 | "03-01-07-01-01-02-22", 44 | "03-01-03-01-01-02-07", 45 | "03-01-03-01-01-02-22", 46 | "03-01-03-01-01-02-10", 47 | "03-01-01-01-01-02-08", 48 | "03-01-08-01-02-01-15", 49 | "03-01-07-01-01-02-11", 50 | "03-01-07-01-01-01-02", 51 | "03-01-03-01-01-01-16", 52 | "03-01-02-01-01-01-24", 53 | "03-01-06-01-01-01-13", 54 | "03-01-06-01-02-01-17", 55 | "03-01-05-01-02-01-24", 56 | "03-01-03-01-01-02-01", 57 | "03-01-03-01-02-02-02", 58 | "03-01-01-01-01-02-21", 59 | "03-01-04-01-02-02-01", 60 | "03-01-02-01-01-01-12", 61 | "03-01-08-01-01-01-02", 62 | "03-01-06-01-01-02-01", 63 | "03-01-01-01-02-02-12", 64 | "03-01-03-01-01-01-15", 65 | "03-01-01-01-01-01-03", 66 | "03-01-08-01-02-02-01", 67 | "03-01-08-01-02-02-18", 68 | "03-01-04-01-02-02-23", 69 | "03-01-01-01-01-01-19", 70 | "03-01-03-01-02-02-05", 71 | "03-01-05-01-01-02-17", 72 | "03-01-01-01-01-02-24", 73 | "03-01-01-01-02-01-19", 74 | "03-01-08-01-01-02-23", 75 | "03-01-01-01-02-02-24", 76 | "03-01-01-01-01-01-10", 77 | "03-01-03-01-02-02-06", 78 | "03-01-05-01-02-02-24", 79 | "03-01-05-01-02-01-06", 80 | "03-01-02-01-01-01-18", 81 | "03-01-07-01-02-02-12", 82 | "03-01-08-01-01-02-05", 83 | "03-01-07-01-02-02-11", 84 | "03-01-07-01-01-01-12", 85 | "03-01-08-01-01-01-16", 86 | "03-01-07-01-01-01-13", 87 | "03-01-06-01-02-01-02", 88 | "03-01-03-01-02-02-13", 89 | "03-01-07-01-02-01-22", 90 | "03-01-01-01-02-01-08", 91 | "03-01-04-01-02-02-21", 92 | "03-01-01-01-01-01-06", 93 | "03-01-04-01-01-02-13", 94 | "03-01-06-01-01-02-22", 95 | "03-01-08-01-01-02-07", 96 | "03-01-03-01-01-01-05", 97 | "03-01-01-01-01-02-19", 98 | "03-01-08-01-02-01-07", 99 | "03-01-04-01-02-01-18", 100 | "03-01-05-01-02-02-11", 101 | "03-01-01-01-02-01-15", 102 | "03-01-01-01-02-01-22", 103 | "03-01-06-01-01-02-23", 104 | "03-01-03-01-02-01-24", 105 | "03-01-05-01-01-02-12", 106 | "03-01-01-01-02-01-02", 107 | "03-01-06-01-02-02-13", 108 | "03-01-01-01-02-01-23", 109 | "03-01-03-01-02-02-22", 110 | "03-01-07-01-02-01-02", 111 | "03-01-08-01-02-01-03", 112 | "03-01-03-01-01-01-12", 113 | "03-01-03-01-01-01-08", 114 | "03-01-06-01-01-02-03", 115 | "03-01-01-01-02-02-05", 116 | "03-01-04-01-02-01-16", 117 | "03-01-04-01-02-01-01", 118 | "03-01-04-01-02-01-07", 119 | "03-01-07-01-01-01-22", 120 | "03-01-01-01-02-02-11", 121 | "03-01-01-01-02-02-23", 122 | "03-01-01-01-02-02-15", 123 | "03-01-05-01-02-02-10", 124 | "03-01-07-01-02-01-10", 125 | "03-01-04-01-01-01-15", 126 | "03-01-02-01-01-02-21", 127 | "03-01-03-01-01-02-24", 128 | "03-01-07-01-01-01-07", 129 | "03-01-06-01-01-01-06", 130 | "03-01-03-01-02-01-08", 131 | "03-01-06-01-01-02-06", 132 | "03-01-01-01-01-02-03", 133 | "03-01-08-01-02-01-10", 134 | "03-01-06-01-02-01-05", 135 | "03-01-06-01-02-02-05", 136 | "03-01-02-01-02-02-08", 137 | "03-01-03-01-02-02-08", 138 | "03-01-05-01-01-01-08", 139 | "03-01-02-01-02-02-18", 140 | "03-01-06-01-01-02-16", 141 | "03-01-02-01-02-01-18", 142 | "03-01-01-01-02-01-05", 143 | "03-01-01-01-01-01-23", 144 | 
"03-01-01-01-02-02-17", 145 | "03-01-06-01-02-02-11", 146 | "03-01-07-01-02-01-21", 147 | "03-01-05-01-02-01-22", 148 | "03-01-01-01-01-01-16", 149 | "03-01-01-01-01-02-11", 150 | "03-01-04-01-01-01-13", 151 | "03-01-07-01-01-02-24", 152 | "03-01-07-01-02-02-22", 153 | "03-01-02-01-02-01-10", 154 | "03-01-06-01-02-01-23", 155 | "03-01-02-01-01-01-03", 156 | "03-01-04-01-01-02-01", 157 | "03-01-02-01-01-02-18", 158 | "03-01-05-01-02-02-18", 159 | "03-01-08-01-01-02-18", 160 | "03-01-03-01-02-01-19", 161 | "03-01-06-01-02-01-22", 162 | "03-01-06-01-02-01-12", 163 | "03-01-03-01-01-02-03", 164 | "03-01-06-01-01-02-13", 165 | "03-01-06-01-02-02-19", 166 | "03-01-02-01-02-02-07", 167 | "03-01-02-01-01-02-10", 168 | "03-01-03-01-01-02-23", 169 | "03-01-02-01-02-01-01", 170 | "03-01-07-01-01-01-11", 171 | "03-01-06-01-01-01-15", 172 | "03-01-02-01-02-02-12", 173 | "03-01-05-01-02-02-17", 174 | "03-01-02-01-01-01-07", 175 | "03-01-02-01-02-01-03", 176 | "03-01-03-01-02-01-23", 177 | "03-01-06-01-02-01-16", 178 | "03-01-08-01-01-01-07", 179 | "03-01-04-01-01-01-18", 180 | "03-01-07-01-01-02-02", 181 | "03-01-02-01-01-02-24", 182 | "03-01-04-01-02-02-15", 183 | "03-01-06-01-02-02-16", 184 | "03-01-02-01-01-01-08", 185 | "03-01-04-01-01-01-21", 186 | "03-01-01-01-01-02-22", 187 | "03-01-05-01-01-01-07", 188 | "03-01-03-01-01-01-18", 189 | "03-01-04-01-02-01-23", 190 | "03-01-03-01-01-01-07", 191 | "03-01-06-01-02-01-11", 192 | "03-01-06-01-01-02-15", 193 | "03-01-07-01-01-01-10", 194 | "03-01-06-01-02-02-03", 195 | "03-01-03-01-02-01-05", 196 | "03-01-03-01-01-01-11", 197 | "03-01-05-01-02-01-17", 198 | "03-01-02-01-01-01-10", 199 | "03-01-05-01-01-01-05", 200 | "03-01-05-01-01-01-24", 201 | "03-01-07-01-02-02-24", 202 | "03-01-03-01-02-02-07", 203 | "03-01-05-01-01-01-01", 204 | "03-01-05-01-02-01-13", 205 | "03-01-03-01-02-02-21", 206 | "03-01-08-01-01-01-23", 207 | "03-01-02-01-01-01-01", 208 | "03-01-05-01-01-02-02", 209 | "03-01-04-01-01-01-02", 210 | "03-01-07-01-01-02-12", 211 | "03-01-07-01-02-01-18", 212 | "03-01-05-01-01-02-21", 213 | "03-01-08-01-01-02-06", 214 | "03-01-03-01-02-01-06", 215 | "03-01-06-01-02-02-10", 216 | "03-01-01-01-01-01-22", 217 | "03-01-01-01-02-01-03", 218 | "03-01-06-01-01-01-22", 219 | "03-01-03-01-01-02-06", 220 | "03-01-03-01-02-01-01", 221 | "03-01-01-01-02-02-19", 222 | "03-01-01-01-02-02-02", 223 | "03-01-01-01-01-01-11", 224 | "03-01-05-01-01-02-15", 225 | "03-01-07-01-02-02-16", 226 | "03-01-05-01-01-01-23", 227 | "03-01-06-01-01-01-16", 228 | "03-01-03-01-01-01-23", 229 | "03-01-07-01-01-01-17", 230 | "03-01-03-01-01-01-03", 231 | "03-01-05-01-02-01-12", 232 | "03-01-08-01-02-02-06", 233 | "03-01-08-01-01-02-15", 234 | "03-01-01-01-02-01-10", 235 | "03-01-04-01-02-01-15", 236 | "03-01-07-01-02-01-16", 237 | "03-01-04-01-01-02-17", 238 | "03-01-07-01-01-02-13", 239 | "03-01-01-01-02-01-13", 240 | "03-01-03-01-02-01-13", 241 | "03-01-03-01-01-01-13", 242 | "03-01-04-01-02-01-08", 243 | "03-01-01-01-02-02-10", 244 | "03-01-05-01-02-01-11", 245 | "03-01-05-01-01-01-11", 246 | "03-01-01-01-01-02-16", 247 | "03-01-04-01-02-02-16", 248 | "03-01-04-01-01-01-17", 249 | "03-01-02-01-02-02-21", 250 | "03-01-02-01-02-02-23", 251 | "03-01-02-01-02-02-01", 252 | "03-01-06-01-02-02-12", 253 | "03-01-01-01-02-02-21", 254 | "03-01-03-01-01-01-22", 255 | "03-01-05-01-01-02-19", 256 | "03-01-01-01-02-02-18", 257 | "03-01-04-01-02-02-18", 258 | "03-01-05-01-02-02-23", 259 | "03-01-01-01-01-02-23", 260 | "03-01-01-01-02-01-16", 261 | "03-01-07-01-01-02-16", 262 | 
"03-01-03-01-02-01-02", 263 | "03-01-05-01-01-01-19", 264 | "03-01-05-01-02-01-21", 265 | "03-01-04-01-01-02-18", 266 | "03-01-03-01-01-02-13", 267 | "03-01-02-01-02-01-02", 268 | "03-01-02-01-02-02-10", 269 | "03-01-03-01-01-02-12", 270 | "03-01-03-01-02-02-10", 271 | "03-01-03-01-02-02-03", 272 | "03-01-03-01-02-02-19", 273 | "03-01-05-01-01-01-21", 274 | "03-01-03-01-02-01-11", 275 | "03-01-08-01-01-01-19", 276 | "03-01-07-01-02-02-01" 277 | ] 278 | } -------------------------------------------------------------------------------- /clpcnet/assets/text/ravdess/01.txt: -------------------------------------------------------------------------------- 1 | Kids are talking by the door 2 | -------------------------------------------------------------------------------- /clpcnet/assets/text/ravdess/02.txt: -------------------------------------------------------------------------------- 1 | Dogs are sitting by the door 2 | -------------------------------------------------------------------------------- /clpcnet/config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | ############################################################################### 5 | # Configuration 6 | ############################################################################### 7 | 8 | 9 | # Run inference using checkpoint and settings from the original LPCNet 10 | ORIGINAL_LPCNET = False 11 | 12 | # Ablations 13 | ABLATE_CREPE = False 14 | ABLATE_PITCH_REPR = False 15 | ABLATE_SAMPLING = False 16 | ABLATE_SAMPLING_TAIL = False 17 | 18 | # Settings for using original lpcnet checkpoints 19 | if ORIGINAL_LPCNET: 20 | ABLATE_CREPE = True 21 | ABLATE_PITCH_REPR = True 22 | ABLATE_SAMPLING = True 23 | ABLATE_SAMPLING_TAIL = False 24 | 25 | # Directories 26 | ASSETS_DIR = Path(__file__).parent / 'assets' 27 | DATA_DIR = Path(__file__).parent.parent / 'data' 28 | RUNS_DIR = Path(__file__).parent.parent / 'runs' 29 | CACHE_DIR = RUNS_DIR / 'cache' 30 | CHECKPOINT_DIR = RUNS_DIR / 'checkpoints' 31 | EVAL_DIR = RUNS_DIR / 'eval' 32 | LOG_DIR = RUNS_DIR / 'log' 33 | 34 | # Default pretrained checkpoint 35 | if ORIGINAL_LPCNET: 36 | DEFAULT_CHECKPOINT = ASSETS_DIR / 'checkpoints' / 'original.h5' 37 | else: 38 | DEFAULT_CHECKPOINT = ASSETS_DIR / 'checkpoints' / 'model.h5' 39 | 40 | # Pitch representation 41 | PITCH_BINS = 256 42 | FMAX = 550. # Hz 43 | # 63 Hz is hard minimum imposed when using non-uniform pitch bins 44 | FMIN = 63. if ABLATE_PITCH_REPR else 50. 
# Hz 45 | 46 | # DSP parameters 47 | HOPSIZE = 160 # samples 48 | BLOCK_SIZE = 640 # samples 49 | LPC_ORDER = 16 50 | MAX_SAMPLE_VALUE = 32768 51 | PCM_LEVELS = 256 52 | PREEMPHASIS_COEF = 0.85 53 | SAMPLE_RATE = 16000 # Hz 54 | 55 | # Training parameters 56 | AVERAGE_STEPS_PER_EPOCH = 436925 # batches per epoch 57 | BATCH_SIZE = 64 # items per batch 58 | FEATURE_CHUNK_SIZE = 15 # frames per item in batch 59 | LEARNING_RATE = 1e-3 60 | PCM_CHUNK_SIZE = HOPSIZE * FEATURE_CHUNK_SIZE # samples per item in batch 61 | STEPS = 45000000 62 | WEIGHT_DECAY = 5e-5 63 | 64 | # Neural network sizes 65 | SPECTRAL_FEATURE_SIZE = 38 66 | EMBEDDING_SIZE = 128 67 | GRU_A_SIZE = 384 68 | GRU_B_SIZE = 16 69 | 70 | # Number of features on disk 71 | TOTAL_FEATURE_SIZE = 55 72 | 73 | # Feature indices 74 | PITCH_IDX = 36 75 | CORRELATION_IDX = 37 76 | -------------------------------------------------------------------------------- /clpcnet/convert.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | import clpcnet 6 | 7 | 8 | ############################################################################### 9 | # Constants 10 | ############################################################################### 11 | 12 | 13 | # Define these explicitly, as they are used millions of times 14 | INV_LOG_256 = 1. / math.log(256) 15 | LOG_256_OVER_128 = math.log(256) / 128. 16 | LI_TO_MU_SCALE = (clpcnet.PCM_LEVELS - 1.) / clpcnet.MAX_SAMPLE_VALUE 17 | MU_TO_LI_SCALE = clpcnet.MAX_SAMPLE_VALUE / (clpcnet.PCM_LEVELS - 1.) 18 | 19 | 20 | ############################################################################### 21 | # Mulaw encoding and decoding 22 | ############################################################################### 23 | 24 | 25 | def linear_to_mulaw(linear): 26 | """Mu-law encode signal""" 27 | # Convert from [-MAX_SAMPLE_VALUE + 1, MAX_SAMPLE_VALUE] to [-126.5, 126.5] 28 | linear *= LI_TO_MU_SCALE 29 | 30 | # Mu-law encode 31 | mulaw = np.sign(linear) * (128 * np.log(1 + np.abs(linear)) * INV_LOG_256) 32 | 33 | # Shift to [0, 255] 34 | mulaw = np.round(mulaw) + 128 35 | 36 | # Clip 37 | return np.clip(mulaw, 0, 255).astype(np.int16) 38 | 39 | 40 | def mulaw_to_linear(mulaw): 41 | """Decode mu-law signal""" 42 | # Zero-center 43 | mulaw = mulaw.astype(np.int16) - 128 44 | 45 | # Convert to linear 46 | linear = np.sign(mulaw) * (np.exp(np.abs(mulaw) * LOG_256_OVER_128) - 1) 47 | 48 | # Scale to [-MAX_SAMPLE_VALUE + 1, MAX_SAMPLE_VALUE] 49 | return linear * MU_TO_LI_SCALE 50 | 51 | 52 | ############################################################################### 53 | # Pitch representations 54 | ############################################################################### 55 | 56 | 57 | def bins_to_hz(bins, 58 | fmin=clpcnet.FMIN, 59 | fmax=clpcnet.FMAX, 60 | pitch_bins=clpcnet.PITCH_BINS): 61 | logmin, logmax = np.log2(fmin), np.log2(fmax) 62 | 63 | # Scale to base-2 log-space 64 | loghz = bins.astype(float) / (pitch_bins - 1) * (logmax - logmin) + logmin 65 | 66 | # Convert to hz 67 | return 2 ** loghz 68 | 69 | 70 | def epochs_to_bins(epochs, sample_rate=clpcnet.SAMPLE_RATE): 71 | """Convert pitch in normalized pitch epochs to quantized bins""" 72 | return hz_to_bins(epochs_to_hz(epochs, sample_rate)) 73 | 74 | 75 | def epochs_to_hz(epochs, sample_rate=clpcnet.SAMPLE_RATE): 76 | """Convert pitch in normalized pitch epochs to Hz""" 77 | return sample_rate / (50 * epochs + 100.1) 78 | 79 | 80 | def 
epochs_to_length(epochs): 81 | """Convert normalized pitch epochs to samples per period""" 82 | return (50 * epochs + 100.1).astype('int16') 83 | 84 | 85 | def hz_to_bins(hz, 86 | fmin=clpcnet.FMIN, 87 | fmax=clpcnet.FMAX, 88 | pitch_bins=clpcnet.PITCH_BINS): 89 | logmin, logmax = np.log2(fmin), np.log2(fmax) 90 | 91 | # Clip pitch in base-2 log-space 92 | loghz = np.clip(np.log2(hz), logmin, logmax) 93 | 94 | # Scale to 0, 255 95 | return \ 96 | ((loghz - logmin) / (logmax - logmin) * (pitch_bins - 1)).astype(int) 97 | 98 | 99 | def hz_to_epochs(hz, sample_rate=clpcnet.SAMPLE_RATE): 100 | """Convert pitch in Hz to normalized epochs""" 101 | return (sample_rate / hz - 100.1) / 50. 102 | 103 | 104 | def hz_to_length(hz, sample_rate=clpcnet.SAMPLE_RATE): 105 | """Convert pitch in Hz to number of samples per period""" 106 | return (sample_rate / hz).astype('int16') 107 | 108 | 109 | def length_to_epochs(length): 110 | """Convert pitch in number of samples per period to normalized epochs""" 111 | return (length - 100.1) / 50. 112 | 113 | 114 | def length_to_hz(length, sample_rate=clpcnet.SAMPLE_RATE): 115 | """Convert pitch in number of samples per period to Hz""" 116 | return sample_rate / length 117 | 118 | 119 | def seconds_to_frames(seconds): 120 | """Convert time in seconds to number of frames""" 121 | return 1 + int(seconds * clpcnet.SAMPLE_RATE / clpcnet.HOPSIZE) 122 | 123 | 124 | def seconds_to_samples(seconds): 125 | """Convert time in seconds to number of samples""" 126 | return clpcnet.HOPSIZE * seconds_to_frames(seconds) 127 | -------------------------------------------------------------------------------- /clpcnet/data.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import itertools 3 | import json 4 | from pathlib import Path 5 | 6 | import clpcnet 7 | 8 | 9 | ############################################################################### 10 | # Functional interface - file access 11 | ############################################################################### 12 | 13 | 14 | def files(name, directory, partition=None): 15 | """Get audio filenames""" 16 | return resolve(name).files(directory, partition) 17 | 18 | 19 | def partition_file(name): 20 | """Get name of partition file""" 21 | return resolve(name).partition_file() 22 | 23 | 24 | def partitions(name): 25 | """Get split of stems into partitions""" 26 | return resolve(name).partitions() 27 | 28 | 29 | def stems(name, partition=None): 30 | """Get stems""" 31 | return resolve(name).stems(partition) 32 | 33 | 34 | ############################################################################### 35 | # Functional interface - conversions 36 | ############################################################################### 37 | 38 | 39 | def file_to_stem(name, file): 40 | """Convert file to stem""" 41 | return resolve(name).file_to_stem(file) 42 | 43 | 44 | def stem_to_file(name, directory, stem): 45 | """Convert stem to file""" 46 | return resolve(name).stem_to_file(directory, stem) 47 | 48 | 49 | ############################################################################### 50 | # Base dataset 51 | ############################################################################### 52 | 53 | 54 | class Dataset(abc.ABC): 55 | 56 | ########################################################################### 57 | # File access 58 | ########################################################################### 59 | 60 | @classmethod 61 | def files(cls, directory, partition=None): 62 | 
"""Get filenames""" 63 | # Get stems 64 | stems = cls.stems(partition) 65 | 66 | # Convert to files 67 | return [cls.stem_to_file(directory, stem) for stem in stems] 68 | 69 | @classmethod 70 | def partition_file(cls): 71 | """Get name of partition file""" 72 | return clpcnet.ASSETS_DIR / 'partition' / f'{cls.name}.json' 73 | 74 | @classmethod 75 | def partitions(cls): 76 | """Get split of stems into partitions""" 77 | with open(cls.partition_file()) as file: 78 | return json.load(file) 79 | 80 | @classmethod 81 | def stems(cls, partition=None): 82 | """Get stems""" 83 | # Get partitions 84 | partitions = cls.partitions() 85 | 86 | # Return all stems 87 | if partition is None: 88 | return itertools.chain(*partitions.values()) 89 | 90 | # Return stems of a given partition 91 | return partitions[partition] 92 | 93 | ########################################################################### 94 | # Conversions 95 | ########################################################################### 96 | 97 | @staticmethod 98 | @abc.abstractmethod 99 | def file_to_stem(file): 100 | """Convert file to stem""" 101 | pass 102 | 103 | @staticmethod 104 | @abc.abstractmethod 105 | def stem_to_file(directory, stem): 106 | """Convert stem to file""" 107 | pass 108 | 109 | 110 | ############################################################################### 111 | # Datasets 112 | ############################################################################### 113 | 114 | 115 | class Daps(Dataset): 116 | 117 | name = 'daps' 118 | 119 | @staticmethod 120 | def file_to_stem(file): 121 | """Convert daps filename to stem""" 122 | return file.stem[:-4] 123 | 124 | @staticmethod 125 | def stem_to_file(directory, stem): 126 | """Convert daps stem to file""" 127 | return Path(directory, 'clean', f'{stem}.wav') 128 | 129 | 130 | class DapsSegmented(Dataset): 131 | 132 | name = 'daps-segmented' 133 | 134 | @staticmethod 135 | def file_to_stem(file): 136 | """Convert daps-segmented filename to stem""" 137 | return file.stem 138 | 139 | @staticmethod 140 | def stem_to_file(directory, stem): 141 | """Convert daps-segmented stem to filen""" 142 | return Path(directory, f'{stem[:-6]}-sentences', f'{stem}.wav') 143 | 144 | 145 | class RavdessHifi(Dataset): 146 | 147 | name = 'ravdess-hifi' 148 | 149 | @staticmethod 150 | def file_to_stem(file): 151 | """Convert ravdess filename to stem""" 152 | return file.stem 153 | 154 | @staticmethod 155 | def stem_to_file(directory, stem): 156 | """Convert ravdess stem to filename""" 157 | return directory / f'Actor_{stem[-2:]}' / f'{stem}.wav' 158 | 159 | 160 | class RavdessVariable(RavdessHifi): 161 | 162 | name = 'ravdess-variable' 163 | 164 | 165 | class Vctk(Dataset): 166 | 167 | name = 'vctk' 168 | 169 | @staticmethod 170 | def file_to_stem(file): 171 | """Convert vctk filename to stem""" 172 | return file.stem[:-5] 173 | 174 | @staticmethod 175 | def stem_to_file(directory, stem): 176 | """Convert vctk stem to file""" 177 | return Path(directory, 178 | 'wav48_silence_trimmed', 179 | stem.split('_')[0], 180 | f'{stem}_mic2.flac') 181 | 182 | 183 | ############################################################################### 184 | # Utilities 185 | ############################################################################### 186 | 187 | 188 | def resolve(name): 189 | """Resolve name of dataset to static template""" 190 | if name == 'daps': 191 | return Daps 192 | elif name == 'daps-segmented': 193 | return DapsSegmented 194 | elif name == 'ravdess-hifi': 195 | return RavdessHifi 
196 | elif name == 'ravdess-variable': 197 | return RavdessVariable 198 | elif name == 'vctk': 199 | return Vctk 200 | raise ValueError(f'Dataset {name} is not defined') 201 | -------------------------------------------------------------------------------- /clpcnet/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from . import dtw 2 | from . import duration 3 | from . import pitch 4 | from . import plot 5 | from . import prosody 6 | -------------------------------------------------------------------------------- /clpcnet/evaluate/dtw.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import scipy.interpolate 4 | 5 | import clpcnet 6 | 7 | 8 | ############################################################################### 9 | # Dtw alignment evaluation 10 | ############################################################################### 11 | 12 | 13 | def from_files(source_files, target_files): 14 | """Evaluate dtw alignment score""" 15 | dtw = DtwAlignmentScore() 16 | 17 | # Evaluate each pair of files 18 | for source_file, target_file in zip(source_files, target_files): 19 | dtw.update(clpcnet.load.audio(source_file), 20 | clpcnet.load.audio(target_file)) 21 | 22 | # Compute aggregate dtw alignment score over files 23 | return dtw() 24 | 25 | 26 | class DtwAlignmentScore: 27 | """Batch update dtw alignment score""" 28 | 29 | def __init__(self): 30 | self.distance_sum = 0. 31 | self.count = 0 32 | 33 | def __call__(self): 34 | """Return the mean per-step cosine distance along the alignment path""" 35 | distance = self.distance_sum / self.count 36 | return distance 37 | 38 | def update(self, source, target): 39 | """Compute the dtw alignment score""" 40 | # Compute mel features 41 | source_feats = features(source) 42 | target_feats = features(target) 43 | 44 | # Resample target features 45 | interp_fn = scipy.interpolate.interp1d( 46 | np.arange(target_feats.shape[1]), 47 | target_feats) 48 | target_feats_interp = interp_fn( 49 | np.linspace(0, target_feats.shape[1] - 1, source_feats.shape[1])) 50 | 51 | # Perform alignment 52 | D, path = librosa.sequence.dtw(target_feats_interp + 1e-10, 53 | source_feats + 1e-10, 54 | backtrack=True, 55 | metric='cosine') 56 | 57 | # Update metrics 58 | self.distance_sum += D[path[0, 0], path[0, 1]] 59 | self.count += len(path) 60 | 61 | 62 | ############################################################################### 63 | # Utilities 64 | ############################################################################### 65 | 66 | 67 | def features(audio): 68 | """Compute spectral features to use for dtw alignment""" 69 | # Compute mfcc without energy 70 | return librosa.feature.mfcc(audio, sr=clpcnet.SAMPLE_RATE)[1:] 71 | 72 | -------------------------------------------------------------------------------- /clpcnet/evaluate/duration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pypar 3 | 4 | 5 | ############################################################################### 6 | # Duration evaluation 7 | ############################################################################### 8 | 9 | 10 | def from_files(source_files, target_files): 11 | """Evaluate phoneme duration rmse""" 12 | rmse = DurationRmse() 13 | 14 | # Evaluate each pair of files 15 | for source_file, target_file in zip(source_files, target_files): 16 | source = pypar.Alignment(source_file) 17 | 
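# pypar.Alignment is assumed here to load a saved forced alignment from
# disk (e.g., the alignment JSON written by alignment.save in
# clpcnet.evaluate.gather)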
target = pypar.Alignment(target_file) 18 | rmse.update(source, target) 19 | 20 | # Compute aggregate rmse over files 21 | return rmse() 22 | 23 | 24 | class DurationRmse: 25 | """Batch update rmse""" 26 | 27 | def __init__(self): 28 | self.sum = 0. 29 | self.count = 0 30 | 31 | def __call__(self): 32 | """Return the rmse value""" 33 | return np.sqrt(self.sum / self.count) 34 | 35 | def update(self, source, target): 36 | """Compute the rmse of the phoneme durations""" 37 | source_durations = np.array([p.duration() for p in source.phonemes()]) 38 | target_durations = np.array([p.duration() for p in target.phonemes()]) 39 | source_mask, target_mask = self.mask(source.phonemes(), 40 | target.phonemes()) 41 | 42 | # First and last are very often long silences with large error 43 | differences = source_durations[source_mask][1:-1] - \ 44 | target_durations[target_mask][1:-1] 45 | 46 | self.sum += (differences ** 2).sum() 47 | self.count += differences.size 48 | 49 | @staticmethod 50 | def mask(source, target): 51 | """Retrieve the mask over values to use for evaluation""" 52 | source_mask = np.full(len(source), True) 53 | target_mask = np.full(len(target), True) 54 | 55 | source_idx, target_idx = 0, 0 56 | while source_idx < len(source) or target_idx < len(target): 57 | 58 | # Handle only one alignment ending with silence 59 | if target_idx >= len(target): 60 | source_mask[source_idx] = False 61 | source_idx += 1 62 | continue 63 | if source_idx >= len(source): 64 | target_mask[target_idx] = False 65 | target_idx += 1 66 | continue 67 | 68 | # Phonemes match 69 | if str(source[source_idx]) == str(target[target_idx]): 70 | source_idx += 1 71 | target_idx += 1 72 | continue 73 | 74 | # Phonemes don't match 75 | if str(source[source_idx]) == pypar.SILENCE: 76 | source_mask[source_idx] = False 77 | source_idx += 1 78 | elif str(target[target_idx]) == pypar.SILENCE: 79 | target_mask[target_idx] = False 80 | target_idx += 1 81 | else: 82 | raise ValueError('Phonemes don\'t match!') 83 | 84 | return source_mask, target_mask 85 | -------------------------------------------------------------------------------- /clpcnet/evaluate/gather.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import json 4 | import random 5 | import shutil 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import pyfoal 10 | import soundfile 11 | import tqdm 12 | 13 | import clpcnet 14 | 15 | 16 | ############################################################################### 17 | # Gather 18 | ############################################################################### 19 | 20 | 21 | def from_files_to_files(examples, output_directory, gpu=None): 22 | """Gather files for evaluation""" 23 | output_directory.mkdir(exist_ok=True, parents=True) 24 | iterator = tqdm.tqdm(examples, 25 | desc='Setting up evaluation directory', 26 | dynamic_ncols=True) 27 | for example in iterator: 28 | stem = f'{example.stem}-{"" if example.seen else "un"}seen' 29 | prefix = output_directory / stem 30 | 31 | # Copy audio 32 | dst_audio_file = f'{prefix}.wav' 33 | soundfile.write(dst_audio_file, 34 | clpcnet.load.audio(example.audio_file), 35 | clpcnet.SAMPLE_RATE) 36 | 37 | # Estimate pitch 38 | pitch, periodicity = clpcnet.pitch.from_file(example.audio_file, gpu) 39 | 40 | # Scale pitch 41 | low = np.clip(.71 * pitch, clpcnet.FMIN, clpcnet.FMAX) 42 | high = np.clip(1.41 * pitch, clpcnet.FMIN, clpcnet.FMAX) 43 | 44 | # Save original pitch and periodicity and scaled 
pitch 45 | np.save(f'{prefix}-pitch.npy', pitch) 46 | np.save(f'{prefix}-ps-71-pitch.npy', low) 47 | np.save(f'{prefix}-ps-141-pitch.npy', high) 48 | np.save(f'{prefix}-periodicity.npy', periodicity) 49 | 50 | if example.text_file is not None: 51 | # Copy text 52 | dst_text_file = f'{prefix}.txt' 53 | shutil.copy2(example.text_file, dst_text_file) 54 | 55 | # Force align 56 | alignment = pyfoal.from_file(str(dst_text_file), str(dst_audio_file)) 57 | 58 | # Time-stretch alignment by constant factor 59 | slow = stretch_alignment(alignment, .5) 60 | fast = stretch_alignment(alignment, 2.) 61 | 62 | # Save alignments 63 | alignment.save(f'{prefix}.json') 64 | slow.save(f'{prefix}-ts-50.json') 65 | fast.save(f'{prefix}-ts-200.json') 66 | 67 | 68 | ############################################################################### 69 | # Example selection 70 | ############################################################################### 71 | 72 | 73 | def daps(directory, stems): 74 | """Select evaluation examples from daps""" 75 | selected = [stem for stem in stems if '_script5_' in str(stem) 76 | if int(stem.split('_')[0][1:]) < 5] 77 | return [Example(stem, directory / 'clean' / f'{stem}.wav', None, False) 78 | for stem in selected] 79 | 80 | 81 | def daps_segmented(directory, stems): 82 | """Select evaluation examples from daps""" 83 | examples = [] 84 | for stem in stems: 85 | # Get files 86 | audio_file = clpcnet.data.stem_to_file('daps-segmented', 87 | directory, 88 | stem) 89 | text_file = audio_file.with_suffix('.txt') 90 | 91 | # Create example 92 | examples.append(Example(stem, audio_file, text_file, False)) 93 | 94 | return examples 95 | 96 | 97 | def ravdess_hifi(directory, stems): 98 | """Select evaluation samples from ravdess""" 99 | # Get deterministic but random set of stems 100 | random.seed(0) 101 | stems = random.sample(stems, 100) 102 | 103 | # Create examples 104 | text_directory = clpcnet.ASSETS_DIR / 'text' / 'ravdess' 105 | return [Example(stem, 106 | clpcnet.data.stem_to_file('ravdess-hifi', directory, stem), 107 | text_directory / f'{stem.split("-")[4]}.txt', 108 | False) 109 | for stem in stems] 110 | 111 | 112 | def vctk(directory, unseen, seen): 113 | """Select evaluation examples from vctk""" 114 | # Load speaker info 115 | with open(directory / 'speaker-info.txt') as file: 116 | lines = file.readlines() 117 | speakers = [clpcnet.partition.VCTKSpeaker(line) for line in lines[1:]] 118 | speakers = [s for s in speakers if s.id != 'p362'] 119 | 120 | # Pick a few speakers 121 | random.seed(0) 122 | unseen_speakers = sample_speakers(speakers, unseen) 123 | seen_speakers = sample_speakers(speakers, seen) 124 | 125 | # For each speaker, pick a few files 126 | selected = [] 127 | stems = unseen + seen 128 | for speaker in unseen_speakers + seen_speakers: 129 | speaker_stems = [s for s in stems if speaker in s] 130 | selected.extend(random.sample(speaker_stems, 4)) 131 | 132 | # Get absolute paths to audio and text files 133 | audio_directory = directory / 'wav48_silence_trimmed' 134 | audio_files = [audio_directory / s.split('_')[0] / f'{s}_mic2.flac' 135 | for s in selected] 136 | text_directory = directory / 'txt' 137 | text_files = [text_directory / s.split('_')[0] / f'{s}.txt' 138 | for s in selected] 139 | 140 | iterator = enumerate(zip(selected, audio_files, text_files)) 141 | return [Example(s, audio, text, i >= len(selected) // 2) 142 | for i, (s, audio, text) in iterator] 143 | 144 | 145 | 
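# A minimal sketch of the constant-factor stretching performed above,
# assuming pypar's duration() / update() API used by stretch_alignment
# below (paths are hypothetical):
#
#     alignment = pyfoal.from_file('speech.txt', 'speech.wav')
#     slow = stretch_alignment(alignment, .5)  # durations doubled -> '-ts-50'
#     fast = stretch_alignment(alignment, 2.)  # durations halved -> '-ts-200'
#
# Each phoneme duration is scaled by 1 / rate, so '-ts-50' plays at half
# speed and '-ts-200' at double speed.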
############################################################################### 146 | # Utilities 147 | ############################################################################### 148 | 149 | 150 | class Example: 151 | 152 | def __init__(self, stem, audio_file, text_file, seen): 153 | self.stem = stem 154 | self.audio_file = audio_file 155 | self.text_file = text_file 156 | self.seen = seen 157 | 158 | 159 | def sample_speakers(speakers, stems, n=8): 160 | """Sample from a list of speakers""" 161 | # Get relevant speakers 162 | relevant = set(s.split('_')[0] for s in stems) 163 | stem_speakers = [s for s in speakers if s.id in relevant] 164 | 165 | # Shuffle 166 | random.shuffle(stem_speakers) 167 | 168 | # Pick first n // 2 of each gender 169 | male = [s.id for s in stem_speakers if s.gender == 'M'] 170 | female = [s.id for s in stem_speakers if s.gender == 'F'] 171 | return male[:n // 2] + female [:n // 2] 172 | 173 | 174 | def stretch_alignment(alignment, rate): 175 | """Stretch the alignment by the given rate""" 176 | alignment = copy.deepcopy(alignment) 177 | durations = [(1. / rate) * p.duration() for p in alignment.phonemes()] 178 | alignment.update(durations=durations) 179 | return alignment 180 | 181 | 182 | ############################################################################### 183 | # Entry point 184 | ############################################################################### 185 | 186 | 187 | def main(): 188 | """Create a directory of files for evaluation""" 189 | # Parse command-line arguments 190 | args = parse_args() 191 | 192 | # Get test partition stems 193 | partition_file = clpcnet.ASSETS_DIR / 'partition' / f'{args.dataset}.json' 194 | with open(partition_file) as file: 195 | partition = json.load(file) 196 | 197 | # Get paths to selected examples 198 | if args.dataset == 'daps': 199 | test_unseen = partition['test'] 200 | examples = daps(args.directory, test_unseen) 201 | elif args.dataset == 'daps-segmented': 202 | test_unseen = partition['test'] 203 | examples = daps_segmented(args.directory, test_unseen) 204 | elif args.dataset == 'ravdess-hifi': 205 | test_unseen = partition['test'] 206 | examples = ravdess_hifi(args.directory, test_unseen) 207 | elif args.dataset == 'vctk': 208 | test_unseen, test_seen = partition['test'], partition['test-seen'] 209 | examples = vctk(args.directory, test_unseen, test_seen) 210 | else: 211 | raise ValueError(f'No dataset {args.dataset}') 212 | 213 | # Gather files for evaluation 214 | from_files_to_files(examples, args.output_directory, args.gpu) 215 | 216 | 217 | def parse_args(): 218 | """Parse command-line arguments""" 219 | parser = argparse.ArgumentParser() 220 | parser.add_argument( 221 | '--dataset', 222 | default='vctk', 223 | help='The dataset to gather evaluation files from') 224 | parser.add_argument( 225 | '--directory', 226 | type=Path, 227 | default=clpcnet.DATA_DIR, 228 | help='The root directory of the dataset') 229 | parser.add_argument( 230 | '--output_directory', 231 | type=Path, 232 | default=clpcnet.EVAL_DIR / 'objective' / 'constant', 233 | help='The output evaluation directory') 234 | parser.add_argument( 235 | '--gpu', 236 | type=int, 237 | default=None, 238 | help='The gpu to use for pitch estimation') 239 | 240 | # Extend directories with dataset name 241 | args = parser.parse_args() 242 | args.directory = args.directory / args.dataset 243 | args.output_directory = args.output_directory / args.dataset / 'data' 244 | 245 | return args 246 | 247 | 248 | if __name__ == '__main__': 249 | 
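# Generation for each pair is dispatched to a worker process; all tasks
# complete before pitch estimation runs, via pool.close() / pool.join()
# below.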
main() 250 | -------------------------------------------------------------------------------- /clpcnet/evaluate/objective/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/clpcnet/evaluate/objective/__init__.py -------------------------------------------------------------------------------- /clpcnet/evaluate/objective/variable.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import multiprocessing as mp 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pyfoal 8 | import soundfile 9 | import torch 10 | import tqdm 11 | 12 | import clpcnet 13 | 14 | 15 | ############################################################################### 16 | # Constants 17 | ############################################################################### 18 | 19 | 20 | DEFAULT_DIRECTORY = clpcnet.DATA_DIR / 'ravdess-hifi' 21 | 22 | 23 | ############################################################################### 24 | # Variable-rate pitch shifting 25 | ############################################################################### 26 | 27 | 28 | def evaluate(directory=DEFAULT_DIRECTORY, 29 | run='clpcnet', 30 | checkpoint=clpcnet.DEFAULT_CHECKPOINT, 31 | gpu=None): 32 | """Evaluate variable-rate pitch shifting on ravdess""" 33 | # Get list of examples to generate 34 | with open(clpcnet.data.partition_file('ravdess-variable')) as file: 35 | pairs = json.load(file)['test'] 36 | 37 | # Setup output directory 38 | output_directory = clpcnet.EVAL_DIR / \ 39 | 'objective' / \ 40 | 'variable' / \ 41 | 'ravdess-hifi' / \ 42 | run 43 | output_directory.mkdir(exist_ok=True, parents=True) 44 | 45 | # Setup multiprocessing 46 | pool = mp.get_context('spawn').Pool() 47 | 48 | # Iterate over pairs 49 | iterator = tqdm.tqdm( 50 | pairs, 51 | total=len(pairs), 52 | dynamic_ncols=True, 53 | desc='Generating variable-ratio examples') 54 | for pair in iterator: 55 | 56 | # Load text 57 | statement = pair[0].split('-')[4] 58 | text_file = clpcnet.ASSETS_DIR / 'text' / 'ravdess' / f'{statement}.txt' 59 | with open(text_file) as file: 60 | text = file.read() 61 | 62 | # Load audio 63 | source_file = clpcnet.data.stem_to_file('ravdess-variable', 64 | directory, 65 | pair[0]) 66 | target_file = clpcnet.data.stem_to_file('ravdess-variable', 67 | directory, 68 | pair[1]) 69 | source = clpcnet.load.audio(source_file) 70 | target = clpcnet.load.audio(target_file) 71 | 72 | # Compute pitch 73 | source_pitch, source_periodicity = clpcnet.pitch.from_audio(source, gpu) 74 | target_pitch, target_periodicity = clpcnet.pitch.from_audio(target, gpu) 75 | 76 | # Compute alignment 77 | source_alignment = pyfoal.align(text, source, clpcnet.SAMPLE_RATE) 78 | target_alignment = pyfoal.align(text, target, clpcnet.SAMPLE_RATE) 79 | 80 | # Align periodicity for evaluation 81 | aligned_periodicity = clpcnet.pitch.align(target_periodicity, 82 | source_periodicity, 83 | target_alignment, 84 | source_alignment) 85 | 86 | # Output file prefix 87 | prefix = output_directory / f'{pair[0]}_{pair[1]}' 88 | output_file = prefix.parent / (prefix.stem + '-transfer.wav') 89 | 90 | # Perform pitch shifting 91 | args = (output_file, source) 92 | kwargs = {'source_alignment': source_alignment, 93 | 'target_alignment': target_alignment, 94 | 'target_pitch': target_pitch, 95 | 'checkpoint_file': checkpoint, 96 | 'verbose': False} 97 | 
pool.apply_async(clpcnet.to_file, args, kwargs) 98 | # clpcnet.to_file(*args, **kwargs) 99 | 100 | # Save stuff 101 | np.save(prefix.parent / (prefix.stem + '-source.npy'), source_pitch) 102 | np.save(prefix.parent / (prefix.stem + '-target.npy'), target_pitch) 103 | np.save(prefix.parent / (prefix.stem + '-aligned.npy'), aligned_periodicity) 104 | np.save(prefix.parent / (prefix.stem + '-sourceharm.npy'), 105 | source_periodicity) 106 | np.save(prefix.parent / (prefix.stem + '-targetharm.npy'), 107 | target_periodicity) 108 | source_alignment.save(prefix.parent / (prefix.stem + '-source.json')) 109 | target_alignment.save(prefix.parent / (prefix.stem + '-target.json')) 110 | with open(prefix.with_suffix('.txt'), 'w') as file: 111 | file.write(text) 112 | soundfile.write(f'{prefix}-source.wav', source, clpcnet.SAMPLE_RATE) 113 | soundfile.write(f'{prefix}-target.wav', target, clpcnet.SAMPLE_RATE) 114 | 115 | # Close multiprocessing pool and wait for processes to finish 116 | pool.close() 117 | pool.join() 118 | 119 | # Pitch estimation 120 | files = list(output_directory.glob('*-transfer.wav')) 121 | prefixes = [f.parent / f.stem for f in files] 122 | clpcnet.pitch.from_files_to_files(files, prefixes, gpu) 123 | 124 | # Forced alignment 125 | pyfoal.from_files_to_files( 126 | [f.parent / (f.stem[:-9] + '.txt') for f in files], 127 | files, 128 | [f.with_suffix('.json') for f in files]) 129 | 130 | # Get pitch files to evaluate 131 | source_pitch_files = sorted(output_directory.glob('*-pitch.npy')) 132 | target_pitch_files = sorted(output_directory.glob('*-target.npy')) 133 | source_periodicity_files = sorted(output_directory.glob('*-periodicity.npy')) 134 | target_periodicity_files = sorted(output_directory.glob('*-aligned.npy')) 135 | 136 | # Evaluate pitch 137 | rmse, precision, recall, f1, gpe_20, gpe_50, hist = \ 138 | clpcnet.evaluate.pitch.from_files(source_pitch_files, 139 | target_pitch_files, 140 | source_periodicity_files, 141 | target_periodicity_files) 142 | run_results = { 143 | 'f1': f1, 144 | 'precision': precision, 145 | 'recall': recall, 146 | 'rmse_cents': rmse, 147 | 'gpe_20': gpe_20, 148 | 'gpe_50': gpe_50} 149 | hist_file = output_directory / f'{run}.png' 150 | clpcnet.evaluate.plot.write_histogram(hist_file, hist) 151 | 152 | # Evaluate time-stretch 153 | duration_dict = {} 154 | 155 | # Get duration files to evaluate 156 | source_duration_files = sorted(output_directory.glob('*-transfer.json')) 157 | target_duration_files = sorted(output_directory.glob('*-target.json')) 158 | 159 | # Forced alignment rmse metric 160 | duration_rmse = clpcnet.evaluate.duration.from_files( 161 | source_duration_files, target_duration_files) 162 | duration_dict['force-align'] = {'rmse_seconds': duration_rmse} 163 | 164 | # DTW metrics 165 | source_audio_files = [f.with_suffix('.wav') for f in source_duration_files] 166 | target_audio_files = [f.with_suffix('.wav') for f in target_duration_files] 167 | distance = clpcnet.evaluate.dtw.from_files(source_audio_files, 168 | target_audio_files) 169 | duration_dict['dtw'] = {'distance': distance} 170 | run_results.update(duration_dict) 171 | 172 | # Load results file 173 | try: 174 | with open(output_directory / 'results.json') as file: 175 | results = json.load(file) 176 | except FileNotFoundError: 177 | results = {} 178 | 179 | # Update results 180 | results[run] = run_results 181 | 182 | # Write results 183 | with open(output_directory / 'results.json', 'w') as file: 184 | json.dump(results, file, indent=4) 185 | 186 | 187 | 
############################################################################### 188 | # Entry point 189 | ############################################################################### 190 | 191 | 192 | def parse_args(): 193 | """Parse command-line arguments""" 194 | parser = argparse.ArgumentParser() 195 | parser.add_argument( 196 | '--directory', 197 | type=Path, 198 | default=DEFAULT_DIRECTORY, 199 | help='Root directory of the ravdess dataset') 200 | parser.add_argument( 201 | '--run', 202 | default='clpcnet', 203 | help='The evaluation run') 204 | parser.add_argument( 205 | '--checkpoint', 206 | type=Path, 207 | default=clpcnet.DEFAULT_CHECKPOINT, 208 | help='The model checkpoint') 209 | parser.add_argument( 210 | '--gpu', 211 | type=int, 212 | default=None, 213 | help='The index of the gpu to use') 214 | return parser.parse_args() 215 | 216 | 217 | if __name__ == '__main__': 218 | evaluate(**vars(parse_args())) 219 | -------------------------------------------------------------------------------- /clpcnet/evaluate/pitch.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | import torchcrepe 6 | import tqdm 7 | 8 | import clpcnet 9 | 10 | 11 | ############################################################################### 12 | # Pitch evaluation 13 | ############################################################################### 14 | 15 | 16 | def from_files(source_pitch_files, 17 | target_pitch_files, 18 | source_periodicity_files, 19 | target_periodicity_files): 20 | """Evaluate pitch rmse in voiced regions and f1 of voiced/unvoiced""" 21 | metrics = PitchMetrics() 22 | 23 | # Voiced/unvoiced thresholding fn 24 | threshold = torchcrepe.threshold.Hysteresis() 25 | 26 | # Evaluate each pair of files 27 | iterator = zip(source_pitch_files, 28 | target_pitch_files, 29 | source_periodicity_files, 30 | target_periodicity_files) 31 | iterator = tqdm.tqdm(iterator, desc='Evaluating pitch', dynamic_ncols=True) 32 | for source_pitch_file, \ 33 | target_pitch_file, \ 34 | source_periodicity_file, \ 35 | target_periodicity_file in iterator: 36 | 37 | # Load files 38 | source_pitch = np.load(source_pitch_file) 39 | target_pitch = np.load(target_pitch_file) 40 | source_periodicity = np.load(source_periodicity_file) 41 | target_periodicity = np.load(target_periodicity_file) 42 | 43 | # Convert to torch 44 | source_pitch = torch.tensor(source_pitch)[None] 45 | target_pitch = torch.tensor(target_pitch)[None] 46 | source_periodicity = torch.tensor(source_periodicity)[None] 47 | target_periodicity = torch.tensor(target_periodicity)[None] 48 | 49 | # Threshold 50 | source = threshold(source_pitch, source_periodicity) 51 | target = threshold(target_pitch, target_periodicity) 52 | 53 | # Bound pitch 54 | source = torch.clamp(source, clpcnet.FMIN, clpcnet.FMAX) 55 | target = torch.clamp(target, clpcnet.FMIN, clpcnet.FMAX) 56 | 57 | # Compute metrics 58 | metrics.update(source, target) 59 | 60 | # Compute aggregate metrics over files 61 | return metrics() 62 | 63 | 64 | class PitchMetrics: 65 | """Batch update pitch metrics""" 66 | 67 | gpe_20_threshold = 20. / 1200. 68 | gpe_50_threshold = 50. / 1200. 69 | 70 | def __init__(self, gpe_threshold=50): 71 | self.true_positives = 0 72 | self.false_positives = 0 73 | self.false_negatives = 0 74 | self.sum = 0. 
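        # Gross pitch error (GPE) counters; the class-level thresholds above
        # are 20 and 50 cents expressed in octaves (1200 cents per octave),
        # matching the base-2 log-pitch differences accumulated in update()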
75 | self.gpe_20_count = 0 76 | self.gpe_50_count = 0 77 | self.count = 0 78 | self.differences = [] 79 | 80 | def __call__(self): 81 | """Compute the aggregate rmse, precision, recall, and f1""" 82 | precision = \ 83 | self.true_positives / (self.true_positives + self.false_positives) 84 | recall = \ 85 | self.true_positives / (self.true_positives + self.false_negatives) 86 | f1 = 2 * precision * recall / (precision + recall) 87 | rmse = 1200 * math.sqrt(self.sum / self.count) 88 | gpe_20 = self.gpe_20_count / self.count 89 | gpe_50 = self.gpe_50_count / self.count 90 | differences = 1200 * torch.cat(self.differences) 91 | return rmse, precision, recall, f1, gpe_20, gpe_50, differences 92 | 93 | def update(self, source, target): 94 | """Update the rmse, precision, recall, and f1""" 95 | source_voiced = ~torch.isnan(source) 96 | target_voiced = ~torch.isnan(target) 97 | overlap = source_voiced & target_voiced 98 | differences = torch.log2(source[overlap]) - torch.log2(target[overlap]) 99 | self.true_positives += overlap.sum().item() 100 | self.false_positives += (~source_voiced & target_voiced).sum().item() 101 | self.false_negatives += (source_voiced & ~target_voiced).sum().item() 102 | self.sum += (differences ** 2).sum().item() 103 | self.gpe_20_count += (differences.abs() > self.gpe_20_threshold).sum().item() 104 | self.gpe_50_count += (differences.abs() > self.gpe_50_threshold).sum().item() 105 | self.count += source.numel() 106 | self.differences.append(differences) 107 | -------------------------------------------------------------------------------- /clpcnet/evaluate/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def write_histogram(file, histogram): 5 | """Create and write pitch histogram""" 6 | plt.figure() 7 | plt.hist(histogram.numpy(), bins=50) 8 | plt.title('Log pitch error distribution in voiced regions') 9 | plt.xlabel('Log pitch deviation') 10 | plt.ylabel('Frequency') 11 | plt.savefig(file) 12 | plt.close() 13 | -------------------------------------------------------------------------------- /clpcnet/evaluate/prosody.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import random 3 | import warnings 4 | 5 | import numpy as np 6 | import pyfoal 7 | import pypar 8 | import soundfile 9 | 10 | import clpcnet 11 | 12 | 13 | ############################################################################### 14 | # Prosody transfer representation 15 | ############################################################################### 16 | 17 | 18 | class ProsodyTransfer: 19 | """Representation for a prosody transfer task""" 20 | 21 | def __init__(self, name, text, source_audio, target_audio, gpu=None): 22 | self.name = name 23 | self.text = text 24 | self.source_audio = source_audio 25 | self.target_audio = target_audio 26 | self.gpu = gpu 27 | 28 | def is_valid(self): 29 | """Check if phoneme alignments match""" 30 | # Get phoneme alignments 31 | source = self.source_alignment() 32 | target = self.target_alignment() 33 | 34 | # Get phonemes 35 | source_phonemes = source.phonemes() 36 | target_phonemes = target.phonemes() 37 | 38 | # Length and phoneme checks 39 | iterator = zip(source_phonemes, target_phonemes) 40 | if len(source_phonemes) != len(target_phonemes) or \ 41 | any([str(s) != str(t) for s, t in iterator]): 42 | return False 43 | 44 | # Relative rate check 45 | rates = np.array(pypar.compare.per_phoneme_rate(source, target)) 
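        # Reject pairs in which any phoneme's relative duration ratio falls
        # outside [0.25, 4]; such pairs would require extreme time-stretching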
46 | if any(rates > 4.) or any(rates < .25): 47 | return False 48 | 49 | # Get pitch 50 | source_pitch, source_harm = self.source_pitch(return_periodicity=True) 51 | target_pitch = self.target_pitch() 52 | 53 | # Invert target 54 | aligned_pitch = clpcnet.pitch.align(source_pitch, 55 | target_pitch, 56 | source, 57 | target) 58 | 59 | # Threshold 60 | source_pitch = clpcnet.pitch.threshold(source_pitch, source_harm) 61 | 62 | # Extract voiced 63 | voiced = ~np.isnan(source_pitch) 64 | source = source_pitch[voiced] 65 | target = aligned_pitch[voiced] 66 | 67 | # Pitch range check 68 | if any(source > 400) or any(source < 65) or \ 69 | any(target > 400) or any(target < 65): 70 | return False 71 | 72 | # Pitch shift range check 73 | rate = np.abs(target / source) 74 | return all(rate <= 2.5) and all(rate >= .4) 75 | 76 | 77 | @classmethod 78 | def from_file(cls, prefix, gpu=None): 79 | """Load from disk""" 80 | # Load text 81 | with open(prefix.with_suffix('.txt')) as file: 82 | text = file.read() 83 | 84 | # Load audio 85 | source_audio = clpcnet.load.audio( 86 | prefix.parent / (prefix.stem + '-source.wav')) 87 | target_audio = clpcnet.load.audio( 88 | prefix.parent / (prefix.stem + '-target.wav')) 89 | 90 | # Make transfer 91 | return cls(prefix.stem, text, source_audio, target_audio, gpu) 92 | 93 | def save(self, directory): 94 | """Save audio files to directory""" 95 | prefix = directory / f'{self.name}' 96 | 97 | # Save text 98 | with open(prefix.parent / (prefix.stem + '.txt'), 'w') as file: 99 | file.write(self.text) 100 | 101 | # Save audio 102 | soundfile.write(prefix.parent / (prefix.stem + '-source.wav'), 103 | self.source_audio, 104 | clpcnet.SAMPLE_RATE) 105 | soundfile.write(prefix.parent / (prefix.stem + '-target.wav'), 106 | self.target_audio, 107 | clpcnet.SAMPLE_RATE) 108 | 109 | def source_alignment(self): 110 | """Retrieve the source alignment""" 111 | if not hasattr(self, '_source_alignment'): 112 | self._source_alignment = pyfoal.align(self.text, 113 | self.source_audio, 114 | clpcnet.SAMPLE_RATE) 115 | return self._source_alignment 116 | 117 | def source_pitch(self, return_periodicity=False): 118 | """Retrieve the source pitch""" 119 | if not hasattr(self, '_source_pitch'): 120 | self._source_pitch = clpcnet.pitch.from_audio(self.source_audio, 121 | self.gpu) 122 | return \ 123 | self._source_pitch if return_periodicity else self._source_pitch[0] 124 | 125 | def target_alignment(self): 126 | """Retrieve the target alignment""" 127 | if not hasattr(self, '_target_alignment'): 128 | self._target_alignment = pyfoal.align(self.text, 129 | self.target_audio, 130 | clpcnet.SAMPLE_RATE) 131 | return self._target_alignment 132 | 133 | def target_pitch(self, return_periodicity=False): 134 | """Retrieve the target pitch""" 135 | if not hasattr(self, '_target_pitch'): 136 | self._target_pitch = clpcnet.pitch.from_audio(self.target_audio, 137 | self.gpu) 138 | return \ 139 | self._target_pitch if return_periodicity else self._target_pitch[0] 140 | 141 | 142 | ############################################################################### 143 | # Dataset generators 144 | ############################################################################### 145 | 146 | 147 | def ravdess_generator(directory, gpu=None): 148 | """Generator over examples in ravdess dataset""" 149 | # Get audio files 150 | files = sorted(directory.glob('Actor_*/*.wav')) 151 | 152 | # Get file metadata 153 | metadata = [RavdessFileMetadata(f) for f in files] 154 | 155 | # Filter out high intensity 156 | metadata 
= [m for m in metadata if m.intensity == 1]
157 | 
158 |     # Statement text
159 |     text = {1: 'Kids are talking by the door',
160 |             2: 'Dogs are sitting by the door'}
161 | 
162 |     # We make five matches per statement per speaker. There are 20 speakers
163 |     # that satisfy this given our filtering, for a total of 200 matches.
164 |     for speaker in range(1, 25):
165 | 
166 |         # Skip speakers that cannot produce 5 matches
167 |         if speaker in [4, 9, 14, 20]:
168 |             continue
169 | 
170 |         for statement in range(1, 3):
171 | 
172 |             # Get relevant files
173 |             candidates = [
174 |                 m for m in metadata
175 |                 if m.actor == speaker and m.statement == statement]
176 | 
177 |             # Iterate over unique pairs in random order
178 |             matches = 0
179 |             iterator = list(itertools.combinations(candidates, 2))
180 |             random.shuffle(iterator)
181 |             for sample_a, sample_b in iterator:
182 | 
183 |                 # Create match
184 |                 transfer = ProsodyTransfer(
185 |                     f'{sample_a.file.stem}_{sample_b.file.stem}',
186 |                     text[statement],
187 |                     clpcnet.load.audio(sample_a.file),
188 |                     clpcnet.load.audio(sample_b.file),
189 |                     gpu=gpu)
190 | 
191 |                 # Check if phoneme alignments match
192 |                 if transfer.is_valid():
193 |                     yield transfer
194 | 
195 |                     # Check if we've made enough matches
196 |                     matches += 1
197 |                     if matches == 5:
198 |                         break
199 | 
200 |             # Warn if we couldn't find enough matches
201 |             if matches != 5:
202 |                 warnings.warn(f'Can only find {matches} of 5 matches')
203 |                 # raise ValueError(f'Can only find {matches} of 5 matches')
204 | 
205 | 
206 | ###############################################################################
207 | # Utilities
208 | ###############################################################################
209 | 
210 | 
211 | class RavdessFileMetadata:
212 |     """Parses the filename into metadata"""
213 | 
214 |     def __init__(self, file):
215 |         self.file = file
216 | 
217 |         entries = file.stem.split('-')
218 |         self.modality = int(entries[0])
219 |         self.channel = int(entries[1])
220 |         self.emotion = int(entries[2])
221 |         self.intensity = int(entries[3])
222 |         self.statement = int(entries[4])
223 |         self.repetition = int(entries[5])
224 |         self.actor = int(entries[6])
225 | 
-------------------------------------------------------------------------------- /clpcnet/evaluate/subjective/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/clpcnet/evaluate/subjective/__init__.py -------------------------------------------------------------------------------- /clpcnet/evaluate/subjective/constant.py: -------------------------------------------------------------------------------- 1 | import argparse
2 | import multiprocessing as mp
3 | from pathlib import Path
4 | 
5 | import soundfile
6 | 
7 | import clpcnet
8 | 
9 | 
10 | ###############################################################################
11 | # Constants
12 | ###############################################################################
13 | 
14 | 
15 | DURATION_RATIOS = [50, 71, 100, 141, 200]
16 | PITCH_RATIOS = [67, 80, 100, 125, 150]
17 | 
18 | 
19 | ###############################################################################
20 | # Subjective evaluation generation
21 | ###############################################################################
22 | 
23 | 
24 | def generate(directory,
25 |              run='clpcnet',
26 |              checkpoint=clpcnet.DEFAULT_CHECKPOINT,
27 |              gpu=None):
28 |     """Prepare files for subjective evaluation on daps"""
29 |     # Get daps files for evaluation
30 |     files = clpcnet.data.files('daps-segmented', directory, 'test')
31 | 
32 |     # Setup output directory
33 |     output_directory = clpcnet.EVAL_DIR / \
34 |                        'subjective' / \
35 |                        'constant' / \
36 |                        'daps-segmented'
37 |     output_directory.mkdir(exist_ok=True, parents=True)
38 | 
39 |     # Setup multiprocessing
40 |     pool = mp.get_context('spawn').Pool()
41 | 
42 |     # Generate pitch-shifting examples
43 |     generate_pitch(output_directory / 'constant-pitch',
44 |                    files,
45 |                    pool,
46 |                    run,
47 |                    checkpoint,
48 |                    gpu)
49 | 
50 |     # Generate time-stretching examples
51 |     generate_duration(output_directory / 'constant-duration',
52 |                       files,
53 |                       pool,
54 |                       run,
55 |                       checkpoint)
56 | 
57 |     # Close the pool and wait until processes finish
58 |     pool.close()
59 |     pool.join()
60 | 
61 |     # Convert to mp3
62 |     wavfiles = list(output_directory.rglob('*.wav'))
63 |     clpcnet.mp3.convert_files(wavfiles)
64 | 
65 |     # Remove wav files
66 |     for file in wavfiles:
67 |         file.unlink()
68 | 
69 | 
70 | ###############################################################################
71 | # Constant-rate duration generation
72 | ###############################################################################
73 | 
74 | 
75 | def generate_duration(output_directory,
76 |                       files,
77 |                       pool,
78 |                       run='clpcnet',
79 |                       checkpoint=clpcnet.DEFAULT_CHECKPOINT):
80 |     """Prepare constant-rate time-stretching files for subjective evaluation"""
81 |     # Setup output directory
82 |     original_directory = output_directory / 'original'
83 |     original_directory.mkdir(exist_ok=True, parents=True)
84 | 
85 |     # Iterate over utterances
86 |     for file in files:
87 | 
88 |         # Write original audio
89 |         original_file = \
90 |             original_directory / \
91 |             f'constant-duration_original_{file.stem.replace("_", "-")}.wav'
92 |         soundfile.write(original_file,
93 |                         clpcnet.load.audio(file),
94 |                         clpcnet.SAMPLE_RATE)
95 | 
96 |         # Constant shifting with lpcnet
97 |         pool.apply_async(generate_duration_lpcnet,
98 |                          (file, output_directory, run, checkpoint))
99 |         # generate_duration_lpcnet(file, output_directory, run, checkpoint)
100 | 
101 | 
102 | def generate_duration_lpcnet(file,
103 |                              output_directory,
104 |                              run='clpcnet',
105 |                              checkpoint=clpcnet.DEFAULT_CHECKPOINT):
106 |     """Generate examples using lpcnet"""
107 |     for ratio in DURATION_RATIOS:
108 | 
109 |         # Get run name
110 |         name = f'{run}-{ratio:03d}'
111 | 
112 |         # Make output directory
113 |         directory = output_directory / name
114 |         directory.mkdir(exist_ok=True, parents=True)
115 | 
116 |         # Generate
117 |         output_file = directory / \
118 |             f'constant-duration_{name}_{file.stem.replace("_", "-")}.wav'
119 |         clpcnet.from_file_to_file(file,
120 |                                   output_file,
121 |                                   constant_stretch=ratio / 100.,
122 |                                   checkpoint_file=checkpoint,
123 |                                   verbose=False)
124 | 
125 | 
126 | ###############################################################################
127 | # Constant-rate pitch generation
128 | ###############################################################################
129 | 
130 | 
131 | def generate_pitch(output_directory,
132 |                    files,
133 |                    pool,
134 |                    run='clpcnet',
135 |                    checkpoint=clpcnet.DEFAULT_CHECKPOINT,
136 |                    gpu=None):
137 |     """Prepare constant-rate pitch-shifting files for subjective evaluation"""
138 |     # Setup output directory
139 |     original_directory = output_directory / 'original'
140 |     original_directory.mkdir(exist_ok=True, parents=True)
141 | 
142 |     # Iterate over utterances
143 |     for file in files:
144 | 
145 |         # Write original audio
146 |         original_file = \
147 |             original_directory / \
148 | 
f'constant-pitch_original_{file.stem.replace("_", "-")}.wav' 149 | soundfile.write(original_file, 150 | clpcnet.load.audio(file), 151 | clpcnet.SAMPLE_RATE) 152 | 153 | # Constant shifting with lpcnet 154 | pool.apply_async(generate_pitch_lpcnet, 155 | (file, output_directory, run, checkpoint)) 156 | # generate_pitch_lpcnet(file, output_directory, run, checkpoint) 157 | 158 | 159 | def generate_pitch_lpcnet(file, 160 | output_directory, 161 | run='clpcnet', 162 | checkpoint=clpcnet.DEFAULT_CHECKPOINT): 163 | """Generate examples using lpcnet""" 164 | for ratio in PITCH_RATIOS: 165 | 166 | # Get run name 167 | name = f'{run}-{ratio:03d}' 168 | 169 | # Make output directory 170 | directory = output_directory / name 171 | directory.mkdir(exist_ok=True, parents=True) 172 | 173 | # Generate 174 | output_file = \ 175 | directory / \ 176 | f'constant-pitch_{name}_{file.stem.replace("_", "-")}.wav' 177 | clpcnet.from_file_to_file(file, 178 | output_file, 179 | constant_shift=ratio / 100., 180 | checkpoint_file=checkpoint, 181 | verbose=False) 182 | 183 | 184 | ############################################################################### 185 | # Entry point 186 | ############################################################################### 187 | 188 | 189 | def parse_args(): 190 | """Parse command-line arguments""" 191 | parser = argparse.ArgumentParser() 192 | parser.add_argument( 193 | '--directory', 194 | type=Path, 195 | default=clpcnet.DATA_DIR / 'daps-segmented', 196 | help='The root directory of the segmented daps dataset') 197 | parser.add_argument( 198 | '--run', 199 | default='clpcnet', 200 | help='The evaluation run') 201 | parser.add_argument( 202 | '--checkpoint', 203 | type=Path, 204 | default=clpcnet.DEFAULT_CHECKPOINT, 205 | help='The checkpoint to use') 206 | parser.add_argument( 207 | '--gpu', 208 | type=int, 209 | default=None, 210 | help='The gpu to use for pitch estimation') 211 | 212 | return parser.parse_args() 213 | 214 | 215 | if __name__ == '__main__': 216 | generate(**vars(parse_args())) 217 | -------------------------------------------------------------------------------- /clpcnet/evaluate/subjective/variable.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import multiprocessing as mp 4 | from pathlib import Path 5 | 6 | import pyfoal 7 | import soundfile 8 | import tqdm 9 | 10 | import clpcnet 11 | 12 | 13 | ############################################################################### 14 | # Constants 15 | ############################################################################### 16 | 17 | 18 | DEFAULT_DIRECTORY = clpcnet.DATA_DIR / 'ravdess-hifi' 19 | DEFAULT_OUTPUT_DIRECTORY = clpcnet.EVAL_DIR / \ 20 | 'subjective' / \ 21 | 'variable' / \ 22 | 'ravdess-hifi' 23 | 24 | 25 | ############################################################################### 26 | # Variable-rate pitch shifting 27 | ############################################################################### 28 | 29 | 30 | def evaluate(directory=DEFAULT_DIRECTORY, 31 | output_directory=DEFAULT_OUTPUT_DIRECTORY, 32 | run='clpcnet', 33 | checkpoint=clpcnet.DEFAULT_CHECKPOINT, 34 | gpu=None): 35 | """Evaluate variable-rate pitch shifting on ravdess""" 36 | # Get list of examples to generate 37 | with open(clpcnet.data.partition_file('ravdess-variable')) as file: 38 | pairs = json.load(file)['test'] 39 | 40 | # Setup output directory 41 | original_directory = output_directory / 'original' 42 | run_directory = 
output_directory / run 43 | original_directory.mkdir(exist_ok=True, parents=True) 44 | run_directory.mkdir(exist_ok=True, parents=True) 45 | 46 | # Setup multiprocessing 47 | pool = mp.get_context('spawn').Pool() 48 | 49 | # Iterate over pairs 50 | for pair in tqdm.tqdm(pairs): 51 | 52 | # Load text 53 | statement = pair[0].split('-')[4] 54 | text_file = clpcnet.ASSETS_DIR / 'text' / 'ravdess' / f'{statement}.txt' 55 | with open(text_file) as file: 56 | text = file.read() 57 | 58 | # Load audio 59 | source_file = clpcnet.data.stem_to_file('ravdess-variable', 60 | directory, 61 | pair[0]) 62 | target_file = clpcnet.data.stem_to_file('ravdess-variable', 63 | directory, 64 | pair[1]) 65 | source = clpcnet.load.audio(source_file) 66 | target = clpcnet.load.audio(target_file) 67 | 68 | # Compute pitch 69 | target_pitch, _ = clpcnet.pitch.from_audio(target, gpu) 70 | 71 | # Compute alignment 72 | source_alignment = pyfoal.align(text, source, clpcnet.SAMPLE_RATE) 73 | target_alignment = pyfoal.align(text, target, clpcnet.SAMPLE_RATE) 74 | 75 | # Output file template 76 | template = 'variable_{}_' + f'{pair[0]}-{pair[1]}.wav' 77 | 78 | # Generate with clpcnet 79 | clpcnet_file = run_directory / template.format(run) 80 | args = (clpcnet_file, source) 81 | kwargs = {'source_alignment': source_alignment, 82 | 'target_alignment': target_alignment, 83 | 'target_pitch': target_pitch, 84 | 'checkpoint_file': checkpoint, 85 | 'verbose': False} 86 | pool.apply_async(clpcnet.to_file, args, kwargs) 87 | # clpcnet.to_file(*args, **kwargs) 88 | 89 | # Write original file 90 | original_file = original_directory / template.format('original') 91 | soundfile.write(original_file, target, clpcnet.SAMPLE_RATE) 92 | 93 | # Close multiprocessing pool and wait for processes to finish 94 | pool.close() 95 | pool.join() 96 | 97 | # Convert to mp3 98 | wavfiles = list(output_directory.rglob('*.wav')) 99 | clpcnet.mp3.convert_files(wavfiles) 100 | 101 | # Remove wav files 102 | for file in wavfiles: 103 | file.unlink() 104 | 105 | 106 | ############################################################################### 107 | # Entry point 108 | ############################################################################### 109 | 110 | 111 | def parse_args(): 112 | """Parse command-line arguments""" 113 | parser = argparse.ArgumentParser() 114 | parser.add_argument( 115 | '--directory', 116 | type=Path, 117 | default=DEFAULT_DIRECTORY, 118 | help='Root directory of the ravdess dataset') 119 | parser.add_argument( 120 | '--output_directory', 121 | type=Path, 122 | default=DEFAULT_OUTPUT_DIRECTORY, 123 | help='The location to store files for subjective evaluation') 124 | parser.add_argument( 125 | '--run', 126 | default='clpcnet', 127 | help='The evaluation run') 128 | parser.add_argument( 129 | '--checkpoint', 130 | type=Path, 131 | default=clpcnet.DEFAULT_CHECKPOINT, 132 | help='The checkpoint to use') 133 | parser.add_argument( 134 | '--gpu', 135 | type=int, 136 | default=None, 137 | help='The index of the gpu to use') 138 | return parser.parse_args() 139 | 140 | 141 | if __name__ == '__main__': 142 | evaluate(**vars(parse_args())) 143 | -------------------------------------------------------------------------------- /clpcnet/load.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import soundfile 3 | 4 | import clpcnet 5 | 6 | 7 | def audio(file): 8 | """Load audio from disk 9 | 10 | Arguments 11 | file : string 12 | The audio file to load 13 | 14 | Returns 15 | 
audio : np.array(shape=(samples,))
16 |             The audio
17 |     """
18 |     # Load
19 |     audio, sample_rate = soundfile.read(file)
20 | 
21 |     # Convert to mono if necessary
22 |     if audio.ndim == 2:
23 |         if audio.shape[1] == 2:
24 |             audio = audio.mean(1)
25 |         else:
26 |             audio = audio.squeeze()
27 | 
28 |     # Resample
29 |     return clpcnet.preprocess.resample(audio, sample_rate)
30 | 
31 | 
32 | def features(file):
33 |     """Load frame-rate features from disk for inference
34 | 
35 |     Arguments
36 |         file : string
37 |             The feature file
38 | 
39 |     Returns
40 |         features : np.array(shape=(frames, clpcnet.TOTAL_FEATURE_SIZE))
41 |     """
42 |     # Load test features
43 |     features = np.fromfile(file, dtype=np.float32)
44 | 
45 |     # shape=(time, channels)
46 |     features = np.reshape(features, (-1, clpcnet.TOTAL_FEATURE_SIZE))
47 | 
48 |     # Zero-out unused bark-scale coefficients
49 |     features[:, 18:36] = 0
50 |     return features[None]
51 | 
52 | 
53 | def model(file=clpcnet.DEFAULT_CHECKPOINT, gpu=None):
54 |     """Setup the LPCNet model for inference
55 | 
56 |     Arguments
57 |         file : string
58 |             The model weight file
59 |         gpu : int or None
60 |             The index of the gpu to use
61 |     """
62 |     # Bind to generate function
63 |     clpcnet.from_features.session = clpcnet.Session(file, gpu)
64 | 
65 | 
66 | def yin(file):
67 |     """Load yin pitch and periodicity from file"""
68 |     # Load features
69 |     yin_features = features(file)
70 | 
71 |     # Slice yin pitch and periodicity
72 |     return yin_features[0, :, clpcnet.PITCH_IDX], \
73 |            yin_features[0, :, clpcnet.CORRELATION_IDX]
74 | 
-------------------------------------------------------------------------------- /clpcnet/loudness.py: -------------------------------------------------------------------------------- 1 | import warnings
2 | 
3 | import librosa
4 | import numpy as np
5 | 
6 | import clpcnet
7 | 
8 | 
9 | ###############################################################################
10 | # A-weighted loudness
11 | ###############################################################################
12 | 
13 | 
14 | def a_weighted(audio, n_fft=1024, min_db=-100.):
15 |     """Retrieve the per-frame loudness"""
16 |     # Cache weights so long as n_fft doesn't change
17 |     if not hasattr(a_weighted, 'weights') or \
18 |        (hasattr(a_weighted, 'n_fft') and a_weighted.n_fft != n_fft):
19 |         a_weighted.weights = perceptual_weights(n_fft)
20 |         a_weighted.n_fft = n_fft
21 | 
22 |     # Take stft
23 |     stft = librosa.stft(audio,
24 |                         n_fft=n_fft,
25 |                         hop_length=clpcnet.HOPSIZE,
26 |                         win_length=n_fft,
27 |                         pad_mode='constant')
28 | 
29 |     # Compute magnitude on db scale
30 |     db = librosa.amplitude_to_db(np.abs(stft))
31 | 
32 |     # Apply A-weighting
33 |     weighted = db + a_weighted.weights
34 | 
35 |     # Threshold
36 |     weighted[weighted < min_db] = min_db
37 | 
38 |     # Average over weighted frequencies
39 |     return weighted.mean(axis=0)
40 | 
41 | 
42 | def perceptual_weights(n_fft=1024, ref_db=20.):
43 |     """A-weighted frequency-dependent perceptual loudness weights"""
44 |     frequencies = librosa.fft_frequencies(sr=clpcnet.SAMPLE_RATE, n_fft=n_fft)
45 | 
46 |     # A warning is raised for nearly inaudible frequencies, but it ends up
47 |     # defaulting to -100 db. That default is fine for our purposes.
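    # librosa.A_weighting returns a per-frequency gain in dB; subtracting
    # ref_db shifts the reference level, and the trailing [:, None] lets the
    # weights broadcast over the (frequencies, frames) magnitude array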
48 |     with warnings.catch_warnings():
49 |         warnings.simplefilter('ignore', RuntimeWarning)
50 |         return librosa.A_weighting(frequencies)[:, None] - ref_db
51 | 
52 | 
53 | ###############################################################################
54 | # Utilities
55 | ###############################################################################
56 | 
57 | 
58 | def limit(audio, delay=40, attack_coef=.9, release_coef=.9995, threshold=.99):
59 |     """Apply a limiter to prevent clipping"""
60 |     # Delay compensation
61 |     audio = np.pad(audio, (0, delay - 1))
62 | 
63 |     current_gain = 1.
64 |     delay_index = 0
65 |     delay_line = np.zeros(delay)
66 |     envelope = 0
67 | 
68 |     for idx, sample in enumerate(audio):
69 |         # Update signal history
70 |         delay_line[delay_index] = sample
71 |         delay_index = (delay_index + 1) % delay
72 | 
73 |         # Calculate envelope
74 |         envelope = max(abs(sample), envelope * release_coef)
75 | 
76 |         # Calculate gain
77 |         target_gain = threshold / envelope if envelope > threshold else 1.
78 |         current_gain = \
79 |             current_gain * attack_coef + target_gain * (1 - attack_coef)
80 | 
81 |         # Apply gain
82 |         audio[idx] = delay_line[delay_index] * current_gain
83 | 
84 |     return audio[delay - 1:]
85 | 
-------------------------------------------------------------------------------- /clpcnet/model.py: -------------------------------------------------------------------------------- 1 | '''Copyright (c) 2018 Mozilla
2 | Modified by Max Morrison
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions
6 | are met:
7 | 
8 | - Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 | 
11 | - Redistributions in binary form must reproduce the above copyright
12 | notice, this list of conditions and the following disclaimer in the
13 | documentation and/or other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | ''' 27 | import functools 28 | import math 29 | import os 30 | import sys 31 | 32 | # Import keras without printing backend 33 | stderr = sys.stderr 34 | sys.stderr = open(os.devnull, 'w') 35 | import keras 36 | sys.stderr = stderr 37 | 38 | import numpy as np 39 | from keras import backend as K 40 | from keras.layers import Concatenate, Input, Reshape 41 | 42 | import clpcnet 43 | 44 | 45 | ############################################################################### 46 | # LPCNet model construction 47 | ############################################################################### 48 | 49 | 50 | def model(training=False, use_gpu=True): 51 | """Build the LPCNet model""" 52 | 53 | ########################################################################### 54 | # Inputs 55 | ########################################################################### 56 | 57 | # Signal, prediction, and excitation inputs 58 | sample_rate_feats = Input(shape=(None, 3)) 59 | 60 | # Bark-scale coefficients and pitch correlation 61 | spectral_feats = Input(shape=(None, clpcnet.SPECTRAL_FEATURE_SIZE)) 62 | 63 | # Pitch period 64 | pitch = Input(shape=(None, 1)) 65 | 66 | ########################################################################### 67 | # Create graph 68 | ########################################################################### 69 | 70 | # Build and link frame-rate network 71 | frame_rate_feats = frame_rate_network(spectral_feats, pitch, training) 72 | 73 | # Build and add sample-rate network 74 | probabilities, decoder_model = sample_rate_network( 75 | frame_rate_feats, sample_rate_feats, use_gpu) 76 | 77 | # Build lpcnet model 78 | model = keras.models.Model([sample_rate_feats, spectral_feats, pitch], 79 | probabilities) 80 | 81 | # Build encoder model 82 | encoder_model = encoder(spectral_feats, pitch, frame_rate_feats) 83 | 84 | return model, encoder_model, decoder_model 85 | 86 | 87 | ############################################################################### 88 | # Model components 89 | ############################################################################### 90 | 91 | 92 | def decoder(sample_rate_feats, 93 | sample_rate_embedding, 94 | gru_a, 95 | gru_b, 96 | dual_dense): 97 | """Build the LPCNet decoder""" 98 | 99 | ########################################################################### 100 | # Inputs 101 | ########################################################################### 102 | 103 | # Frame-rate features upsampled to the sampling rate 104 | upsampled = Input(shape=(None, 128)) 105 | 106 | # GRU A initial state 107 | gru_a_init = Input(shape=(clpcnet.GRU_A_SIZE,)) 108 | 109 | # GRU B initial state 110 | gru_b_init = Input(shape=(clpcnet.GRU_B_SIZE,)) 111 | 112 | ########################################################################### 113 | # Link 114 | ########################################################################### 115 | 116 | # Concatenate sample-rate and upsampled frame-rate features 117 | all_sample_rate_feats = Concatenate()([sample_rate_embedding, upsampled]) 118 | 119 | # Add sample-rate gru A to graph 120 | activation, gru_a_state = gru_a(all_sample_rate_feats, 121 | initial_state=gru_a_init) 122 | 123 | # Residual connection between upsampled features and rnn output 124 | # Note: this is NOT in the original LPCNet paper, but is in the code 125 | activation = Concatenate()([activation, upsampled]) 126 | 127 | # Add sample-rate gru B to graph 128 | activation, gru_b_state = gru_b(activation, 129 | initial_state=gru_b_init) 130 | 131 | # 
Add dual fully-connected layer to graph 132 | probabilities = dual_dense(activation) 133 | 134 | # Specify model start and end points 135 | inputs = [sample_rate_feats, upsampled, gru_a_init, gru_b_init] 136 | outputs = [probabilities, gru_a_state, gru_b_state] 137 | 138 | return keras.models.Model(inputs, outputs) 139 | 140 | 141 | def encoder(spectral_feats, pitch, frame_rate_feats): 142 | """Create the LPCNet encoder""" 143 | return keras.models.Model([spectral_feats, pitch], frame_rate_feats) 144 | 145 | 146 | def frame_rate_network(spectral_feats, pitch, training=False): 147 | """Create the LPCNet frame-rate network""" 148 | 149 | ########################################################################### 150 | # Build 151 | ########################################################################### 152 | 153 | # Pitch embedding table 154 | pitch_embedding_table = keras.layers.Embedding( 155 | clpcnet.PITCH_BINS, 64, name='embed_pitch') 156 | 157 | # 1d convolutions 158 | conv_fn = functools.partial(keras.layers.Conv1D, 159 | 128, 160 | 3, 161 | padding='valid' if training else 'same', 162 | activation='tanh') 163 | conv1, conv2 = conv_fn(name='feature_conv1'), conv_fn(name='feature_conv2') 164 | 165 | # Dense layers 166 | dense_fn = functools.partial(keras.layers.Dense, 128, activation='tanh') 167 | dense1 = dense_fn(name='feature_dense1') 168 | dense2 = dense_fn(name='feature_dense2') 169 | 170 | ########################################################################### 171 | # Link 172 | ########################################################################### 173 | 174 | # Embed pitch 175 | pitch_embedding = Reshape((-1, 64))(pitch_embedding_table(pitch)) 176 | 177 | # Join frame-rate features 178 | features = Concatenate()([spectral_feats, pitch_embedding]) 179 | 180 | # Convolution layer forward pass 181 | activation = conv2(conv1(features)) 182 | 183 | # Dense layer forward pass 184 | # Note: The residual connection shown in the paper was later found 185 | # to be harmful. Therefore, it is omitted. 
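    # The full frame-rate path is: concat(spectral features, 64-dim pitch
    # embedding) -> conv1 -> conv2 -> dense1 -> dense2, producing the 128-dim
    # per-frame conditioning features consumed by the sample-rate network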
186 | return dense2(dense1(activation)) 187 | 188 | 189 | def sample_rate_network(frame_rate_feats, sample_rate_feats, use_gpu=True): 190 | """Create the LPCNet sample-rate network""" 191 | 192 | ########################################################################### 193 | # Build 194 | ########################################################################### 195 | 196 | # PCM sample embedding table 197 | sample_rate_embedding_table = keras.layers.Embedding( 198 | clpcnet.PCM_LEVELS, 199 | clpcnet.EMBEDDING_SIZE, 200 | embeddings_initializer=sample_rate_embedding_initializer, 201 | name='embed_sig') 202 | 203 | # Upsampler 204 | repeat = keras.layers.Lambda( 205 | lambda x: K.repeat_elements(x, clpcnet.HOPSIZE, 1)) 206 | 207 | # Get gru function based on compute 208 | if use_gpu: 209 | gru_fn = functools.partial( 210 | keras.layers.CuDNNGRU, return_sequences=True, return_state=True) 211 | else: 212 | gru_fn = functools.partial(keras.layers.GRU, 213 | return_sequences=True, 214 | return_state=True, 215 | recurrent_activation='sigmoid', 216 | reset_after='true') 217 | 218 | # Gru layers 219 | gru_a = gru_fn(clpcnet.GRU_A_SIZE, name='gru_a') 220 | gru_b = gru_fn(clpcnet.GRU_B_SIZE, name='gru_b') 221 | 222 | # Dual fully-connected layer 223 | dual_dense = DualDense( 224 | clpcnet.PCM_LEVELS, activation='softmax', name='dual_fc') 225 | 226 | ########################################################################### 227 | # Link 228 | ########################################################################### 229 | 230 | # Embed Audio 231 | sample_rate_embedding = sample_rate_embedding_table(sample_rate_feats) 232 | sample_rate_embedding = \ 233 | Reshape((-1, 3 * clpcnet.EMBEDDING_SIZE))(sample_rate_embedding) 234 | 235 | # Upsample the frame-rate features to the sampling rate 236 | upsampled = repeat(frame_rate_feats) # Residual connection --------------- 237 | # | 238 | # Concatenate sample-rate and upsampled frame-rate features # | 239 | all_sample_rate_feats = Concatenate()( # | 240 | [sample_rate_embedding, upsampled]) # | 241 | # | 242 | # Add sample-rate gru A to graph. 
# | 243 | activation = gru_a(all_sample_rate_feats)[0] # | 244 | # | 245 | # Residual connection between upsampled features and rnn output # | 246 | # Note: this is NOT in the original LPCNet paper # | 247 | activation = Concatenate()([activation, upsampled]) # <------------------- 248 | 249 | # Add sample-rate gru B to graph 250 | activation = gru_b(activation)[0] 251 | 252 | # Add dual fully-connected layer to graph 253 | probabilities = dual_dense(activation) 254 | 255 | # Reuse components to build decoder model 256 | decoder_model = decoder(sample_rate_feats, 257 | sample_rate_embedding, 258 | gru_a, 259 | gru_b, 260 | dual_dense) 261 | 262 | return probabilities, decoder_model 263 | 264 | 265 | ############################################################################### 266 | # Custom keras layer 267 | ############################################################################### 268 | 269 | 270 | class DualDense(keras.layers.Layer): 271 | """Dual fully-connected layer""" 272 | 273 | channels = 2 274 | 275 | def __init__(self, output_size, activation=None, name=None): 276 | super().__init__(name=name) 277 | self.output_size = output_size 278 | self.activation = keras.activations.get(activation) 279 | 280 | # Network weights 281 | self.kernel, self.bias, self.factor = None, None, None 282 | 283 | def build(self, input_shape): 284 | """Initialize the DualDense layer weights""" 285 | assert len(input_shape) >= 2 286 | 287 | # Kernel 288 | kernel_shape = (self.output_size, input_shape[-1], self.channels) 289 | self.kernel = self.add_weight( 290 | name='kernel', 291 | shape=kernel_shape, 292 | initializer=keras.initializers.get('glorot_uniform'), 293 | regularizer=keras.regularizers.get(None), 294 | constraint=keras.constraints.get(None)) 295 | 296 | # Bias 297 | bias_shape = (self.output_size, self.channels) 298 | self.bias = self.add_weight( 299 | name='bias', 300 | shape=bias_shape, 301 | initializer=keras.initializers.get('zeros'), 302 | regularizer=keras.regularizers.get(None), 303 | constraint=keras.constraints.get(None)) 304 | 305 | # Learned scale factor 306 | self.factor = self.add_weight( 307 | name='factor', 308 | shape=bias_shape, 309 | initializer=keras.initializers.get('ones'), 310 | regularizer=keras.regularizers.get(None), 311 | constraint=keras.constraints.get(None)) 312 | 313 | def call(self, inputs): 314 | """Forward pass through the DualDense layer""" 315 | # Pass through two linear maps 316 | output = K.dot(inputs, self.kernel) + self.bias 317 | 318 | # Scaled tanh nonlinearity 319 | output = K.tanh(output) * self.factor 320 | 321 | # Sum over the two channels of dual-dense layer 322 | output = K.sum(output, axis=-1) 323 | 324 | # Apply optional output activation 325 | return self.activation(output) 326 | 327 | 328 | ############################################################################### 329 | # Custom embedding initializer 330 | ############################################################################### 331 | 332 | 333 | def sample_rate_embedding_initializer(shape, dtype=None): 334 | """Initializer for the sample-rate feature embedding table""" 335 | # Get output shape 336 | shape = (np.prod(shape[:-1]), shape[-1]) 337 | 338 | # Initialize as uniform noise in [-sqrt(3), sqrt(3)] 339 | weights = np.random.uniform(-1.7321, 1.7321, shape) 340 | 341 | # Add a unique offset to each weight such that the embedding 342 | # table is encouraged to be ordered 343 | line = np.arange(-.5 * shape[0] + .5, .5 * shape[0] - .4) 344 | line *= math.sqrt(12) / shape[0] 
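    # The offsets form a zero-centered ramp with step sqrt(12) / N over the
    # N embedding rows, spanning roughly [-sqrt(3), sqrt(3)], so each row is
    # biased toward a distinct, monotonically increasing mean value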
345 |     return weights + np.reshape(line, (shape[0], 1))
346 | 
-------------------------------------------------------------------------------- /clpcnet/mp3.py: -------------------------------------------------------------------------------- 1 | import argparse
2 | import glob
3 | import multiprocessing as mp
4 | import os
5 | import shutil
6 | import subprocess
7 | from pathlib import Path
8 | 
9 | 
10 | ###############################################################################
11 | # Convert audio to mp3
12 | ###############################################################################
13 | 
14 | 
15 | def convert_file(input_file, output_file=None, verbose=False):
16 |     """Convert audio file to mp3"""
17 |     # Default output filename is same as input but with MP3 extension;
18 |     # this must be resolved before the hyphen handling below reads
19 |     # output_file.stem
20 |     if output_file is None:
21 |         output_file = input_file.with_suffix('.mp3')
22 | 
23 |     # Handle input files starting with hyphen
24 |     clean_input = False
25 |     if input_file.stem.startswith('-'):
26 |         dummy_file = input_file.parent / input_file.name[1:]
27 |         shutil.copyfile(input_file, dummy_file)
28 |         input_file = dummy_file
29 |         clean_input = True
30 | 
31 |     # Handle output files starting with hyphen
32 |     clean_output = False
33 |     if output_file.stem.startswith('-'):
34 |         output_file = output_file.parent / output_file.name[1:]
35 |         clean_output = True
36 | 
37 |     # Convert
38 |     args = [
39 |         'ffmpeg',
40 |         '-y',
41 |         '-i',
42 |         str(input_file),
43 |         '-b:a',
44 |         '320k',
45 |         str(output_file)]
46 |     process = subprocess.Popen(
47 |         args,
48 |         stdout=subprocess.PIPE,
49 |         stderr=subprocess.PIPE,
50 |         universal_newlines=True)
51 |     stdout, stderr = process.communicate()
52 | 
53 |     # Maybe print
54 |     if verbose or process.returncode != 0:
55 |         print(stdout)
56 |         print(stderr)
57 | 
58 |     # Clean-up input files starting with hyphen
59 |     if clean_input:
60 |         os.remove(input_file)
61 | 
62 |     # Clean-up output files starting with hyphen
63 |     if clean_output:
64 |         os.replace(output_file, output_file.parent / ('-' + output_file.name))
65 | 
66 | 
67 | def convert_files(input_files, output_files=None):
68 |     """Convert audio files to mp3"""
69 |     # Convert to paths
70 |     input_files = [Path(file) for file in input_files]
71 | 
72 |     # Default output filename is same as input but with MP3 extension
73 |     if output_files is None:
74 |         output_files = [file.with_suffix('.mp3') for file in input_files]
75 | 
76 |     # Multiprocess conversion
77 |     with mp.Pool() as pool:
78 |         pool.starmap(convert_file, zip(input_files, output_files))
79 | 
80 |     # for input_file, output_file in zip(input_files, output_files):
81 |     #     convert_file(input_file, output_file)
82 | 
83 | 
84 | ###############################################################################
85 | # Entry point
86 | ###############################################################################
87 | 
88 | 
89 | def expand_files(files):
90 |     """Expands a wildcard to a list of paths for Windows compatibility"""
91 |     # Split at whitespace
92 |     files = files.split()
93 | 
94 |     # Handle wildcard expansion
95 |     if len(files) == 1 and '*' in files[0]:
96 |         files = glob.glob(files[0])
97 | 
98 |     # Return file list; conversion to Path happens in convert_files
99 |     return files
100 | 
101 | 
102 | def parse_args():
103 |     """Parse command-line arguments"""
104 |     parser = argparse.ArgumentParser()
105 | 
106 |     # Handle wildcards across platforms
107 |     if os.name == 'nt':
108 |         parser.add_argument(
109 |             '--input_files',
110 |             type=expand_files,
111 |             help='The audio files to convert to mp3')
112 |     else:
113 |         parser.add_argument(
114 |             '--input_files',
nargs='+', 114 | help='The audio files to convert to mp3') 115 | 116 | parser.add_argument( 117 | '--output_files', 118 | type=Path, 119 | nargs='+', 120 | help='The corresponding output files. ' + 121 | 'Uses same filename with mp3 extension by default') 122 | return parser.parse_args() 123 | 124 | 125 | if __name__ == '__main__': 126 | convert_files(**vars(parse_args())) 127 | -------------------------------------------------------------------------------- /clpcnet/partition.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import itertools 4 | import json 5 | import random 6 | from pathlib import Path 7 | 8 | import tqdm 9 | 10 | import clpcnet 11 | 12 | 13 | ############################################################################### 14 | # Partition 15 | ############################################################################### 16 | 17 | 18 | def daps_segmented(directory): 19 | """Partition daps-segmented dataset""" 20 | files = list(directory.rglob('*.wav')) 21 | 22 | # Get files corresponding to each selected speaker 23 | speaker_files = { 24 | s: [f for f in files if f.stem.split('_')[0] == s] 25 | for s in ['f1', 'f3', 'f4', 'f5', 'f6', 'm1', 'm3', 'm4', 'm5', 'm6']} 26 | 27 | # Deterministic but random selection 28 | random.seed(0) 29 | test_files = itertools.chain( 30 | *[random.sample(f, 10) for f in speaker_files.values()]) 31 | 32 | return {'test': [f.stem for f in test_files]} 33 | 34 | 35 | def ravdess_hifi(directory): 36 | """Partition ravdess dataset""" 37 | partition_file = clpcnet.ASSETS_DIR / 'partition' / 'ravdess-variable.json' 38 | with open(partition_file) as file: 39 | pairs = json.load(file)['test'] 40 | stems = set(list(itertools.chain(*pairs))) 41 | return {'test': list(stems)} 42 | 43 | 44 | def ravdess_variable(directory, gpu=None): 45 | """Partition ravdess dataset into prosody transfer pairs""" 46 | pairs = [] 47 | generator = clpcnet.evaluate.prosody.ravdess_generator(directory, gpu) 48 | for transfer in tqdm.tqdm(generator): 49 | pairs.append(transfer.name.split('_')) 50 | return {'test': pairs} 51 | 52 | 53 | def vctk(directory, rejects=['p341_101']): 54 | """Partition vctk dataset""" 55 | # Load speaker info 56 | with open(directory / 'speaker-info.txt') as file: 57 | lines = file.readlines() 58 | speakers = [VCTKSpeaker(line) for line in lines[1:]] 59 | 60 | # Filter out speakers where mic 2 is not available 61 | speakers = [s for s in speakers if s.id not in ['p280', 'p315']] 62 | 63 | # Shuffle speakers 64 | random.seed(0) 65 | random.shuffle(speakers) 66 | 67 | # Partition speakers 68 | male = [s.id for s in speakers if s.gender == 'M'] 69 | female = [s.id for s in speakers if s.gender == 'F'] 70 | train_speaker = male[:-4] + female[:-4] 71 | test_speaker = male[-4:] + female[-4:] 72 | 73 | # Get file lists relative to root directory 74 | text_directory = directory / 'txt' 75 | train_files = chain_list_files(text_directory, train_speaker) 76 | test_files = chain_list_files(text_directory, test_speaker) 77 | 78 | # Require mic 2 be available 79 | train_files = vctk_mic_check(train_files) 80 | test_files = vctk_mic_check(test_files) 81 | 82 | # Move some train files to a separate test partition of seen speakers 83 | test_seen_speaker = male[:10] + female[:10] 84 | test_seen_files = [ 85 | random.sample([f for f in train_files 86 | if s in f.stem and f.stem not in rejects], 5) 87 | for s in test_seen_speaker] 88 | test_seen_files = 
list(itertools.chain(*test_seen_files)) 89 | train_files = [f for f in train_files if f not in test_seen_files] 90 | 91 | # Pack partition dictionary 92 | return { 93 | 'train': sorted([f.stem for f in train_files 94 | if f.stem not in rejects]), 95 | 'test': sorted([f.stem for f in test_files 96 | if f.stem not in rejects]), 97 | 'test-seen': sorted([f.stem for f in test_seen_files])} 98 | 99 | 100 | ############################################################################### 101 | # Utilities 102 | ############################################################################### 103 | 104 | 105 | class VCTKSpeaker: 106 | 107 | def __init__(self, line): 108 | line = self.strip_comment(line) 109 | self.id, _, self.gender = line.split()[:3] 110 | 111 | @staticmethod 112 | def strip_comment(line): 113 | comment_index = line.find('(') 114 | return line[:comment_index] if comment_index != -1 else line 115 | 116 | 117 | def chain_list_files(directory, subdirectories): 118 | """List files in all subdirectories""" 119 | return list(itertools.chain( 120 | *[(directory / sd).glob('**/*') for sd in subdirectories])) 121 | 122 | 123 | def vctk_mic_check(files): 124 | """Filter files by whether mic 2 is available""" 125 | directory = files[0].parent.parent.parent / 'wav48_silence_trimmed' 126 | result = [] 127 | for file in files: 128 | speaker = file.parent.name 129 | if (directory / speaker / f'{file.stem}_mic2.flac').exists(): 130 | result.append(file) 131 | return result 132 | 133 | 134 | ############################################################################### 135 | # Entry point 136 | ############################################################################### 137 | 138 | 139 | def main(): 140 | """Partition dataset""" 141 | # Parse command-line arguments 142 | args = parse_args() 143 | 144 | # Get partitioning function 145 | if args.dataset == 'daps-segmented': 146 | partition_fn = daps_segmented 147 | elif args.dataset == 'ravdess-hifi': 148 | partition_fn = ravdess_hifi 149 | elif args.dataset == 'ravdess-variable': 150 | partition_fn = functools.partial(ravdess_variable, gpu=args.gpu) 151 | elif args.dataset == 'vctk': 152 | partition_fn = vctk 153 | else: 154 | raise ValueError(f'No dataset {args.dataset}') 155 | 156 | # Partition 157 | partition = partition_fn(args.directory) 158 | 159 | # Save to disk 160 | with open(clpcnet.data.partition_file(args.dataset), 'w') as file: 161 | json.dump(partition, file, indent=4) 162 | 163 | 164 | def parse_args(): 165 | """Parse command-line arguments""" 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument( 168 | '--dataset', 169 | default='vctk', 170 | help='The name of the dataset') 171 | parser.add_argument( 172 | '--directory', 173 | type=Path, 174 | default=clpcnet.DATA_DIR, 175 | help='The data directory') 176 | parser.add_argument( 177 | '--gpu', 178 | type=int, 179 | default=None, 180 | help='The gpu to use') 181 | 182 | # Extend directory with dataset name 183 | args = parser.parse_args() 184 | dataset = \ 185 | 'ravdess-hifi' if args.dataset == 'ravdess-variable' else args.dataset 186 | args.directory = args.directory / dataset 187 | 188 | return args 189 | 190 | 191 | if __name__ == '__main__': 192 | main() 193 | -------------------------------------------------------------------------------- /clpcnet/pitch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tempfile 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pypar 7 | import torch 
8 | import torchcrepe
9 | import tqdm
10 | 
11 | import clpcnet
12 | 
13 | 
14 | ###############################################################################
15 | # Pitch methods
16 | ###############################################################################
17 | 
18 | 
19 | def crepe(audio, gpu=None):
20 |     """Preprocess crepe pitch from audio"""
21 |     # Highpass
22 |     audio = clpcnet.preprocess.highpass(audio)
23 | 
24 |     # Estimate pitch
25 |     pitch, periodicity = torchcrepe.predict(
26 |         torch.tensor(audio.copy(), dtype=torch.float)[None],
27 |         sample_rate=clpcnet.SAMPLE_RATE,
28 |         fmin=clpcnet.FMIN,
29 |         fmax=clpcnet.FMAX,
30 |         model='full',
31 |         return_periodicity=True,
32 |         batch_size=1024,
33 |         device='cpu' if gpu is None else f'cuda:{gpu}')
34 | 
35 |     # Detach from graph
36 |     pitch = pitch.cpu().squeeze().numpy()
37 |     periodicity = periodicity.cpu().squeeze().numpy()
38 | 
39 |     # Set low energy frames to unvoiced
40 |     periodicity[clpcnet.loudness.a_weighted(audio) < -60.] = 0.
41 | 
42 |     return pitch, periodicity
43 | 
44 | 
45 | def yin(audio):
46 |     """Preprocess yin pitch from audio"""
47 |     with tempfile.TemporaryDirectory() as directory:
48 |         prefix = Path(directory) / 'tmp'
49 | 
50 |         # Preprocess and save to disk
51 |         clpcnet.preprocess.from_audio_to_file(audio, prefix)
52 | 
53 |         # Load features
54 |         features = clpcnet.load.features(f'{prefix}-frames.f32')
55 | 
56 |         # Extract pitch and periodicity
57 |         pitch = features[0, :, clpcnet.PITCH_IDX]
58 |         periodicity = features[0, :, clpcnet.CORRELATION_IDX]
59 | 
60 |         # Convert to Hz
61 |         pitch = clpcnet.convert.epochs_to_hz(pitch)
62 | 
63 |         # Bound
64 |         pitch[pitch > clpcnet.FMAX] = clpcnet.FMAX
65 |         pitch[pitch < clpcnet.FMIN] = clpcnet.FMIN
66 | 
67 |         # Scale periodicity to [0, 1]
68 |         return pitch, (periodicity + .4) / .8
69 | 
70 | 
71 | ###############################################################################
72 | # Interface
73 | ###############################################################################
74 | 
75 | 
76 | def from_audio(audio, gpu=None):
77 |     """Preprocess pitch from audio"""
78 |     if clpcnet.ABLATE_CREPE:
79 |         return yin(audio)
80 |     return crepe(audio, gpu)
81 | 
82 | 
83 | def from_audio_to_file(audio, prefix, gpu=None):
84 |     """Perform pitch estimation on audio and save to disk"""
85 |     # Perform pitch estimation
86 |     pitch, periodicity = from_audio(audio, gpu)
87 | 
88 |     # Save to disk
89 |     np.save(f'{prefix}-pitch.npy', pitch)
90 |     np.save(f'{prefix}-periodicity.npy', periodicity)
91 | 
92 | 
93 | def from_dataset_to_files(dataset,
94 |                           directory,
95 |                           cache,
96 |                           gpu=None):
97 |     """Perform pitch estimation on dataset and save to disk"""
98 |     # Get filenames
99 |     files = clpcnet.data.files(dataset, directory, 'train')
100 | 
101 |     # Get prefixes
102 |     prefixes = [
103 |         cache / f'{clpcnet.data.file_to_stem(dataset, file)}-r100'
104 |         for file in files]
105 | 
106 |     # Perform pitch estimation
107 |     from_files_to_files(files, prefixes, gpu)
108 | 
109 | 
110 | def from_file(file, gpu=None):
111 |     """Preprocess crepe pitch from file"""
112 |     # Load and estimate pitch
113 |     return from_audio(clpcnet.load.audio(file), gpu)
114 | 
115 | 
116 | def from_file_to_file(file, prefix, gpu=None):
117 |     """Preprocess crepe pitch from file and save to disk"""
118 |     pitch, periodicity = from_file(file, gpu)
119 |     np.save(f'{prefix}-pitch.npy', pitch)
120 |     np.save(f'{prefix}-periodicity.npy', periodicity)
121 | 
122 | 
123 | def from_files_to_files(files, prefixes, gpu=None):
124 |     """Preprocess pitch from files and save to disk"""
125 | 
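    # Each file is loaded, its pitch and periodicity are estimated via
    # from_file, and the results are saved as <prefix>-pitch.npy and
    # <prefix>-periodicity.npy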
iterator = zip(files, prefixes) 126 | iterator = tqdm.tqdm(iterator, desc='pitch estimation', dynamic_ncols=True) 127 | for file, prefix in iterator: 128 | from_file_to_file(file, prefix, gpu) 129 | 130 | 131 | ############################################################################### 132 | # Utilities 133 | ############################################################################### 134 | 135 | 136 | def align(source, target, source_alignment, target_alignment): 137 | """Align target pitch with source by inverting the alignment""" 138 | # Get relative rates for each frame 139 | rates = pypar.compare.per_frame_rate(source_alignment, 140 | target_alignment, 141 | clpcnet.SAMPLE_RATE, 142 | clpcnet.HOPSIZE) 143 | 144 | # Get interpolation indices 145 | indices = np.cumsum(np.array(rates)) 146 | 147 | # Interpolate 148 | return np.interp(indices, np.arange(len(target)), target) 149 | 150 | 151 | def threshold(pitch, periodicity): 152 | """Threshold pitch via periodicity contour""" 153 | return torchcrepe.threshold.Hysteresis()( 154 | torch.tensor(pitch)[None], 155 | torch.tensor(periodicity)[None]).squeeze().numpy() 156 | 157 | 158 | ############################################################################### 159 | # Entry point 160 | ############################################################################### 161 | 162 | 163 | def parse_args(): 164 | """Parse command-line arguments""" 165 | parser = argparse.ArgumentParser() 166 | parser.add_argument( 167 | '--dataset', 168 | default='vctk', 169 | help='The dataset to perform pitch tracking on') 170 | parser.add_argument( 171 | '--directory', 172 | type=Path, 173 | default=clpcnet.DATA_DIR, 174 | help='The data directory') 175 | parser.add_argument( 176 | '--cache', 177 | type=Path, 178 | default=clpcnet.CACHE_DIR, 179 | help='The cache directory') 180 | parser.add_argument( 181 | '--gpu', 182 | type=int, 183 | default=None, 184 | help='The gpu to use for pitch tracking') 185 | 186 | # Extend directories with dataset name 187 | args = parser.parse_args() 188 | args.directory = args.directory / args.dataset 189 | args.cache = args.cache / args.dataset 190 | 191 | return args 192 | 193 | 194 | if __name__ == '__main__': 195 | from_dataset_to_files(**vars(parse_args())) 196 | -------------------------------------------------------------------------------- /clpcnet/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /clpcnet/preprocess/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import clpcnet 5 | 6 | 7 | ############################################################################### 8 | # Entry point 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | '--dataset', 17 | default='vctk', 18 | help='The dataset to preprocess') 19 | parser.add_argument( 20 | '--directory', 21 | type=Path, 22 | default=clpcnet.DATA_DIR, 23 | help='The data directory') 24 | parser.add_argument( 25 | '--cache', 26 | type=Path, 27 | default=clpcnet.CACHE_DIR, 28 | help='The cache directory') 29 | 30 | # Extend directories with dataset name 31 | args = parser.parse_args() 32 | args.directory = args.directory / 
args.dataset 33 | args.cache = args.cache / args.dataset 34 | 35 | return args 36 | 37 | 38 | if __name__ == '__main__': 39 | clpcnet.preprocess.from_dataset_to_files(**vars(parse_args())) 40 | -------------------------------------------------------------------------------- /clpcnet/preprocess/augment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing as mp 3 | import os 4 | import random 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import soundfile 9 | import tqdm 10 | 11 | import clpcnet 12 | 13 | 14 | ############################################################################### 15 | # Constants 16 | ############################################################################### 17 | 18 | 19 | ALLOWED_SCALES = [50, 67, 75, 80, 125, 133, 150, 200] 20 | DATASET = 'vctk' 21 | PASSES = 8 22 | 23 | 24 | ############################################################################### 25 | # Data augmentation 26 | ############################################################################### 27 | 28 | 29 | def dataset(dataset=DATASET, 30 | directory=clpcnet.DATA_DIR / DATASET, 31 | cache=clpcnet.CACHE_DIR / DATASET, 32 | allowed_scales=ALLOWED_SCALES, 33 | passes=PASSES, 34 | gpu=None): 35 | """Perform data augmentation for a given dataset""" 36 | # Compute the current histogram from pitch files in cache and determine 37 | # for each example which scales have been used 38 | counts, scales = count_cache(dataset, cache) 39 | 40 | # Get list of audio files 41 | files = clpcnet.data.files(dataset, directory, 'train') 42 | random.seed(0) 43 | random.shuffle(files) 44 | 45 | # Preprocessing workers 46 | feature_pool = mp.Pool(min(os.cpu_count() - 1, 2)) 47 | pitch_pool = mp.Pool(1) 48 | 49 | # Iterate over dataset 50 | for i in range(passes): 51 | iterator = tqdm.tqdm(files, 52 | dynamic_ncols=True, 53 | desc=f'augmentation pass {i}') 54 | for file in iterator: 55 | 56 | # Load pitch 57 | stem = clpcnet.data.file_to_stem(dataset, file) 58 | pitch = np.load(cache / f'{stem}-r100-pitch.npy') 59 | periodicity = np.load(cache / f'{stem}-r100-periodicity.npy') 60 | 61 | # Threshold pitch 62 | pitch = clpcnet.pitch.threshold(pitch, periodicity) 63 | 64 | # Select scale to use that maximizes entropy 65 | scale, counts = select_scale(pitch[~np.isnan(pitch)], 66 | counts, 67 | allowed_scales, 68 | scales[stem]) 69 | 70 | # No unused scale for this file 71 | if scale is None: 72 | continue 73 | 74 | # Load audio 75 | audio, sample_rate = soundfile.read(file) 76 | 77 | # Scale audio 78 | scaled = clpcnet.preprocess.resample(audio, 79 | (scale / 100.) * sample_rate, 80 | sample_rate) 81 | 82 | # Resample to lpcnet sample rate 83 | scaled = clpcnet.preprocess.resample(scaled, sample_rate) 84 | 85 | # Preprocess 86 | prefix = f'{cache / stem}-r{scale:03}' 87 | feature_pool.apply_async(clpcnet.preprocess.from_audio_to_file, 88 | (scaled, prefix)) 89 | pitch_pool.apply_async(clpcnet.pitch.from_audio_to_file, 90 | (scaled, prefix, gpu)) 91 | # clpcnet.pitch.from_audio_to_file(scaled, prefix, gpu) 92 | 93 | # Mark scale as used 94 | scales[stem].append(scale) 95 | 96 | # Close worker pools 97 | feature_pool.close() 98 | pitch_pool.close() 99 | 100 | # Wait for preprocessing to finish 101 | feature_pool.join() 102 | pitch_pool.join() 103 | 
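# The two resample calls above implement pitch-shifting by relabeling:
# treating audio as if recorded at (scale / 100) * sample_rate and resampling
# back multiplies pitch by scale / 100 and duration by 100 / scale. A
# standalone sketch of the same identity (the sine test signal is an
# illustrative assumption):
#
#     import numpy as np
#     import resampy
#     sr = 16000
#     tone = np.sin(2 * np.pi * 220. * np.arange(sr) / sr)  # 220 Hz, 1 second
#     shifted = resampy.resample(tone, 2. * sr, sr)         # scale = 200
#     # shifted is a ~440 Hz tone lasting ~0.5 seconds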
104 | 105 | ############################################################################### 106 | # Utilities 107 | ############################################################################### 108 | 109 | 110 | def count_cache(dataset, cache): 111 | """Compute pitch histogram and used scales of examples in cache""" 112 | counts = np.zeros(clpcnet.PITCH_BINS, dtype=int) 113 | scales = {} 114 | 115 | # Loop over pitch files 116 | for file in cache.glob('*-pitch.npy'): 117 | 118 | # Load pitch 119 | pitch = np.load(file) 120 | periodicity = np.load(str(file).replace('-pitch.npy', 121 | '-periodicity.npy')) 122 | 123 | # Add pitch to histogram 124 | counts += count_pitch(clpcnet.pitch.threshold(pitch, periodicity)) 125 | 126 | # Add scale to used set 127 | stem = file.stem[:-11]  # Strip the '-rXXX-pitch' suffix 128 | if stem not in scales: 129 | scales[stem] = [] 130 | scales[stem].append(int(file.stem[-9:-6]))  # The three scale digits 131 | 132 | return counts, scales 133 | 134 | 135 | def count_pitch(pitch): 136 | """Compute pitch histogram on pitch in Hz""" 137 | bins = clpcnet.convert.hz_to_bins(pitch[~np.isnan(pitch)]) 138 | return np.bincount(bins, minlength=clpcnet.PITCH_BINS) 139 | 140 | 141 | def entropy(counts): 142 | """Compute the entropy of the categorical distribution defined by counts""" 143 | # Compute categorical distribution parameters 144 | distribution = counts / counts.sum(keepdims=True) 145 | 146 | # Compute entropy contribution of each category 147 | contribution = distribution * np.log2(distribution) 148 | contribution[np.isnan(contribution)] = 0. 149 | 150 | return - (1. / np.log2(len(distribution))) * contribution.sum() 151 | 152 | 153 | def scale_pitch(pitch, scale): 154 | """Scale pitch by scale factor""" 155 | # Scale 156 | scale_min = clpcnet.FMIN / pitch.min() 157 | scale_max = clpcnet.FMAX / pitch.max() 158 | scale = scale_min if scale < scale_min else scale 159 | scale = scale_max if scale > scale_max else scale 160 | pitch = scale * pitch.copy() 161 | 162 | # Interpolate 163 | scaled = np.interp(np.arange(0, len(pitch), scale), 164 | np.arange(len(pitch)), 165 | pitch) 166 | 167 | return scaled, int(100 * scale) 168 | 169 | 170 | def select_scale(pitch, counts, allowed_scales, used_scales): 171 | """ 172 | Shift the pitch by each allowed scale and select the scale whose updated 173 | pitch histogram has maximal entropy. If a scale moves the pitch outside 174 | (50, 550), the closest in-range scale is used instead. Scales already used for this file are skipped. 175 | """ 176 | best_entropy, best_scale, best_counts = None, None, counts 177 | for scale in set(allowed_scales) - set(used_scales): 178 | 179 | # Scale pitch 180 | scaled, scale = scale_pitch(pitch, scale / 100.) 181 | 182 | # If scale was clipped, make sure we can still use it 183 | if scale in used_scales: 184 | continue 185 | 186 | # Get pitch histogram 187 | scale_counts = counts + count_pitch(scaled) 188 | 189 | # Measure entropy for this scale 190 | scale_entropy = entropy(scale_counts) 191 | 192 | # Select scale if it maximizes entropy; track the best histogram 193 | # separately so earlier candidates do not skew later comparisons 194 | if best_entropy is None or scale_entropy > best_entropy: 195 | best_entropy, best_scale = scale_entropy, scale 196 | best_counts = scale_counts 197 | 198 | return best_scale, best_counts 199 | 
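# A worked micro-example of the entropy criterion in select_scale (array
# values are illustrative): starting from counts = [2, 0], a candidate that
# adds [0, 2] yields the distribution [.5, .5] (normalized entropy 1.0),
# while one that adds [2, 0] yields [1., 0.] (entropy 0.0), so the candidate
# filling the underpopulated pitch bin is selected:
#
#     import numpy as np
#     entropy(np.array([2, 0]) + np.array([0, 2]))  # 1.0
#     entropy(np.array([2, 0]) + np.array([2, 0]))  # 0.0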
200 | 201 | ############################################################################### 202 | # Entry point 203 | ############################################################################### 204 | 205 | 206 | def parse_args(): 207 | """Parse command-line arguments""" 208 | parser = argparse.ArgumentParser() 209 | 210 | parser.add_argument( 211 | '--dataset', 212 | default=DATASET, 213 | help='The name of the dataset') 214 | parser.add_argument( 215 | '--directory', 216 | default=clpcnet.DATA_DIR, 217 | type=Path, 218 | help='The data directory') 219 | parser.add_argument( 220 | '--cache', 221 | default=clpcnet.CACHE_DIR, 222 | type=Path, 223 | help='The cache directory') 224 | parser.add_argument( 225 | '--allowed_scales', 226 | nargs='+', 227 | type=float, 228 | default=ALLOWED_SCALES, 229 | help='The allowable scale values for resampling') 230 | parser.add_argument( 231 | '--passes', 232 | type=int, 233 | default=PASSES, 234 | help='The number of augmentation passes to make over the dataset') 235 | parser.add_argument( 236 | '--gpu', 237 | type=int, 238 | default=None, 239 | help='The index of the gpu to use') 240 | 241 | # Extend directories with dataset name 242 | args = parser.parse_args() 243 | args.directory = args.directory / args.dataset 244 | args.cache = args.cache / args.dataset 245 | 246 | return args 247 | 248 | 249 | if __name__ == '__main__': 250 | dataset(**vars(parse_args())) 251 | -------------------------------------------------------------------------------- /clpcnet/preprocess/core.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import subprocess 3 | import tempfile 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import resampy 8 | import scipy.signal 9 | 10 | import clpcnet 11 | 12 | 13 | __all__ = ['from_audio', 14 | 'from_audio_to_file', 15 | 'from_dataset_to_files', 16 | 'from_file_to_file', 17 | 'from_files_to_files', 18 | 'clip', 19 | 'highpass', 20 | 'pad', 21 | 'preemphasis', 22 | 'resample'] 23 | 24 | 25 | ############################################################################### 26 | # Preprocessing transforms 27 | ############################################################################### 28 | 29 | 30 | def clip(audio, threshold=.99): 31 | """Scale audio so its peak does not exceed threshold""" 32 | maximum = np.abs(audio).max() 33 | return audio * threshold / maximum if maximum > threshold else audio 34 | 35 | 36 | def highpass(audio, sample_rate=clpcnet.SAMPLE_RATE, cutoff=65., order=5): 37 | """Highpass audio""" 38 | # Get filter coefficients 39 | b, a = scipy.signal.butter( 40 | order, cutoff / (sample_rate / 2), btype='high') 41 | 42 | # Filter 43 | return scipy.signal.filtfilt(b, a, audio) 44 | 45 | 46 | def pad(audio): 47 | """Pad the audio to be a multiple of the block size""" 48 | padding = 2 * clpcnet.BLOCK_SIZE - (audio.size % clpcnet.BLOCK_SIZE) 49 | return np.pad(audio, (clpcnet.HOPSIZE // 2, padding)) 50 | 51 | 
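# The function below is the one-tap preemphasis filter
# y[n] = x[n] - coefficient * x[n - 1]; LPCNet-style vocoders undo it at
# synthesis time with the matching one-pole deemphasis filter. An equivalent
# vectorized sketch using scipy.signal (imported above):
#
#     emphasized = scipy.signal.lfilter([1., -coefficient], [1.], audio)
#     restored = scipy.signal.lfilter([1.], [1., -coefficient], emphasized)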
52 | def preemphasis(audio, coefficient=clpcnet.PREEMPHASIS_COEF): 53 | """Apply preemphasis filter""" 54 | result = np.zeros_like(audio) 55 | memory = 0. 56 | for i in range(len(audio)): 57 | result[i] = audio[i] + memory 58 | memory = -coefficient * audio[i] 59 | return result 60 | 61 | 62 | def resample(audio, sample_rate, target_rate=clpcnet.SAMPLE_RATE): 63 | """Resample audio""" 64 | if sample_rate != target_rate: 65 | return resampy.resample(audio, sample_rate, target_rate) 66 | return audio 67 | 68 | 69 | ############################################################################### 70 | # Preprocess data 71 | ############################################################################### 72 | 73 | 74 | def from_dataset_to_files(dataset, directory, cache): 75 | """Preprocess dataset""" 76 | # Get filenames 77 | files = clpcnet.data.files(dataset, directory, 'train') 78 | 79 | # Get prefixes 80 | prefixes = [ 81 | cache / f'{clpcnet.data.file_to_stem(dataset, file)}-r100' 82 | for file in files] 83 | 84 | # Create cache 85 | cache.mkdir(exist_ok=True, parents=True) 86 | 87 | # Preprocess audio files 88 | clpcnet.preprocess.from_files_to_files(files, prefixes) 89 | 90 | 91 | def from_audio(audio): 92 | """Preprocess audio""" 93 | # Preprocess to a file in a temporary directory 94 | with tempfile.TemporaryDirectory() as directory: 95 | prefix = Path(directory) / 'features' 96 | 97 | # Preprocess 98 | from_audio_to_file(audio, prefix) 99 | 100 | # Load features 101 | return clpcnet.load.features(f'{prefix}-frames.f32') 102 | 103 | 104 | def from_audio_to_file(audio, prefix): 105 | """Preprocess audio and save to disk""" 106 | # Get number of frames before padding 107 | frames = 1 + int(len(audio) // clpcnet.HOPSIZE) 108 | 109 | # Transform 110 | audio = clpcnet.loudness.limit(preemphasis(highpass(pad(audio)))) 111 | 112 | # Convert to 16-bit int 113 | audio = (audio * clpcnet.MAX_SAMPLE_VALUE).astype(np.int16) 114 | 115 | # Write audio to temporary storage and preprocess 116 | with tempfile.TemporaryDirectory() as directory: 117 | file = Path(directory) / 'audio.s16' 118 | 119 | # Save to disk 120 | audio.tofile(file) 121 | 122 | # Preprocess from file 123 | from_binary_file_to_file(file, prefix, frames) 124 | 125 | 126 | def from_file_to_file(file, prefix): 127 | """Load, preprocess, and save to disk""" 128 | from_audio_to_file(clpcnet.load.audio(file), prefix) 129 | 130 | 131 | def from_files_to_files(files, prefixes): 132 | """Load, preprocess, and save many files""" 133 | with mp.Pool() as pool: 134 | pool.starmap(from_file_to_file, zip(files, prefixes)) 135 | 136 | 137 | ############################################################################### 138 | # Utilities 139 | ############################################################################### 140 | 141 | 142 | def from_binary_file_to_file(file, prefix, frames): 143 | """Preprocess from binary s16 file""" 144 | # Write intermediate output to temporary file 145 | with tempfile.TemporaryDirectory() as directory: 146 | frame_file = f'{directory}-frames.f32' 147 | sample_file = f'{directory}-samples.u8' 148 | 149 | # Preprocess in C 150 | args = [str(Path(__file__).parent.parent.parent / 'bin' / 'preprocess'), 151 | str(file), 152 | frame_file, 153 | sample_file] 154 | subprocess.Popen(args).wait() 155 | 156 | # Truncate to original number of frames 157 | features = np.fromfile(frame_file, dtype=np.float32) 158 | features = features[:frames * clpcnet.TOTAL_FEATURE_SIZE] 159 | features.tofile(f'{prefix}-frames.f32') 160 | samples 
= np.fromfile(sample_file, dtype=np.uint8) 161 | samples = samples[:4 * frames * clpcnet.HOPSIZE] 162 | samples.tofile(f'{prefix}-samples.u8') 163 | -------------------------------------------------------------------------------- /clpcnet/session.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import sys 4 | 5 | # Import keras without printing backend 6 | stderr = sys.stderr 7 | sys.stderr = open(os.devnull, 'w') 8 | import keras 9 | sys.stderr = stderr 10 | 11 | import tensorflow as tf 12 | 13 | import clpcnet 14 | 15 | 16 | ############################################################################### 17 | # Tensorflow session management 18 | ############################################################################### 19 | 20 | 21 | class Session: 22 | 23 | def __init__(self, file, gpu=None): 24 | self.file = file 25 | self.gpu = gpu 26 | 27 | # Tensorflow setup 28 | if gpu is None: 29 | config = tf.compat.v1.ConfigProto(device_count={'GPU': 0}) 30 | else: 31 | gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) 32 | config = tf.compat.v1.ConfigProto(gpu_options=gpu_options) 33 | 34 | self.session = tf.compat.v1.Session(config=config) 35 | self.graph = tf.compat.v1.get_default_graph() 36 | 37 | # Keras setup 38 | keras.backend.set_session(self.session) 39 | 40 | # Device management 41 | device = 'CPU' if gpu is None else 'GPU' 42 | number = '0' if gpu is None else str(gpu) 43 | self.device = f'/{device}:{number}' 44 | 45 | # Build LPCNet 46 | model, encoder, decoder = clpcnet.model(use_gpu=gpu is not None) 47 | optimizer = keras.optimizers.Adam(clpcnet.LEARNING_RATE, 48 | amsgrad=True, 49 | decay=clpcnet.WEIGHT_DECAY) 50 | model.compile(optimizer=optimizer, 51 | loss='sparse_categorical_crossentropy', 52 | metrics=['sparse_categorical_accuracy']) 53 | 54 | # Load pretrained weights 55 | model.load_weights(file) 56 | 57 | # Bind model components for inference 58 | self.encoder = encoder 59 | self.decoder = decoder 60 | 61 | @contextlib.contextmanager 62 | def context(self): 63 | """Context manager for tensorflow setup""" 64 | with tf.device(self.device): 65 | with self.graph.as_default(): 66 | yield 67 | -------------------------------------------------------------------------------- /clpcnet/world.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pypar 3 | import pyworld 4 | import scipy 5 | import soundfile 6 | import torch 7 | 8 | import clpcnet 9 | 10 | 11 | ############################################################################### 12 | # WORLD constants 13 | ############################################################################### 14 | 15 | 16 | ALLOWED_RANGE = .8 17 | 18 | 19 | ############################################################################### 20 | # Pitch-shifting and time-stretching with WORLD 21 | ############################################################################### 22 | 23 | 24 | def from_audio(audio, 25 | source_alignment=None, 26 | target_alignment=None, 27 | target_pitch=None, 28 | constant_stretch=None, 29 | constant_shift=None): 30 | """Pitch-shifting and time-stretching with WORLD""" 31 | # World parameterization 32 | audio = audio.squeeze().numpy() 33 | pitch, spectrogram, aperiodicity = analyze(audio) 34 | 35 | # Variable-ratio pitch-shifting 36 | if target_pitch is not None: 37 | target_pitch = target_pitch.squeeze().numpy() 38 | 39 | if (len(target_pitch) != len(pitch) and 40 | 
source_alignment is None and 41 | target_alignment is None): 42 | raise ValueError( 43 | f'Source pitch of length {len(pitch)} incompatible ' + 44 | f'with target pitch of length {len(target_pitch)}.') 45 | pitch = target_pitch.astype(np.float64) 46 | 47 | # Constant-ratio pitch-shifting 48 | if constant_shift is not None: 49 | pitch *= constant_shift 50 | 51 | # Variable-ratio time-stretching 52 | if source_alignment is not None and target_alignment is not None: 53 | 54 | # Align spectrogram and aperiodicity 55 | spectrogram = clpcnet.pitch.align(None, spectrogram, target_alignment, source_alignment) 56 | aperiodicity = clpcnet.pitch.align(None, aperiodicity, target_alignment, source_alignment) 57 | 58 | # Constant-ratio time-stretching 59 | if constant_stretch is not None: 60 | 61 | # Get new duration 62 | duration = len(audio) / clpcnet.SAMPLE_RATE / constant_stretch 63 | 64 | # Stretch features 65 | pitch, spectrogram, aperiodicity = linear_time_stretch( 66 | pitch, spectrogram, aperiodicity, duration) 67 | 68 | # Synthesize using modified parameters 69 | vocoded = pyworld.synthesize(pitch, 70 | spectrogram, 71 | aperiodicity, 72 | clpcnet.SAMPLE_RATE, 73 | clpcnet.HOPSIZE / clpcnet.SAMPLE_RATE * 1000.) 74 | 75 | # Return synthesized audio 76 | return vocoded 77 | 78 | 79 | def from_file_to_file(input_file, 80 | output_file, 81 | source_alignment_file=None, 82 | target_alignment_file=None, 83 | target_pitch_file=None, 84 | constant_stretch=None, 85 | constant_shift=None): 86 | """Perform pitch-shifting and time-stretching with WORLD on files""" 87 | source = torch.tensor(clpcnet.load.audio(input_file))[None] 88 | 89 | # Load source alignment 90 | if source_alignment_file is not None: 91 | source_alignment = pypar.Alignment(source_alignment_file) 92 | else: 93 | source_alignment = None 94 | 95 | # Load target alignment 96 | if target_alignment_file is not None: 97 | target_alignment = pypar.Alignment(target_alignment_file) 98 | else: 99 | target_alignment = None 100 | 101 | # Load target pitch 102 | if target_pitch_file is not None: 103 | target_pitch = torch.tensor(np.load(target_pitch_file))[None] 104 | else: 105 | target_pitch = None 106 | 107 | to_file(source, 108 | output_file, 109 | source_alignment, 110 | target_alignment, 111 | target_pitch, 112 | constant_stretch, 113 | constant_shift) 114 | 115 | 116 | def to_file(source, 117 | output_file, 118 | source_alignment=None, 119 | target_alignment=None, 120 | target_pitch=None, 121 | constant_stretch=None, 122 | constant_shift=None): 123 | """Perform pitch-shifting and time-stretching with WORLD and save""" 124 | vocoded = from_audio(source, 125 | source_alignment, 126 | target_alignment, 127 | target_pitch, 128 | constant_stretch, 129 | constant_shift) 130 | soundfile.write(output_file, vocoded, clpcnet.SAMPLE_RATE) 131 | 132 | 133 | ############################################################################### 134 | # Vocoding utilities 135 | ############################################################################### 136 | 137 | 138 | def analyze(audio): 139 | """Convert an audio signal to WORLD parameter representation 140 | Arguments 141 | audio : np.array(shape=(samples,)) 142 | The audio being analyzed 143 | Returns 144 | pitch : np.array(shape=(frames,)) 145 | The pitch contour 146 | spectrogram : np.array(shape=(frames, channels)) 147 | The audio spectrogram 148 | aperiodicity : np.array(shape=(frames, channels)) 149 | The band aperiodicity envelope 150 | """ 151 | # Cast to double 152 | audio = audio.astype(np.float64) 153 | 154 | # Hopsize in milliseconds 155 | frame_period = clpcnet.HOPSIZE / clpcnet.SAMPLE_RATE * 1000. 156 | 157 | # Pitch 158 | pitch, time = pyworld.dio(audio, 159 | clpcnet.SAMPLE_RATE, 160 | frame_period=frame_period, 161 | f0_floor=clpcnet.FMIN, 162 | f0_ceil=clpcnet.FMAX, 163 | allowed_range=ALLOWED_RANGE) 164 | pitch = pyworld.stonemask(audio, pitch, time, clpcnet.SAMPLE_RATE) 165 | 166 | # Spectrogram 167 | spectrogram = pyworld.cheaptrick(audio, pitch, time, clpcnet.SAMPLE_RATE) 168 | 169 | # Aperiodicity 170 | aperiodicity = pyworld.d4c(audio, pitch, time, clpcnet.SAMPLE_RATE) 171 | 172 | return pitch, spectrogram, aperiodicity 173 | 
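# A minimal analysis/synthesis roundtrip sketch using analyze above (the
# noise input is an illustrative assumption; any mono float array at
# clpcnet.SAMPLE_RATE works):
#
#     audio = np.random.uniform(-1., 1., clpcnet.SAMPLE_RATE)
#     pitch, spectrogram, aperiodicity = analyze(audio)
#     resynthesized = pyworld.synthesize(
#         pitch, spectrogram, aperiodicity, clpcnet.SAMPLE_RATE,
#         clpcnet.HOPSIZE / clpcnet.SAMPLE_RATE * 1000.)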
174 | 175 | def linear_time_stretch(prev_pitch, 176 | prev_spectrogram, 177 | prev_aperiodicity, 178 | duration): 179 | """Apply time stretch in WORLD parameter space 180 | Arguments 181 | prev_pitch : np.array(shape=(frames,)) 182 | The pitch to be stretched 183 | prev_spectrogram : np.array(shape=(frames, frequencies)) 184 | The spectrogram to be stretched 185 | prev_aperiodicity : np.array(shape=(frames, frequencies)) 186 | The aperiodicity to be stretched 187 | duration : float 188 | The new duration in seconds 189 | """ 190 | # Number of frames before and after 191 | prev_frames = len(prev_pitch) 192 | next_frames = clpcnet.convert.seconds_to_frames(duration) 193 | 194 | # Time-aligned grid before and after 195 | prev_grid = np.linspace(0, prev_frames - 1, prev_frames) 196 | next_grid = np.linspace(0, prev_frames - 1, next_frames) 197 | 198 | # Apply time stretch to pitch 199 | pitch = linear_time_stretch_pitch( 200 | prev_pitch, prev_grid, next_grid, next_frames) 201 | 202 | # Allocate spectrogram and aperiodicity buffers 203 | frequencies = prev_spectrogram.shape[1] 204 | spectrogram = np.zeros((next_frames, frequencies)) 205 | aperiodicity = np.zeros((next_frames, frequencies)) 206 | 207 | # Apply time stretch to all channels of spectrogram and aperiodicity 208 | for i in range(frequencies): 209 | spectrogram[:, i] = np.interp( 210 | next_grid, prev_grid, prev_spectrogram[:, i]) 211 | aperiodicity[:, i] = np.interp( 212 | next_grid, prev_grid, prev_aperiodicity[:, i]) 213 | 214 | return pitch, spectrogram, aperiodicity 215 | 216 | 217 | def linear_time_stretch_pitch(pitch, prev_grid, next_grid, next_frames): 218 | """Perform time-stretching on pitch features""" 219 | if (pitch == 0.).all(): 220 | return np.zeros(next_frames) 221 | 222 | # Get unvoiced tokens 223 | unvoiced = pitch == 0. 224 | 225 | # Linearly interpolate unvoiced regions 226 | pitch[unvoiced] = np.interp( 227 | np.where(unvoiced)[0], np.where(~unvoiced)[0], pitch[~unvoiced]) 228 | 229 | # Apply time stretch to pitch 230 | pitch = np.interp(next_grid, prev_grid, pitch) 231 | 232 | # Apply time stretch to unvoiced sequence 233 | unvoiced = np.interp(next_grid, prev_grid, unvoiced) 234 | 235 | # Reapply unvoiced tokens 236 | pitch[unvoiced > .5] = 0. 
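    # WORLD marks unvoiced frames with zero pitch; interpolating them away
    # above keeps zeros from being averaged into voiced values during the
    # stretch, and a stretched frame is marked unvoiced here when more than
    # half of its fractional source frames were unvoiced. For example,
    # stretching [100., 0., 100.] to five frames yields
    # [100., 100., 0., 100., 100.] rather than a contour contaminated by zero.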
237 | 238 | return pitch 239 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/data/.gitkeep -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py==2.10.0 2 | keras==2.3.1 3 | librosa 4 | matplotlib 5 | numpy 6 | protobuf==3.20.1 7 | pyfoal 8 | pypar 9 | pyworld 10 | scipy 11 | soundfile 12 | tensorflow-gpu==1.15 13 | torch 14 | torchcrepe 15 | tqdm 16 | -------------------------------------------------------------------------------- /runs/cache/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/runs/cache/.gitkeep -------------------------------------------------------------------------------- /runs/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/runs/checkpoints/.gitkeep -------------------------------------------------------------------------------- /runs/eval/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/runs/eval/.gitkeep -------------------------------------------------------------------------------- /runs/log/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/runs/log/.gitkeep -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from pkg_resources import parse_requirements 3 | from setuptools import setup 4 | 5 | 6 | with open('README.md') as file: 7 | long_description = file.read() 8 | 9 | 10 | with open(Path(__file__).parent / 'requirements.txt') as file: 11 | requirements = [str(req) for req in parse_requirements(file)] 12 | 13 | 14 | setup( 15 | name='clpcnet', 16 | version='0.0.1', 17 | description='Neural pitch-shifting and time-stretching with controllable lpcnet', 18 | author='Max Morrison', 19 | author_email='maxrmorrison@gmail.com', 20 | url='https://github.com/maxrmorrison/clpcnet', 21 | packages=['clpcnet'], 22 | package_data={'clpcnet': ['assets/*']}, 23 | long_description=long_description, 24 | long_description_content_type='text/markdown', 25 | keywords='speech vocoder prosody pitch-shifting time-stretching lpcnet', 26 | install_requires=requirements) 27 | -------------------------------------------------------------------------------- /src/_kiss_fft_guts.h: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2003-2004, Mark Borgerding 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 
10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 18 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 | POSSIBILITY OF SUCH DAMAGE.*/ 25 | 26 | #ifndef KISS_FFT_GUTS_H 27 | #define KISS_FFT_GUTS_H 28 | 29 | #define MIN(a,b) ((a)<(b) ? (a):(b)) 30 | #define MAX(a,b) ((a)>(b) ? (a):(b)) 31 | 32 | /* kiss_fft.h 33 | defines kiss_fft_scalar as either short or a float type 34 | and defines 35 | typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */ 36 | #include "kiss_fft.h" 37 | 38 | /* 39 | Explanation of macros dealing with complex math: 40 | 41 | C_MUL(m,a,b) : m = a*b 42 | C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise 43 | C_SUB( res, a,b) : res = a - b 44 | C_SUBFROM( res , a) : res -= a 45 | C_ADDTO( res , a) : res += a 46 | * */ 47 | #ifdef FIXED_POINT 48 | #include "arch.h" 49 | 50 | 51 | #define SAMP_MAX 2147483647 52 | #define TWID_MAX 32767 53 | #define TRIG_UPSCALE 1 54 | 55 | #define SAMP_MIN -SAMP_MAX 56 | 57 | 58 | # define S_MUL(a,b) MULT16_32_Q15(b, a) 59 | 60 | # define C_MUL(m,a,b) \ 61 | do{ (m).r = SUB32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \ 62 | (m).i = ADD32_ovflw(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0) 63 | 64 | # define C_MULC(m,a,b) \ 65 | do{ (m).r = ADD32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \ 66 | (m).i = SUB32_ovflw(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0) 67 | 68 | # define C_MULBYSCALAR( c, s ) \ 69 | do{ (c).r = S_MUL( (c).r , s ) ;\ 70 | (c).i = S_MUL( (c).i , s ) ; }while(0) 71 | 72 | # define DIVSCALAR(x,k) \ 73 | (x) = S_MUL( x, (TWID_MAX-((k)>>1))/(k)+1 ) 74 | 75 | # define C_FIXDIV(c,div) \ 76 | do { DIVSCALAR( (c).r , div); \ 77 | DIVSCALAR( (c).i , div); }while (0) 78 | 79 | #define C_ADD( res, a,b)\ 80 | do {(res).r=ADD32_ovflw((a).r,(b).r); (res).i=ADD32_ovflw((a).i,(b).i); \ 81 | }while(0) 82 | #define C_SUB( res, a,b)\ 83 | do {(res).r=SUB32_ovflw((a).r,(b).r); (res).i=SUB32_ovflw((a).i,(b).i); \ 84 | }while(0) 85 | #define C_ADDTO( res , a)\ 86 | do {(res).r = ADD32_ovflw((res).r, (a).r); (res).i = ADD32_ovflw((res).i,(a).i);\ 87 | }while(0) 88 | 89 | #define C_SUBFROM( res , a)\ 90 | do {(res).r = ADD32_ovflw((res).r,(a).r); (res).i = SUB32_ovflw((res).i,(a).i); \ 91 | }while(0) 92 | 93 | #if defined(OPUS_ARM_INLINE_ASM) 94 | #include "arm/kiss_fft_armv4.h" 95 | #endif 96 | 97 | #if defined(OPUS_ARM_INLINE_EDSP) 98 | #include "arm/kiss_fft_armv5e.h" 99 | #endif 100 | #if defined(MIPSr1_ASM) 101 | #include "mips/kiss_fft_mipsr1.h" 102 | #endif 103 | 104 | #else /* not FIXED_POINT*/ 105 | 106 | # define S_MUL(a,b) ( (a)*(b) ) 107 | #define C_MUL(m,a,b) \ 108 | do{ (m).r = 
(a).r*(b).r - (a).i*(b).i;\ 109 | (m).i = (a).r*(b).i + (a).i*(b).r; }while(0) 110 | #define C_MULC(m,a,b) \ 111 | do{ (m).r = (a).r*(b).r + (a).i*(b).i;\ 112 | (m).i = (a).i*(b).r - (a).r*(b).i; }while(0) 113 | 114 | #define C_MUL4(m,a,b) C_MUL(m,a,b) 115 | 116 | # define C_FIXDIV(c,div) /* NOOP */ 117 | # define C_MULBYSCALAR( c, s ) \ 118 | do{ (c).r *= (s);\ 119 | (c).i *= (s); }while(0) 120 | #endif 121 | 122 | #ifndef CHECK_OVERFLOW_OP 123 | # define CHECK_OVERFLOW_OP(a,op,b) /* noop */ 124 | #endif 125 | 126 | #ifndef C_ADD 127 | #define C_ADD( res, a,b)\ 128 | do { \ 129 | CHECK_OVERFLOW_OP((a).r,+,(b).r)\ 130 | CHECK_OVERFLOW_OP((a).i,+,(b).i)\ 131 | (res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \ 132 | }while(0) 133 | #define C_SUB( res, a,b)\ 134 | do { \ 135 | CHECK_OVERFLOW_OP((a).r,-,(b).r)\ 136 | CHECK_OVERFLOW_OP((a).i,-,(b).i)\ 137 | (res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \ 138 | }while(0) 139 | #define C_ADDTO( res , a)\ 140 | do { \ 141 | CHECK_OVERFLOW_OP((res).r,+,(a).r)\ 142 | CHECK_OVERFLOW_OP((res).i,+,(a).i)\ 143 | (res).r += (a).r; (res).i += (a).i;\ 144 | }while(0) 145 | 146 | #define C_SUBFROM( res , a)\ 147 | do {\ 148 | CHECK_OVERFLOW_OP((res).r,-,(a).r)\ 149 | CHECK_OVERFLOW_OP((res).i,-,(a).i)\ 150 | (res).r -= (a).r; (res).i -= (a).i; \ 151 | }while(0) 152 | #endif /* C_ADD defined */ 153 | 154 | #ifdef FIXED_POINT 155 | /*# define KISS_FFT_COS(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * cos (phase)))) 156 | # define KISS_FFT_SIN(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * sin (phase))))*/ 157 | # define KISS_FFT_COS(phase) floor(.5+TWID_MAX*cos (phase)) 158 | # define KISS_FFT_SIN(phase) floor(.5+TWID_MAX*sin (phase)) 159 | # define HALF_OF(x) ((x)>>1) 160 | #elif defined(USE_SIMD) 161 | # define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) ) 162 | # define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) ) 163 | # define HALF_OF(x) ((x)*_mm_set1_ps(.5f)) 164 | #else 165 | # define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase) 166 | # define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase) 167 | # define HALF_OF(x) ((x)*.5f) 168 | #endif 169 | 170 | #define kf_cexp(x,phase) \ 171 | do{ \ 172 | (x)->r = KISS_FFT_COS(phase);\ 173 | (x)->i = KISS_FFT_SIN(phase);\ 174 | }while(0) 175 | 176 | #define kf_cexp2(x,phase) \ 177 | do{ \ 178 | (x)->r = TRIG_UPSCALE*celt_cos_norm((phase));\ 179 | (x)->i = TRIG_UPSCALE*celt_cos_norm((phase)-32768);\ 180 | }while(0) 181 | 182 | #endif /* KISS_FFT_GUTS_H */ 183 | -------------------------------------------------------------------------------- /src/arch.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2003-2008 Jean-Marc Valin 2 | Copyright (c) 2007-2008 CSIRO 3 | Copyright (c) 2007-2009 Xiph.Org Foundation 4 | Written by Jean-Marc Valin */ 5 | /** 6 | @file arch.h 7 | @brief Various architecture definitions for CELT 8 | */ 9 | /* 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions 12 | are met: 13 | 14 | - Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | 17 | - Redistributions in binary form must reproduce the above copyright 18 | notice, this list of conditions and the following disclaimer in the 19 | documentation and/or other materials provided with the distribution. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 25 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | 34 | #ifndef ARCH_H 35 | #define ARCH_H 36 | 37 | #include "opus_types.h" 38 | #include "common.h" 39 | 40 | # if !defined(__GNUC_PREREQ) 41 | # if defined(__GNUC__)&&defined(__GNUC_MINOR__) 42 | # define __GNUC_PREREQ(_maj,_min) \ 43 | ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min)) 44 | # else 45 | # define __GNUC_PREREQ(_maj,_min) 0 46 | # endif 47 | # endif 48 | 49 | #define CELT_SIG_SCALE 32768.f 50 | 51 | #define celt_fatal(str) _celt_fatal(str, __FILE__, __LINE__); 52 | #ifdef ENABLE_ASSERTIONS 53 | #include <stdio.h> 54 | #include <stdlib.h> 55 | #ifdef __GNUC__ 56 | __attribute__((noreturn)) 57 | #endif 58 | static inline void _celt_fatal(const char *str, const char *file, int line) 59 | { 60 | fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str); 61 | abort(); 62 | } 63 | #define celt_assert(cond) {if (!(cond)) {celt_fatal("assertion failed: " #cond);}} 64 | #define celt_assert2(cond, message) {if (!(cond)) {celt_fatal("assertion failed: " #cond "\n" message);}} 65 | #else 66 | #define celt_assert(cond) 67 | #define celt_assert2(cond, message) 68 | #endif 69 | 70 | #define IMUL32(a,b) ((a)*(b)) 71 | 72 | #define MIN16(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 16-bit value. */ 73 | #define MAX16(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 16-bit value. */ 74 | #define MIN32(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 32-bit value. */ 75 | #define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */ 76 | #define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */ 77 | #define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */ 78 | #define UADD32(a,b) ((a)+(b)) 79 | #define USUB32(a,b) ((a)-(b)) 80 | 81 | /* Set this if opus_int64 is a native type of the CPU. */ 82 | /* Assume that all LP64 architectures have fast 64-bit types; also x86_64 83 | (which can be ILP32 for x32) and Win64 (which is LLP64). */ 84 | #if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64) 85 | #define OPUS_FAST_INT64 1 86 | #else 87 | #define OPUS_FAST_INT64 0 88 | #endif 89 | 90 | #define PRINT_MIPS(file) 91 | 92 | #ifdef FIXED_POINT 93 | 94 | typedef opus_int16 opus_val16; 95 | typedef opus_int32 opus_val32; 96 | typedef opus_int64 opus_val64; 97 | 98 | typedef opus_val32 celt_sig; 99 | typedef opus_val16 celt_norm; 100 | typedef opus_val32 celt_ener; 101 | 102 | #define Q15ONE 32767 103 | 104 | #define SIG_SHIFT 12 105 | /* Safe saturation value for 32-bit signals. 
Should be less than 106 | 2^31*(1-0.85) to avoid blowing up on DC at deemphasis.*/ 107 | #define SIG_SAT (300000000) 108 | 109 | #define NORM_SCALING 16384 110 | 111 | #define DB_SHIFT 10 112 | 113 | #define EPSILON 1 114 | #define VERY_SMALL 0 115 | #define VERY_LARGE16 ((opus_val16)32767) 116 | #define Q15_ONE ((opus_val16)32767) 117 | 118 | #define SCALEIN(a) (a) 119 | #define SCALEOUT(a) (a) 120 | 121 | #define ABS16(x) ((x) < 0 ? (-(x)) : (x)) 122 | #define ABS32(x) ((x) < 0 ? (-(x)) : (x)) 123 | 124 | static inline opus_int16 SAT16(opus_int32 x) { 125 | return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x; 126 | } 127 | 128 | #ifdef FIXED_DEBUG 129 | #include "fixed_debug.h" 130 | #else 131 | 132 | #include "fixed_generic.h" 133 | 134 | #ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR 135 | #include "arm/fixed_arm64.h" 136 | #elif OPUS_ARM_INLINE_EDSP 137 | #include "arm/fixed_armv5e.h" 138 | #elif defined (OPUS_ARM_INLINE_ASM) 139 | #include "arm/fixed_armv4.h" 140 | #elif defined (BFIN_ASM) 141 | #include "fixed_bfin.h" 142 | #elif defined (TI_C5X_ASM) 143 | #include "fixed_c5x.h" 144 | #elif defined (TI_C6X_ASM) 145 | #include "fixed_c6x.h" 146 | #endif 147 | 148 | #endif 149 | 150 | #else /* FIXED_POINT */ 151 | 152 | typedef float opus_val16; 153 | typedef float opus_val32; 154 | typedef float opus_val64; 155 | 156 | typedef float celt_sig; 157 | typedef float celt_norm; 158 | typedef float celt_ener; 159 | 160 | #define Q15ONE 1.0f 161 | 162 | #define NORM_SCALING 1.f 163 | 164 | #define EPSILON 1e-15f 165 | #define VERY_SMALL 1e-30f 166 | #define VERY_LARGE16 1e15f 167 | #define Q15_ONE ((opus_val16)1.f) 168 | 169 | /* This appears to be the same speed as C99's fabsf() but it's more portable. */ 170 | #define ABS16(x) ((float)fabs(x)) 171 | #define ABS32(x) ((float)fabs(x)) 172 | 173 | #define QCONST16(x,bits) (x) 174 | #define QCONST32(x,bits) (x) 175 | 176 | #define NEG16(x) (-(x)) 177 | #define NEG32(x) (-(x)) 178 | #define NEG32_ovflw(x) (-(x)) 179 | #define EXTRACT16(x) (x) 180 | #define EXTEND32(x) (x) 181 | #define SHR16(a,shift) (a) 182 | #define SHL16(a,shift) (a) 183 | #define SHR32(a,shift) (a) 184 | #define SHL32(a,shift) (a) 185 | #define PSHR32(a,shift) (a) 186 | #define VSHR32(a,shift) (a) 187 | 188 | #define PSHR(a,shift) (a) 189 | #define SHR(a,shift) (a) 190 | #define SHL(a,shift) (a) 191 | #define SATURATE(x,a) (x) 192 | #define SATURATE16(x) (x) 193 | 194 | #define ROUND16(a,shift) (a) 195 | #define SROUND16(a,shift) (a) 196 | #define HALF16(x) (.5f*(x)) 197 | #define HALF32(x) (.5f*(x)) 198 | 199 | #define ADD16(a,b) ((a)+(b)) 200 | #define SUB16(a,b) ((a)-(b)) 201 | #define ADD32(a,b) ((a)+(b)) 202 | #define SUB32(a,b) ((a)-(b)) 203 | #define ADD32_ovflw(a,b) ((a)+(b)) 204 | #define SUB32_ovflw(a,b) ((a)-(b)) 205 | #define MULT16_16_16(a,b) ((a)*(b)) 206 | #define MULT16_16(a,b) ((opus_val32)(a)*(opus_val32)(b)) 207 | #define MAC16_16(c,a,b) ((c)+(opus_val32)(a)*(opus_val32)(b)) 208 | 209 | #define MULT16_32_Q15(a,b) ((a)*(b)) 210 | #define MULT16_32_Q16(a,b) ((a)*(b)) 211 | 212 | #define MULT32_32_Q31(a,b) ((a)*(b)) 213 | 214 | #define MAC16_32_Q15(c,a,b) ((c)+(a)*(b)) 215 | #define MAC16_32_Q16(c,a,b) ((c)+(a)*(b)) 216 | 217 | #define MULT16_16_Q11_32(a,b) ((a)*(b)) 218 | #define MULT16_16_Q11(a,b) ((a)*(b)) 219 | #define MULT16_16_Q13(a,b) ((a)*(b)) 220 | #define MULT16_16_Q14(a,b) ((a)*(b)) 221 | #define MULT16_16_Q15(a,b) ((a)*(b)) 222 | #define MULT16_16_P15(a,b) ((a)*(b)) 223 | #define MULT16_16_P13(a,b) ((a)*(b)) 224 | #define 
MULT16_16_P14(a,b) ((a)*(b)) 225 | #define MULT16_32_P16(a,b) ((a)*(b)) 226 | 227 | #define DIV32_16(a,b) (((opus_val32)(a))/(opus_val16)(b)) 228 | #define DIV32(a,b) (((opus_val32)(a))/(opus_val32)(b)) 229 | 230 | #define SCALEIN(a) ((a)*CELT_SIG_SCALE) 231 | #define SCALEOUT(a) ((a)*(1/CELT_SIG_SCALE)) 232 | 233 | #define SIG2WORD16(x) (x) 234 | 235 | #endif /* !FIXED_POINT */ 236 | 237 | #ifndef GLOBAL_STACK_SIZE 238 | #ifdef FIXED_POINT 239 | #define GLOBAL_STACK_SIZE 120000 240 | #else 241 | #define GLOBAL_STACK_SIZE 120000 242 | #endif 243 | #endif 244 | 245 | #endif /* ARCH_H */ 246 | -------------------------------------------------------------------------------- /src/celt_lpc.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2009-2010 Xiph.Org Foundation 2 | Written by Jean-Marc Valin */ 3 | /* 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | - Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | - Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include "celt_lpc.h" 28 | #include "arch.h" 29 | #include "common.h" 30 | #include "pitch.h" 31 | 32 | float _celt_lpc( 33 | opus_val16 *_lpc, /* out: [0...p-1] LPC coefficients */ 34 | opus_val16 *rc, 35 | const opus_val32 *ac, /* in: [0...p] autocorrelation values */ 36 | int p 37 | ) 38 | { 39 | int i, j; 40 | opus_val32 r; 41 | opus_val32 error = ac[0]; 42 | #ifdef FIXED_POINT 43 | opus_val32 lpc[LPC_ORDER]; 44 | #else 45 | float *lpc = _lpc; 46 | #endif 47 | 48 | RNN_CLEAR(lpc, p); 49 | RNN_CLEAR(rc, p); 50 | if (ac[0] != 0) 51 | { 52 | for (i = 0; i < p; i++) { 53 | /* Sum up this iteration's reflection coefficient */ 54 | opus_val32 rr = 0; 55 | for (j = 0; j < i; j++) 56 | rr += MULT32_32_Q31(lpc[j],ac[i - j]); 57 | rr += SHR32(ac[i + 1],3); 58 | r = -SHL32(rr,3)/error; 59 | rc[i] = r; 60 | /* Update LPC coefficients and total error */ 61 | lpc[i] = SHR32(r,3); 62 | for (j = 0; j < (i+1)>>1; j++) 63 | { 64 | opus_val32 tmp1, tmp2; 65 | tmp1 = lpc[j]; 66 | tmp2 = lpc[i-1-j]; 67 | lpc[j] = tmp1 + MULT32_32_Q31(r,tmp2); 68 | lpc[i-1-j] = tmp2 + MULT32_32_Q31(r,tmp1); 69 | } 70 | 71 | error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error); 72 | /* Bail out once we get 30 dB gain */ 73 | #ifdef FIXED_POINT 74 | if (error0); 138 | celt_assert(overlap>=0); 139 | if (overlap == 0) 140 | { 141 | xptr = x; 142 | } else { 143 | for (i=0;i 6 | #include 7 | #include 8 | 9 | 10 | float lpc_from_cepstrum(float *lpc, const float *cepstrum); 11 | 12 | #define LOG256 5.5451774445f 13 | static inline float log2_approx(float x) 14 | { 15 | int integer; 16 | float frac; 17 | union { 18 | float f; 19 | int i; 20 | } in; 21 | in.f = x; 22 | integer = (in.i>>23)-127; 23 | in.i -= integer<<23; 24 | frac = in.f - 1.5f; 25 | frac = -0.41445418f + frac*(0.95909232f 26 | + frac*(-0.33951290f + frac*0.16541097f)); 27 | return 1+integer+frac; 28 | } 29 | 30 | #define log_approx(x) (0.69315f*log2_approx(x)) 31 | 32 | /** Copy n elements from src to dst. The 0* term provides compile-time type checking */ 33 | #ifndef OVERRIDE_RNN_COPY 34 | #define RNN_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) )) 35 | #endif 36 | 37 | /** Copy n elements from src to dst, allowing overlapping regions. The 0* term 38 | provides compile-time type checking */ 39 | #ifndef OVERRIDE_RNN_MOVE 40 | #define RNN_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) )) 41 | #endif 42 | 43 | /** Set n elements of dst to zero */ 44 | #ifndef OVERRIDE_RNN_CLEAR 45 | #define RNN_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst)))) 46 | #endif 47 | 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /src/freq.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018 Mozilla */ 2 | /* 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | - Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 18 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "arch.h" 32 | #include "celt_lpc.h" 33 | #include "common.h" 34 | #include "freq.h" 35 | #include "kiss_fft.h" 36 | #include "pitch.h" 37 | 38 | #define SQUARE(x) ((x)*(x)) 39 | 40 | static const opus_int16 eband5ms[] = { 41 | /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k*/ 42 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40 43 | }; 44 | 45 | static const float compensation[] = { 46 | 0.8f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.666667f, 0.5f, 0.5f, 0.5f, 0.333333f, 0.25f, 0.25f, 0.2f, 0.166667f, 0.173913f 47 | }; 48 | 49 | typedef struct { 50 | int init; 51 | kiss_fft_state *kfft; 52 | float half_window[OVERLAP_SIZE]; 53 | float dct_table[NB_BANDS*NB_BANDS]; 54 | } CommonState; 55 | 56 | 57 | 58 | void compute_band_energy(float *bandE, const kiss_fft_cpx *X) { 59 | int i; 60 | float sum[NB_BANDS] = {0}; 61 | for (i=0;i 33 | #include 34 | #include "arch.h" 35 | 36 | #include 37 | #define opus_alloc(x) malloc(x) 38 | #define opus_free(x) free(x) 39 | 40 | #ifdef __cplusplus 41 | extern "C" { 42 | #endif 43 | 44 | #ifdef USE_SIMD 45 | # include 46 | # define kiss_fft_scalar __m128 47 | #define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes) 48 | #else 49 | #define KISS_FFT_MALLOC opus_alloc 50 | #endif 51 | 52 | #ifdef FIXED_POINT 53 | #include "arch.h" 54 | 55 | # define kiss_fft_scalar opus_int32 56 | # define kiss_twiddle_scalar opus_int16 57 | 58 | 59 | #else 60 | # ifndef kiss_fft_scalar 61 | /* default is float */ 62 | # define kiss_fft_scalar float 63 | # define kiss_twiddle_scalar float 64 | # define KF_SUFFIX _celt_single 65 | # endif 66 | #endif 67 | 68 | typedef struct { 69 | kiss_fft_scalar r; 70 | kiss_fft_scalar i; 71 | }kiss_fft_cpx; 72 | 73 | typedef struct { 74 | kiss_twiddle_scalar r; 75 | kiss_twiddle_scalar i; 76 | }kiss_twiddle_cpx; 77 | 78 | #define MAXFACTORS 8 79 | /* e.g. an fft of length 128 has 4 factors 80 | as far as kissfft is concerned 81 | 4*4*4*2 82 | */ 83 | 84 | typedef struct arch_fft_state{ 85 | int is_supported; 86 | void *priv; 87 | } arch_fft_state; 88 | 89 | typedef struct kiss_fft_state{ 90 | int nfft; 91 | opus_val16 scale; 92 | #ifdef FIXED_POINT 93 | int scale_shift; 94 | #endif 95 | int shift; 96 | opus_int16 factors[2*MAXFACTORS]; 97 | const opus_int16 *bitrev; 98 | const kiss_twiddle_cpx *twiddles; 99 | arch_fft_state *arch_fft; 100 | } kiss_fft_state; 101 | 102 | #if defined(HAVE_ARM_NE10) 103 | #include "arm/fft_arm.h" 104 | #endif 105 | 106 | /*typedef struct kiss_fft_state* kiss_fft_cfg;*/ 107 | 108 | /** 109 | * opus_fft_alloc 110 | * 111 | * Initialize a FFT (or IFFT) algorithm's cfg/state buffer. 
112 | * 113 | * typical usage: kiss_fft_cfg mycfg=opus_fft_alloc(1024,0,NULL,NULL); 114 | * 115 | * The return value from fft_alloc is a cfg buffer used internally 116 | * by the fft routine or NULL. 117 | * 118 | * If lenmem is NULL, then opus_fft_alloc will allocate a cfg buffer using malloc. 119 | * The returned value should be free()d when done to avoid memory leaks. 120 | * 121 | * The state can be placed in a user supplied buffer 'mem': 122 | * If lenmem is not NULL and mem is not NULL and *lenmem is large enough, 123 | * then the function places the cfg in mem and the size used in *lenmem 124 | * and returns mem. 125 | * 126 | * If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough), 127 | * then the function returns NULL and places the minimum cfg 128 | * buffer size in *lenmem. 129 | * */ 130 | 131 | kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base, int arch); 132 | 133 | kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch); 134 | 135 | /** 136 | * opus_fft(cfg,in_out_buf) 137 | * 138 | * Perform an FFT on a complex input buffer. 139 | * for a forward FFT, 140 | * fin should be f[0] , f[1] , ... ,f[nfft-1] 141 | * fout will be F[0] , F[1] , ... ,F[nfft-1] 142 | * Note that each element is complex and can be accessed like 143 | f[k].r and f[k].i 144 | * */ 145 | void opus_fft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout); 146 | void opus_ifft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout); 147 | 148 | void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout); 149 | void opus_ifft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout); 150 | 151 | void opus_fft_free(const kiss_fft_state *cfg, int arch); 152 | 153 | 154 | void opus_fft_free_arch_c(kiss_fft_state *st); 155 | int opus_fft_alloc_arch_c(kiss_fft_state *st); 156 | 157 | #if !defined(OVERRIDE_OPUS_FFT) 158 | /* Is run-time CPU detection enabled on this platform? 
*/ 159 | #if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) 160 | 161 | extern int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])( 162 | kiss_fft_state *st); 163 | 164 | #define opus_fft_alloc_arch(_st, arch) \ 165 | ((*OPUS_FFT_ALLOC_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st)) 166 | 167 | extern void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])( 168 | kiss_fft_state *st); 169 | #define opus_fft_free_arch(_st, arch) \ 170 | ((*OPUS_FFT_FREE_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st)) 171 | 172 | extern void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, 173 | const kiss_fft_cpx *fin, kiss_fft_cpx *fout); 174 | #define opus_fft(_cfg, _fin, _fout, arch) \ 175 | ((*OPUS_FFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout)) 176 | 177 | extern void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, 178 | const kiss_fft_cpx *fin, kiss_fft_cpx *fout); 179 | #define opus_ifft(_cfg, _fin, _fout, arch) \ 180 | ((*OPUS_IFFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout)) 181 | 182 | #else /* else for if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */ 183 | 184 | #define opus_fft_alloc_arch(_st, arch) \ 185 | ((void)(arch), opus_fft_alloc_arch_c(_st)) 186 | 187 | #define opus_fft_free_arch(_st, arch) \ 188 | ((void)(arch), opus_fft_free_arch_c(_st)) 189 | 190 | #define opus_fft(_cfg, _fin, _fout, arch) \ 191 | ((void)(arch), opus_fft_c(_cfg, _fin, _fout)) 192 | 193 | #define opus_ifft(_cfg, _fin, _fout, arch) \ 194 | ((void)(arch), opus_ifft_c(_cfg, _fin, _fout)) 195 | 196 | #endif /* end if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */ 197 | #endif /* end if !defined(OVERRIDE_OPUS_FFT) */ 198 | 199 | #ifdef __cplusplus 200 | } 201 | #endif 202 | 203 | #endif 204 | -------------------------------------------------------------------------------- /src/lpcnet.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2018 Mozilla */ 2 | /* 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | - Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 18 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | #ifndef _LPCNET_H_ 28 | #define _LPCNET_H_ 29 | 30 | 31 | #define NB_FEATURES 38 32 | #define NB_TOTAL_FEATURES 55 33 | 34 | 35 | typedef struct LPCNetEncState LPCNetEncState; 36 | 37 | 38 | /** Gets the size of an LPCNetEncState structure. 
39 | * @returns The size in bytes. 40 | */ 41 | int lpcnet_encoder_get_size(); 42 | 43 | /** Initializes a previously allocated encoder state 44 | * The memory pointed to by st must be at least the size returned by lpcnet_encoder_get_size(). 45 | * This is intended for applications which use their own allocator instead of malloc. 46 | * @see lpcnet_encoder_create(),lpcnet_encoder_get_size() 47 | * @param [in] st LPCNetEncState*: Encoder state 48 | * @retval 0 Success 49 | */ 50 | int lpcnet_encoder_init(LPCNetEncState *st); 51 | 52 | /** Allocates and initializes an encoder state. 53 | * @returns The newly created state 54 | */ 55 | LPCNetEncState *lpcnet_encoder_create(); 56 | 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/lpcnet_enc.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2019 Mozilla */ 2 | /* 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | - Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 18 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | */ 26 | #include <math.h> 27 | #include <stdio.h> 28 | #include <stdlib.h> 29 | #include <string.h> 30 | 31 | #include "arch.h" 32 | #include "celt_lpc.h" 33 | #include "common.h" 34 | #include "freq.h" 35 | #include "kiss_fft.h" 36 | #include "lpcnet.h" 37 | #include "lpcnet_private.h" 38 | #include "pitch.h" 39 | 40 | //#define NB_FEATURES (2*NB_BANDS+3+LPC_ORDER) 41 | 42 | int interp_search(const float *x, const float *left, const float *right, float *dist_out) 43 | { 44 | int i, k; 45 | float min_dist = 1e15; 46 | int best_pred = 0; 47 | float pred[4 * NB_BANDS]; 48 | for (i = 0; i < NB_BANDS; i++) 49 | pred[i] = pred[NB_BANDS + i] = .5 * (left[i] + right[i]); 50 | for (i = 0; i < NB_BANDS; i++) 51 | pred[2 * NB_BANDS + i] = left[i]; 52 | for (i = 0; i < NB_BANDS; i++) 53 | pred[3 * NB_BANDS + i] = right[i]; 54 | 55 | for (k = 1; k < 4; k++) 56 | { 57 | float dist = 0; 58 | for (i = 0; i < NB_BANDS; i++) 59 | dist += (x[i] - pred[k * NB_BANDS + i]) * (x[i] - pred[k * NB_BANDS + i]); 60 | dist_out[k - 1] = dist; 61 | if (dist < min_dist) 62 | { 63 | min_dist = dist; 64 | best_pred = k; 65 | } 66 | } 67 | return best_pred - 1; 68 | } 69 | 70 | int double_interp_search(float features[4][NB_TOTAL_FEATURES], const float *mem) 71 | { 72 | int i, j; 73 | int best_id = 0; 74 | float min_dist = 1e15; 75 | float dist[2][3]; 76 | interp_search(features[0], mem, features[1], dist[0]); 77 | interp_search(features[2], features[1], features[3], dist[1]); 78 | for (i = 0; i < 3; i++) 79 | { 80 | for (j = 0; j < 3; j++) 81 | { 82 | float d; 83 | int id; 84 | id = 3 * i + j; 85 | d = dist[0][i] + dist[1][j]; 86 | if (d < min_dist && id != FORBIDDEN_INTERP) 87 | { 88 | min_dist = d; 89 | best_id = id; 90 | } 91 | } 92 | } 93 | return best_id - (best_id >= FORBIDDEN_INTERP); 94 | } 95 | 96 | 97 | int lpcnet_encoder_get_size() { 98 | return sizeof(LPCNetEncState); 99 | } 100 | 101 | int lpcnet_encoder_init(LPCNetEncState *st) { 102 | memset(st, 0, sizeof(*st)); 103 | return 0; 104 | } 105 | 106 | LPCNetEncState *lpcnet_encoder_create() { 107 | LPCNetEncState *st; 108 | st = malloc(lpcnet_encoder_get_size()); 109 | lpcnet_encoder_init(st); 110 | return st; 111 | } 112 | 113 | static void frame_analysis(LPCNetEncState *st, kiss_fft_cpx *X, float *Ex, const float *in) { 114 | float x[WINDOW_SIZE]; 115 | RNN_COPY(x, st->analysis_mem, OVERLAP_SIZE); 116 | RNN_COPY(&x[OVERLAP_SIZE], in, FRAME_SIZE); 117 | RNN_COPY(st->analysis_mem, &in[FRAME_SIZE - OVERLAP_SIZE], OVERLAP_SIZE); 118 | apply_window(x); 119 | forward_transform(X, x); 120 | compute_band_energy(Ex, X); 121 | } 122 | 123 | void compute_frame_features(LPCNetEncState *st, const float *in) { 124 | float aligned_in[FRAME_SIZE]; 125 | int i; 126 | float E = 0; 127 | float Ly[NB_BANDS]; 128 | float follow, logMax; 129 | float g; 130 | kiss_fft_cpx X[FREQ_SIZE]; 131 | float Ex[NB_BANDS]; 132 | float xcorr[PITCH_MAX_PERIOD]; 133 | float ener0; 134 | int sub; 135 | float ener; 136 | RNN_COPY(aligned_in, &st->analysis_mem[OVERLAP_SIZE - TRAINING_OFFSET], TRAINING_OFFSET); 137 | 138 | // Compute bark-scale cepstrum 139 | frame_analysis(st, X, Ex, in); 140 | logMax = -2; 141 | follow = -2; 142 | for (i = 0; i < NB_BANDS; i++) 143 | { 144 | Ly[i] = log10(1e-2 + Ex[i]); 145 | Ly[i] = MAX16(logMax - 8, MAX16(follow - 2.5, Ly[i])); 146 | logMax = MAX16(logMax, Ly[i]); 147 | follow = MAX16(follow - 2.5, Ly[i]); 148 | E += Ex[i]; 149 | } 150 | 151 | // Compute coefficients from bark-scale cepstrum 152 | dct(st->features[st->pcount], Ly); 153 | st->features[st->pcount][0] -= 4; 154 | 155 | // Compute lpcs
from cepstral coefficients 156 | g = lpc_from_cepstrum(st->lpc, st->features[st->pcount]); 157 | 158 | // Store lpcs in features 159 | st->features[st->pcount][2 * NB_BANDS + 2] = log10(g); 160 | for (i = 0; i < LPC_ORDER; i++) 161 | st->features[st->pcount][2 * NB_BANDS + 3 + i] = st->lpc[i]; 162 | 163 | // Move excitation by one frame 164 | RNN_MOVE(st->exc_buf, &st->exc_buf[FRAME_SIZE], PITCH_MAX_PERIOD); 165 | 166 | // Perform yin pitch-tracking 167 | RNN_COPY(&aligned_in[TRAINING_OFFSET], in, FRAME_SIZE - TRAINING_OFFSET); 168 | for (i = 0; i < FRAME_SIZE; i++) 169 | { 170 | int j; 171 | float sum = aligned_in[i]; 172 | for (j = 0; j < LPC_ORDER; j++) 173 | sum += st->lpc[j] * st->pitch_mem[j]; 174 | RNN_MOVE(st->pitch_mem + 1, st->pitch_mem, LPC_ORDER - 1); 175 | st->pitch_mem[0] = aligned_in[i]; 176 | st->exc_buf[PITCH_MAX_PERIOD + i] = sum + .7 * st->pitch_filt; 177 | st->pitch_filt = sum; 178 | } 179 | /* Cross-correlation on half-frames. */ 180 | for (sub = 0; sub < 2; sub++) 181 | { 182 | int off = sub * FRAME_SIZE / 2; 183 | celt_pitch_xcorr(&st->exc_buf[PITCH_MAX_PERIOD + off], st->exc_buf + off, xcorr, FRAME_SIZE / 2, PITCH_MAX_PERIOD); 184 | ener0 = celt_inner_prod(&st->exc_buf[PITCH_MAX_PERIOD + off], &st->exc_buf[PITCH_MAX_PERIOD + off], FRAME_SIZE / 2); 185 | st->frame_weight[2 + 2 * st->pcount + sub] = ener0; 186 | for (i = 0; i < PITCH_MAX_PERIOD; i++) 187 | { 188 | ener = (1 + ener0 + celt_inner_prod(&st->exc_buf[i + off], &st->exc_buf[i + off], FRAME_SIZE / 2)); 189 | st->xc[2 + 2 * st->pcount + sub][i] = 2 * xcorr[i] / ener; 190 | } 191 | } 192 | } 193 | 194 | void process_superframe(LPCNetEncState *st, FILE *ffeat) { 195 | int i; 196 | int sub; 197 | int best_i; 198 | int best[10]; 199 | int pitch_prev[8][PITCH_MAX_PERIOD]; 200 | float best_a = 0; 201 | float best_b = 0; 202 | float w; 203 | float sx = 0, sxx = 0, sxy = 0, sy = 0, sw = 0; 204 | float frame_corr; 205 | int voiced; 206 | float frame_weight_sum = 1e-15; 207 | float center_pitch; 208 | int main_pitch; 209 | int modulation; 210 | for (sub = 0; sub < 8; sub++) 211 | frame_weight_sum += st->frame_weight[2 + sub]; 212 | for (sub = 0; sub < 8; sub++) 213 | st->frame_weight[2 + sub] *= (8.f / frame_weight_sum); 214 | for (sub = 0; sub < 8; sub++) 215 | { 216 | float max_path_all = -1e15; 217 | best_i = 0; 218 | for (i = 0; i < PITCH_MAX_PERIOD - 2 * PITCH_MIN_PERIOD; i++) 219 | { 220 | float xc_half = MAX16(MAX16(st->xc[2 + sub][(PITCH_MAX_PERIOD + i) / 2], st->xc[2 + sub][(PITCH_MAX_PERIOD + i + 2) / 2]), st->xc[2 + sub][(PITCH_MAX_PERIOD + i - 1) / 2]); 221 | if (st->xc[2 + sub][i] < xc_half * 1.1) 222 | st->xc[2 + sub][i] *= .8; 223 | } 224 | for (i = 0; i < PITCH_MAX_PERIOD - PITCH_MIN_PERIOD; i++) 225 | { 226 | int j; 227 | float max_prev; 228 | max_prev = st->pitch_max_path_all - 6.f; 229 | pitch_prev[sub][i] = st->best_i; 230 | for (j = IMIN(0, 4 - i); j <= 4 && i + j < PITCH_MAX_PERIOD - PITCH_MIN_PERIOD; j++) 231 | { 232 | if (st->pitch_max_path[0][i + j] > max_prev) 233 | { 234 | max_prev = st->pitch_max_path[0][i + j] - .02f * abs(j) * abs(j); 235 | pitch_prev[sub][i] = i + j; 236 | } 237 | } 238 | st->pitch_max_path[1][i] = max_prev + st->frame_weight[2 + sub] * st->xc[2 + sub][i]; 239 | if (st->pitch_max_path[1][i] > max_path_all) 240 | { 241 | max_path_all = st->pitch_max_path[1][i]; 242 | best_i = i; 243 | } 244 | } 245 | /* Renormalize. 
*/ 246 | for (i = 0; i < PITCH_MAX_PERIOD - PITCH_MIN_PERIOD; i++) 247 | st->pitch_max_path[1][i] -= max_path_all; 248 | RNN_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD); 249 | st->pitch_max_path_all = max_path_all; 250 | st->best_i = best_i; 251 | } 252 | best_i = st->best_i; 253 | frame_corr = 0; 254 | 255 | /* Backward pass. */ 256 | for (sub = 7; sub >= 0; sub--) 257 | { 258 | best[2 + sub] = PITCH_MAX_PERIOD - best_i; 259 | frame_corr += st->frame_weight[2 + sub] * st->xc[2 + sub][best_i]; 260 | best_i = pitch_prev[sub][best_i]; 261 | } 262 | 263 | frame_corr /= 8; 264 | 265 | for (sub = 2; sub < 10; sub++) 266 | { 267 | w = st->frame_weight[sub]; 268 | sw += w; 269 | sx += w * sub; 270 | sxx += w * sub * sub; 271 | sxy += w * sub * best[sub]; 272 | sy += w * best[sub]; 273 | } 274 | voiced = frame_corr >= .3; 275 | 276 | /* Linear regression to figure out the pitch contour. */ 277 | best_a = (sw * sxy - sx * sy) / (sw * sxx - sx * sx); 278 | if (voiced) 279 | { 280 | float max_a; 281 | float mean_pitch = sy / sw; 282 | 283 | /* Allow a relative variation of up to 1/4 over 8 sub-frames. */ 284 | max_a = mean_pitch / 32; 285 | best_a = MIN16(max_a, MAX16(-max_a, best_a)); 286 | } 287 | else 288 | { 289 | best_a = 0; 290 | } 291 | 292 | best_b = (sy - best_a * sx) / sw; 293 | 294 | /* Quantizing the pitch as "main" pitch + slope. */ 295 | center_pitch = best_b + 5.5 * best_a; 296 | main_pitch = (int)floor(.5 + 21. * log2(center_pitch / PITCH_MIN_PERIOD)); 297 | main_pitch = IMAX(0, IMIN(63, main_pitch)); 298 | modulation = (int)floor(.5 + 16 * 7 * best_a / center_pitch); 299 | modulation = IMAX(-3, IMIN(3, modulation)); 300 | 301 | for (sub = 0; sub < 4; sub++) 302 | { 303 | st->features[sub][2 * NB_BANDS] = .01 * (best[2 + 2 * sub] + best[2 + 2 * sub + 1] - 200); 304 | st->features[sub][2 * NB_BANDS + 1] = frame_corr - .5; 305 | } 306 | RNN_COPY(&st->xc[0][0], &st->xc[8][0], PITCH_MAX_PERIOD); 307 | RNN_COPY(&st->xc[1][0], &st->xc[9][0], PITCH_MAX_PERIOD); 308 | for (sub = 0; sub < 4; sub++) 309 | { 310 | float g = lpc_from_cepstrum(st->lpc, st->features[sub]); 311 | st->features[sub][2 * NB_BANDS + 2] = log10(g); 312 | for (i = 0; i < LPC_ORDER; i++) 313 | st->features[sub][2 * NB_BANDS + 3 + i] = st->lpc[i]; 314 | } 315 | if (ffeat) 316 | { 317 | for (i = 0; i < 4; i++) 318 | { 319 | fwrite(st->features[i], sizeof(float), NB_TOTAL_FEATURES, ffeat); 320 | } 321 | } 322 | } 323 | -------------------------------------------------------------------------------- /src/lpcnet_private.h: -------------------------------------------------------------------------------- 1 | #ifndef LPCNET_PRIVATE_H 2 | #define LPCNET_PRIVATE_H 3 | 4 | #include "celt_lpc.h" 5 | #include "common.h" 6 | #include "freq.h" 7 | #include "lpcnet.h" 8 | 9 | #define PITCH_MIN_PERIOD 32 // Shortest pitch period in samples (500 Hz at 16 kHz) 10 | #define PITCH_MAX_PERIOD 256 // Longest pitch period in samples (62.5 Hz at 16 kHz) 11 | 12 | #define PITCH_FRAME_SIZE 320 // Samples per pitch-analysis frame 13 | #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD + PITCH_FRAME_SIZE) // Excitation history plus one pitch frame 14 | 15 | #define FORBIDDEN_INTERP 7 16 | 17 | struct LPCNetEncState 18 | { 19 | float analysis_mem[OVERLAP_SIZE]; // Overlap samples carried into the next windowed FFT 20 | int pcount; // Frames accumulated toward the current 4-frame block 21 | float pitch_mem[LPC_ORDER]; // Past input samples for the LPC analysis filter 22 | float pitch_filt; // One-tap filter state applied to the LPC residual 23 | float xc[10][PITCH_MAX_PERIOD + 1]; // Normalized cross-correlation per half-frame (2 carried over + 8 current) 24 | float frame_weight[10]; // Energy-based weight per half-frame 25 | float exc_buf[PITCH_BUF_SIZE]; // Excitation history used for pitch cross-correlation 26 | float pitch_max_path[2][PITCH_MAX_PERIOD]; // Viterbi path scores (previous and current column) 27 | float pitch_max_path_all; // Best cumulative path score so far 28 | int best_i; // Index of the current best pitch candidate 29 | float lpc[LPC_ORDER]; // LPC coefficients of the current frame 30 | float features[4][NB_TOTAL_FEATURES]; // Feature vectors for the 4-frame block 31 | float sig_mem[LPC_ORDER]; // Past signal samples for sample-level prediction 32 | int exc_mem; // Previous mu-law excitation sample 33 | }; 34 |
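The state above is filled in two passes: compute_frame_features() (declared just below) populates one frame slot per call, and process_superframe() runs the pitch tracker over a completed 4-frame block and writes that block's features. A minimal driving loop, distilled from the main() of src/preprocess.c further down, might look like the following sketch; encode_block is a hypothetical name, not a function in this repository, and the FRAME_SIZE fallback mirrors the guard in src/preprocess.c:

    #include <stdio.h>
    #include "lpcnet.h"
    #include "lpcnet_private.h"

    #ifndef FRAME_SIZE
    #define FRAME_SIZE 160 /* assumed, as in src/preprocess.c */
    #endif

    /* Sketch: consume 4 * FRAME_SIZE float samples and emit one superframe. */
    static void encode_block(LPCNetEncState *st, const float *samples, FILE *ffeat) {
        for (int k = 0; k < 4; k++) {
            compute_frame_features(st, samples + k * FRAME_SIZE); /* cepstrum, LPC, xcorr */
            st->pcount++;                                         /* advance the frame slot */
        }
        process_superframe(st, ffeat); /* pitch track, then write 4 feature vectors */
        st->pcount = 0;                /* start the next block */
    }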
35 | void process_superframe(LPCNetEncState *st, FILE *ffeat); 36 | 37 | void compute_frame_features(LPCNetEncState *st, const float *in); 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /src/opus_types.h: -------------------------------------------------------------------------------- 1 | /* (C) COPYRIGHT 1994-2002 Xiph.Org Foundation */ 2 | /* Modified by Jean-Marc Valin */ 3 | /* 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | - Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | - Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | /* opus_types.h based on ogg_types.h from libogg */ 28 | 29 | /** 30 | @file opus_types.h 31 | @brief Opus reference implementation types 32 | */ 33 | #ifndef OPUS_TYPES_H 34 | #define OPUS_TYPES_H 35 | 36 | /* Use the real stdint.h if it's there (taken from Paul Hsieh's pstdint.h) */ 37 | #if (defined(__STDC__) && __STDC__ && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_)) || defined (HAVE_STDINT_H)) 38 | #include <stdint.h> 39 | 40 | typedef int16_t opus_int16; 41 | typedef uint16_t opus_uint16; 42 | typedef int32_t opus_int32; 43 | typedef uint32_t opus_uint32; 44 | #elif defined(_WIN32) 45 | 46 | # if defined(__CYGWIN__) 47 | # include <_G_config.h> 48 | typedef _G_int32_t opus_int32; 49 | typedef _G_uint32_t opus_uint32; 50 | typedef _G_int16 opus_int16; 51 | typedef _G_uint16 opus_uint16; 52 | # elif defined(__MINGW32__) 53 | typedef short opus_int16; 54 | typedef unsigned short opus_uint16; 55 | typedef int opus_int32; 56 | typedef unsigned int opus_uint32; 57 | # elif defined(__MWERKS__) 58 | typedef int opus_int32; 59 | typedef unsigned int opus_uint32; 60 | typedef short opus_int16; 61 | typedef unsigned short opus_uint16; 62 | # else 63 | /* MSVC/Borland */ 64 | typedef __int32 opus_int32; 65 | typedef unsigned __int32 opus_uint32; 66 | typedef __int16 opus_int16; 67 | typedef unsigned __int16 opus_uint16; 68 | # endif 69 | 70 | #elif defined(__MACOS__) 71 | 72 | # include <sys/types.h> 73 | typedef SInt16 opus_int16; 74 | typedef UInt16 opus_uint16; 75 | typedef SInt32 opus_int32; 76 | typedef UInt32 opus_uint32; 77 | 78 | #elif (defined(__APPLE__) && defined(__MACH__)) /* MacOS X Framework build */ 79 | 80 | # include <sys/types.h> 81 | typedef int16_t opus_int16; 82 | typedef
u_int16_t opus_uint16; 83 | typedef int32_t opus_int32; 84 | typedef u_int32_t opus_uint32; 85 | 86 | #elif defined(__BEOS__) 87 | 88 | /* Be */ 89 | # include <inttypes.h> 90 | typedef int16 opus_int16; 91 | typedef u_int16 opus_uint16; 92 | typedef int32_t opus_int32; 93 | typedef u_int32_t opus_uint32; 94 | 95 | #elif defined (__EMX__) 96 | 97 | /* OS/2 GCC */ 98 | typedef short opus_int16; 99 | typedef unsigned short opus_uint16; 100 | typedef int opus_int32; 101 | typedef unsigned int opus_uint32; 102 | 103 | #elif defined (DJGPP) 104 | 105 | /* DJGPP */ 106 | typedef short opus_int16; 107 | typedef unsigned short opus_uint16; 108 | typedef int opus_int32; 109 | typedef unsigned int opus_uint32; 110 | 111 | #elif defined(R5900) 112 | 113 | /* PS2 EE */ 114 | typedef int opus_int32; 115 | typedef unsigned opus_uint32; 116 | typedef short opus_int16; 117 | typedef unsigned short opus_uint16; 118 | 119 | #elif defined(__SYMBIAN32__) 120 | 121 | /* Symbian GCC */ 122 | typedef signed short opus_int16; 123 | typedef unsigned short opus_uint16; 124 | typedef signed int opus_int32; 125 | typedef unsigned int opus_uint32; 126 | 127 | #elif defined(CONFIG_TI_C54X) || defined (CONFIG_TI_C55X) 128 | 129 | typedef short opus_int16; 130 | typedef unsigned short opus_uint16; 131 | typedef long opus_int32; 132 | typedef unsigned long opus_uint32; 133 | 134 | #elif defined(CONFIG_TI_C6X) 135 | 136 | typedef short opus_int16; 137 | typedef unsigned short opus_uint16; 138 | typedef int opus_int32; 139 | typedef unsigned int opus_uint32; 140 | 141 | #else 142 | 143 | /* Give up, take a reasonable guess */ 144 | typedef short opus_int16; 145 | typedef unsigned short opus_uint16; 146 | typedef int opus_int32; 147 | typedef unsigned int opus_uint32; 148 | 149 | #endif 150 | 151 | #define opus_int int /* used for counters etc; at least 16 bits */ 152 | #define opus_int64 long long 153 | #define opus_int8 signed char 154 | 155 | #define opus_uint unsigned int /* used for counters etc; at least 16 bits */ 156 | #define opus_uint64 unsigned long long 157 | #define opus_uint8 unsigned char 158 | 159 | #endif /* OPUS_TYPES_H */ 160 | -------------------------------------------------------------------------------- /src/pitch.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2007-2008 CSIRO 2 | Copyright (c) 2007-2009 Xiph.Org Foundation 3 | Written by Jean-Marc Valin */ 4 | /** 5 | @file pitch.h 6 | @brief Pitch analysis 7 | */ 8 | 9 | /* 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions 12 | are met: 13 | 14 | - Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | 17 | - Redistributions in binary form must reproduce the above copyright 18 | notice, this list of conditions and the following disclaimer in the 19 | documentation and/or other materials provided with the distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER 25 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | 34 | #ifndef PITCH_H 35 | #define PITCH_H 36 | 37 | #include "arch.h" 38 | 39 | void pitch_downsample(opus_val16 *x_lp, 40 | int len); 41 | 42 | void pitch_search(const opus_val16 *x_lp, opus_val16 *y, 43 | int len, int max_pitch, int *pitch); 44 | 45 | opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, 46 | int N, int *T0, int prev_period, opus_val16 prev_gain); 47 | 48 | 49 | /* OPT: This is the kernel you really want to optimize. It gets used a lot 50 | by the prefilter and by the PLC. */ 51 | static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) 52 | { 53 | int j; 54 | opus_val16 y_0, y_1, y_2, y_3; 55 | celt_assert(len>=3); 56 | y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ 57 | y_0=*y++; 58 | y_1=*y++; 59 | y_2=*y++; 60 | for (j=0;j<len-3;j+=4) -------------------------------------------------------------------------------- /src/preprocess.c: -------------------------------------------------------------------------------- 26 | #include <math.h> 27 | #include <stdio.h> 28 | #include <stdlib.h> 29 | #include <string.h> 30 | 31 | #include "lpcnet.h" 32 | #include "lpcnet_private.h" 33 | 34 | 35 | /****************************************************************************** 36 | Constants 37 | ******************************************************************************/ 38 | 39 | 40 | #ifndef FRAME_SIZE 41 | #define FRAME_SIZE 160 42 | #endif 43 | 44 | #ifndef HALF_FRAME 45 | #define HALF_FRAME (FRAME_SIZE / 2) 46 | #endif 47 | 48 | #ifndef LOG256 49 | #define LOG256 5.5451774445f 50 | #endif 51 | 52 | #ifndef LPC_ORDER 53 | #define LPC_ORDER 16 54 | #endif 55 | 56 | #ifndef NB_BANDS 57 | #define NB_BANDS 18 58 | #endif 59 | 60 | #ifndef OVERLAP_SIZE 61 | #define OVERLAP_SIZE 160 62 | #endif 63 | 64 | #ifndef PITCH_MAX_PERIOD 65 | #define PITCH_MAX_PERIOD 256 66 | #endif 67 | 68 | #ifndef WINDOW_SIZE 69 | #define WINDOW_SIZE (FRAME_SIZE + OVERLAP_SIZE) 70 | #endif 71 | 72 | #ifndef FREQ_SIZE 73 | #define FREQ_SIZE (WINDOW_SIZE / 2 + 1) 74 | #endif 75 | 76 | #ifndef log_approx 77 | #define log_approx(x) (0.69315f * log2_approx(x)) 78 | #endif 79 | 80 | #ifndef max 81 | #define max(a, b) (((a) > (b)) ? (a) : (b)) 82 | #endif 83 | 84 | #ifndef min 85 | #define min(a, b) (((a) < (b)) ? (a) : (b)) 86 | #endif 87 | 88 | 89 | /****************************************************************************** 90 | Conversions 91 | ******************************************************************************/ 92 | 93 | 94 | static inline float mulaw_to_linear(float u) { 95 | /* Convert mulaw-encoded audio to linear */ 96 | float s; 97 | float scale_1 = 32768.f / 255.f; 98 | u = u - 128; 99 | s = u >= 0 ? 1 : -1; 100 | u = fabs(u); 101 | return s * scale_1 * (exp(u / 128. * LOG256) - 1); 102 | } 103 | 104 | 105 | static inline int linear_to_mulaw(float x) { 106 | /* Convert linear audio to mulaw */ 107 | float u; 108 | float scale = 255.f / 32768.f; 109 | int s = x >= 0 ?
1 : -1; 110 | x = fabs(x); 111 | u = (s * (128 * log_approx(1 + scale * x) / LOG256)); 112 | u = 128 + u; 113 | if (u < 0) 114 | u = 0; 115 | if (u > 255) 116 | u = 255; 117 | return (int)floor(.5 + u); 118 | } 119 | 120 | 121 | /****************************************************************************** 122 | File output 123 | ******************************************************************************/ 124 | 125 | 126 | void write_audio(LPCNetEncState *st, const short *pcm, FILE *file) { 127 | // Iterate over frames in a block 128 | for (int k = 0; k < 4; k++) { 129 | 130 | // Container for sample-level features 131 | unsigned char data[4 * FRAME_SIZE]; 132 | 133 | // Iterate over samples in a frame 134 | for (int i = 0; i < FRAME_SIZE; i++) { 135 | 136 | // Compute prediction from lpc coefficients and previous samples 137 | float p = 0; 138 | for (int j = 0; j < LPC_ORDER; j++) 139 | p -= st->features[k][2 * NB_BANDS + 3 + j] * st->sig_mem[j]; 140 | 141 | // Compute excitation from sample and prediction 142 | float e = linear_to_mulaw(pcm[k * FRAME_SIZE + i] - p); 143 | 144 | // Mu-law encoded signal 145 | data[4 * i] = linear_to_mulaw(st->sig_mem[0]); 146 | 147 | // Mu-law encoded prediction 148 | data[4 * i + 1] = linear_to_mulaw(p); 149 | 150 | // Input excitation 151 | data[4 * i + 2] = st->exc_mem; 152 | 153 | // Target excitation 154 | data[4 * i + 3] = e; 155 | 156 | // Delay signal by one 157 | unsigned int size = (LPC_ORDER - 1) * sizeof(float); 158 | memmove(&st->sig_mem[1], &st->sig_mem[0], size); 159 | 160 | // Bound excitation 161 | e = min(255, max(0, e)); 162 | 163 | // Store computed values for next iteration 164 | st->sig_mem[0] = p + mulaw_to_linear(e); 165 | st->exc_mem = e; 166 | } 167 | 168 | // Write sample-rate features to disk 169 | fwrite(data, 4 * FRAME_SIZE, 1, file); 170 | } 171 | } 172 | 173 | 174 | /****************************************************************************** 175 | Entry point 176 | ******************************************************************************/ 177 | 178 | 179 | int main(int argc, char **argv) { 180 | float x[FRAME_SIZE]; 181 | FILE *output_sample_file = NULL; 182 | short pcm[FRAME_SIZE] = {0}; 183 | short pcmbuf[FRAME_SIZE * 4] = {0}; 184 | 185 | // Create encoder 186 | LPCNetEncState *st = lpcnet_encoder_create(); 187 | 188 | // Open input audio file 189 | FILE *input_file = fopen(argv[1], "rb"); 190 | 191 | // Open output feature file 192 | FILE *output_feature_file = fopen(argv[2], "wb"); 193 | 194 | // Open output file for sample-rate features and training targets 195 | if (argc == 4) output_sample_file = fopen(argv[3], "wb"); 196 | 197 | // Read in up to FRAME_SIZE samples 198 | while (fread(pcm, sizeof(short), FRAME_SIZE, input_file) == FRAME_SIZE) { 199 | 200 | // Cast to float 201 | for (int i = 0; i < FRAME_SIZE; i++) x[i] = pcm[i]; 202 | 203 | // Compute pitch, correlation, and bark-scale coefficients 204 | compute_frame_features(st, x); 205 | 206 | // Copy frame into position in 4-frame buffer 207 | memcpy(&pcmbuf[st->pcount * FRAME_SIZE], pcm, FRAME_SIZE * sizeof(*pcm)); 208 | st->pcount++; 209 | 210 | // Running on groups of 4 frames 211 | if (st->pcount == 4) { 212 | process_superframe(st, output_feature_file); 213 | 214 | // If training, write audio frame 215 | if (output_sample_file) write_audio(st, pcmbuf, output_sample_file); 216 | 217 | // Reset count 218 | st->pcount = 0; 219 | } 220 | } 221 | 222 | // Close files 223 | fclose(input_file); 224 | fclose(output_feature_file); 225 | if 
(output_sample_file) fclose(output_sample_file); 226 | 227 | // Clean-up encoder memory 228 | free(st); 229 | 230 | return 0; 231 | } 232 | --------------------------------------------------------------------------------
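Each process_superframe() call in the loop above appends four records of NB_TOTAL_FEATURES (55) float32 values to the feature file, one record per 10 ms frame at LPCNet's 16 kHz rate; per src/lpcnet_enc.c, indices 0-17 hold the bark-scale cepstrum, 36 the scaled pitch period, 37 the pitch correlation (offset by -0.5), 38 the LPC gain (log10 g), and 39-54 the LPC coefficients. A standalone reader of that layout might look like the following sketch; it is not a file in this repository:

    #include <stdio.h>

    #define NB_TOTAL_FEATURES 55 /* matches src/lpcnet.h */

    /* Sketch: count the feature frames written by the preprocessor. */
    int main(int argc, char **argv) {
        if (argc != 2) {
            fprintf(stderr, "usage: %s features.f32\n", argv[0]);
            return 1;
        }
        FILE *f = fopen(argv[1], "rb");
        if (!f) {
            perror("fopen");
            return 1;
        }
        float frame[NB_TOTAL_FEATURES];
        long count = 0;
        while (fread(frame, sizeof(float), NB_TOTAL_FEATURES, f) == NB_TOTAL_FEATURES)
            count++; /* one 55-float record per 10 ms frame */
        fclose(f);
        printf("%ld frames (%.2f s of audio)\n", count, count * 0.01);
        return 0;
    }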