├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── clpcnet ├── __init__.py ├── __main__.py ├── assets │ ├── checkpoints │ │ ├── model.h5 │ │ └── original.h5 │ ├── partition │ │ ├── daps-segmented.json │ │ ├── ravdess-hifi.json │ │ ├── ravdess-variable.json │ │ └── vctk.json │ └── text │ │ └── ravdess │ │ ├── 01.txt │ │ └── 02.txt ├── config.py ├── convert.py ├── core.py ├── data.py ├── evaluate │ ├── __init__.py │ ├── dtw.py │ ├── duration.py │ ├── gather.py │ ├── objective │ │ ├── __init__.py │ │ ├── constant.py │ │ └── variable.py │ ├── pitch.py │ ├── plot.py │ ├── prosody.py │ └── subjective │ │ ├── __init__.py │ │ ├── constant.py │ │ └── variable.py ├── load.py ├── loudness.py ├── model.py ├── mp3.py ├── partition.py ├── pitch.py ├── preprocess │ ├── __init__.py │ ├── __main__.py │ ├── augment.py │ └── core.py ├── session.py ├── train.py └── world.py ├── data └── .gitkeep ├── requirements.txt ├── runs ├── cache │ └── .gitkeep ├── checkpoints │ └── .gitkeep ├── eval │ └── .gitkeep └── log │ └── .gitkeep ├── setup.py └── src ├── _kiss_fft_guts.h ├── arch.h ├── celt_lpc.c ├── celt_lpc.h ├── common.h ├── freq.c ├── freq.h ├── kiss_fft.c ├── kiss_fft.h ├── lpcnet.h ├── lpcnet_enc.c ├── lpcnet_private.h ├── opus_types.h ├── pitch.c ├── pitch.h └── preprocess.c /.dockerignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | .git/ 3 | bin/ 4 | data/ 5 | runs/ 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Directories 2 | *.egg-info/ 3 | __pycache__/ 4 | .ipynb_checkpoints/ 5 | .vscode/ 6 | data/ 7 | !data/.gitkeep 8 | htk/ 9 | notebooks/ 10 | runs/ 11 | !runs/*/.gitkeep 12 | bin/ 13 | packages/ 14 | !clpcnet/assets/ 15 | 16 | # Metadata 17 | .DS_Store 18 | ._.DS_Store 19 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-cudnn7-runtime-ubuntu18.04 2 | 3 | # Apt dependencies 4 | RUN apt-get update && \ 5 | apt-get install -y \ 6 | ffmpeg \ 7 | gcc-multilib \ 8 | libsndfile1 \ 9 | make \ 10 | sox \ 11 | wget 12 | 13 | # Conda setup (from continuumio/miniconda3 image) 14 | ENV PATH /opt/conda/bin:$PATH 15 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \ 16 | mkdir -p /opt && \ 17 | sh miniconda.sh -b -p /opt/conda && \ 18 | rm miniconda.sh && \ 19 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 20 | echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 21 | find /opt/conda/ -follow -type f -name '*.a' -delete && \ 22 | find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ 23 | /opt/conda/bin/conda clean -afy && \ 24 | /opt/conda/bin/conda create -n clpcnet python=3.7 -y && \ 25 | echo "conda activate clpcnet" >> ~/.bashrc 26 | 27 | # Make RUN commands use the new environment 28 | SHELL ["conda", "run", "-n", "clpcnet", "/bin/bash", "-c"] 29 | 30 | # Conda environment setup 31 | RUN conda install -c anaconda cudatoolkit=10.0 cudnn=7.6 -y 32 | 33 | # Allow users to specify a directory for HTK 34 | ARG HTK=htk 35 | 36 | # Setup htk 37 | COPY $HTK /htk 38 | WORKDIR /htk 39 | RUN ./configure --disable-hslab && make all && make install 40 | 41 | # Copy python setup files 42 | COPY requirements.txt /clpcnet/requirements.txt 43 | 44 | # Install python dependencies 45 | WORKDIR /clpcnet 46 | RUN pip install -r requirements.txt 47 | 48 | # Copy C preprocessing code 49 | COPY Makefile /clpcnet/Makefile 50 | COPY src /clpcnet/src 51 | 52 | # Build C preprocessing code 53 | RUN make 54 | 55 | # Copy module 56 | COPY README.md /clpcnet/README.md 57 | COPY setup.py /clpcnet/setup.py 58 | COPY clpcnet /clpcnet/clpcnet 59 | 60 | # Install module 61 | RUN pip install -e . 62 | 63 | # Start bash shell when run 64 | CMD ["bash"] 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ADOBE RESEARCH LICENSE 2 | 3 | This license (the "License") between Adobe Inc., having a place of business at 345 Park Avenue, San Jose, California 95110-2704 ("Adobe"), and you, the individual or entity exercising rights under this License ("you" or "your"), sets forth the terms for your use of certain research materials that are owned by Adobe (the "Licensed Materials"). By exercising rights under this License, you accept and agree to be bound by its terms. If you are exercising rights under this license on behalf of an entity, then "you" means you and such entity, and you (personally) represent and warrant that you (personally) have all necessary authority to bind that entity to the terms of this License. 4 | 5 | 1. GRANT OF LICENSE. 6 | 7 | 1.1 Adobe grants you a nonexclusive, worldwide, royalty-free, fully paid license to (A) reproduce, use, modify, and publicly display and perform the Licensed Materials for noncommercial research purposes only; and (B) redistribute the Licensed Materials, and modifications or derivative works thereof, for noncommercial research purposes only, provided that you give recipients a copy of this License. 8 | 9 | 1.2 You may add your own copyright statement to your modifications and may provide additional or different license terms for use, reproduction, modification, public display and performance, and redistribution of your modifications and derivative works, provided that such license terms limit the use, reproduction, modification, public display and performance, and redistribution of such modifications and derivative works to noncommercial research purposes only. 10 | 11 | 1.3 For purposes of this License, noncommercial research purposes include academic research, teaching, and testing, but do not include commercial licensing or distribution, development of commercial products, or any other activity which results in commercial gain. 12 | 13 | 2. OWNERSHIP AND ATTRIBUTION. Adobe and its licensors own all right, title, and interest in the Licensed Materials. 
You must keep intact any copyright or other notices or disclaimers in the Licensed Materials. 14 | 15 | 3. DISCLAIMER OF WARRANTIES. THE LICENSED MATERIALS ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. THE ENTIRE RISK AS TO THE RESULTS AND PERFORMANCE OF THE LICENSED MATERIALS IS ASSUMED BY YOU. ADOBE DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED OR STATUTORY, WITH REGARD TO ANY LICENSED MATERIALS PROVIDED UNDER THIS LICENSE, INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT OF THIRD-PARTY RIGHTS. 16 | 17 | 4. LIMITATION OF LIABILITY. IN NO EVENT WILL ADOBE BE LIABLE FOR ANY ACTUAL, INCIDENTAL, SPECIAL OR CONSEQUENTIAL DAMAGES OF ANY NATURE WHATSOEVER, INCLUDING WITHOUT LIMITATION, LOSS OF PROFITS OR OTHER COMMERCIAL LOSS, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF ANY LICENSED MATERIALS PROVIDED UNDER THIS LICENSE, EVEN IF ADOBE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 18 | 19 | 5. TERM AND TERMINATION. 20 | 21 | 5.1 The License is effective upon acceptance by you and will remain in effect unless terminated earlier as permitted under this License. 22 | 23 | 5.2 If you breach any material provision of this License, then your rights will terminate immediately. 24 | 25 | 5.3 All clauses which by their nature should survive the termination of this License will survive such termination. In addition, and without limiting the generality of the preceding sentence, Sections 2 (Ownership and Attribution), 3 (Disclaimer of Warranties), 4 (Limitation of Liability) will survive termination of this License. 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | files = src/celt_lpc.c src/freq.c src/kiss_fft.c src/lpcnet_enc.c src/pitch.c src/preprocess.c 2 | 3 | all: preprocess 4 | 5 | preprocess: 6 | mkdir -p bin/ 7 | gcc -Wall -W -O3 -g -I src/ $(files) -o bin/preprocess -lm 8 | 9 | clean: 10 | rm -rf bin/preprocess 11 | -------------------------------------------------------------------------------- /clpcnet/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # FATAL 5 | logging.getLogger('tensorflow').setLevel(logging.FATAL) 6 | 7 | from .config import * 8 | from .core import * 9 | from .model import DualDense, model 10 | from .session import Session 11 | from . import convert 12 | from . import data 13 | from . import evaluate 14 | from . import load 15 | from . import loudness 16 | from . import mp3 17 | from . import partition 18 | from . import pitch 19 | from . import preprocess 20 | from . 
import train 21 | -------------------------------------------------------------------------------- /clpcnet/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import clpcnet 5 | 6 | 7 | ############################################################################### 8 | # Entry point 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser() 15 | 16 | # Audio I/O 17 | parser.add_argument( 18 | '--audio_files', 19 | type=Path, 20 | nargs='+', 21 | help='The audio files to process') 22 | parser.add_argument( 23 | '--output_files', 24 | type=Path, 25 | nargs='+', 26 | required=True, 27 | help='The files to write the output audio') 28 | 29 | # Time-stretching 30 | parser.add_argument( 31 | '--source_alignment_files', 32 | type=Path, 33 | nargs='+', 34 | help='The original alignments on disk. Used only for time-stretching.') 35 | parser.add_argument( 36 | '--target_alignment_files', 37 | type=Path, 38 | nargs='+', 39 | help='The target alignments on disk. Used only for time-stretching.') 40 | parser.add_argument( 41 | '--constant_stretch', 42 | type=float, 43 | help='A constant value for time-stretching') 44 | 45 | # Pitch shifting 46 | parser.add_argument( 47 | '--source_pitch_files', 48 | type=Path, 49 | nargs='+', 50 | help='The file containing the original pitch contours') 51 | parser.add_argument( 52 | '--source_periodicity_files', 53 | type=Path, 54 | nargs='+', 55 | help='The file containing the original periodicities') 56 | parser.add_argument( 57 | '--target_pitch_files', 58 | type=Path, 59 | nargs='+', 60 | help='The files containing the desired pitch contours') 61 | parser.add_argument( 62 | '--constant_shift', 63 | type=float, 64 | help='A constant value for pitch-shifting') 65 | 66 | # Model checkpoint 67 | parser.add_argument( 68 | '--checkpoint_file', 69 | type=Path, 70 | default=clpcnet.DEFAULT_CHECKPOINT, 71 | help='The checkpoint file to load') 72 | 73 | return parser.parse_args() 74 | 75 | 76 | if __name__ == '__main__': 77 | clpcnet.from_files_to_files(**vars(parse_args())) 78 | -------------------------------------------------------------------------------- /clpcnet/assets/checkpoints/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/clpcnet/assets/checkpoints/model.h5 -------------------------------------------------------------------------------- /clpcnet/assets/checkpoints/original.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/clpcnet/assets/checkpoints/original.h5 -------------------------------------------------------------------------------- /clpcnet/assets/partition/daps-segmented.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": [ 3 | "f1_script1_clean-00007", 4 | "f1_script1_clean-00003", 5 | "f1_script1_clean-00025", 6 | "f1_script1_clean-00019", 7 | "f1_script1_clean-00006", 8 | "f1_script1_clean-00014", 9 | "f1_script1_clean-00033", 10 | "f1_script1_clean-00009", 11 | "f1_script1_clean-00013", 12 | "f1_script1_clean-00012", 13 | "f3_script1_clean-00031", 14 | "f3_script1_clean-00034", 15 | 
"f3_script1_clean-00025", 16 | "f3_script1_clean-00028", 17 | "f3_script1_clean-00032", 18 | "f3_script1_clean-00006", 19 | "f3_script1_clean-00019", 20 | "f3_script1_clean-00004", 21 | "f3_script1_clean-00021", 22 | "f3_script1_clean-00014", 23 | "f4_script1_clean-00033", 24 | "f4_script1_clean-00005", 25 | "f4_script1_clean-00016", 26 | "f4_script1_clean-00010", 27 | "f4_script1_clean-00003", 28 | "f4_script1_clean-00029", 29 | "f4_script1_clean-00034", 30 | "f4_script1_clean-00011", 31 | "f4_script1_clean-00000", 32 | "f4_script1_clean-00019", 33 | "f5_script1_clean-00003", 34 | "f5_script1_clean-00021", 35 | "f5_script1_clean-00014", 36 | "f5_script1_clean-00015", 37 | "f5_script1_clean-00007", 38 | "f5_script1_clean-00013", 39 | "f5_script1_clean-00020", 40 | "f5_script1_clean-00032", 41 | "f5_script1_clean-00022", 42 | "f5_script1_clean-00000", 43 | "f6_script1_clean-00027", 44 | "f6_script1_clean-00031", 45 | "f6_script1_clean-00007", 46 | "f6_script1_clean-00005", 47 | "f6_script1_clean-00001", 48 | "f6_script1_clean-00016", 49 | "f6_script1_clean-00008", 50 | "f6_script1_clean-00012", 51 | "f6_script1_clean-00011", 52 | "f6_script1_clean-00020", 53 | "m1_script1_clean-00010", 54 | "m1_script1_clean-00018", 55 | "m1_script1_clean-00026", 56 | "m1_script1_clean-00017", 57 | "m1_script1_clean-00032", 58 | "m1_script1_clean-00016", 59 | "m1_script1_clean-00029", 60 | "m1_script1_clean-00033", 61 | "m1_script1_clean-00006", 62 | "m1_script1_clean-00001", 63 | "m3_script1_clean-00028", 64 | "m3_script1_clean-00007", 65 | "m3_script1_clean-00010", 66 | "m3_script1_clean-00002", 67 | "m3_script1_clean-00025", 68 | "m3_script1_clean-00024", 69 | "m3_script1_clean-00032", 70 | "m3_script1_clean-00000", 71 | "m3_script1_clean-00019", 72 | "m3_script1_clean-00033", 73 | "m4_script1_clean-00029", 74 | "m4_script1_clean-00018", 75 | "m4_script1_clean-00006", 76 | "m4_script1_clean-00021", 77 | "m4_script1_clean-00034", 78 | "m4_script1_clean-00004", 79 | "m4_script1_clean-00008", 80 | "m4_script1_clean-00028", 81 | "m4_script1_clean-00022", 82 | "m4_script1_clean-00005", 83 | "m5_script1_clean-00014", 84 | "m5_script1_clean-00003", 85 | "m5_script1_clean-00002", 86 | "m5_script1_clean-00027", 87 | "m5_script1_clean-00015", 88 | "m5_script1_clean-00022", 89 | "m5_script1_clean-00021", 90 | "m5_script1_clean-00016", 91 | "m5_script1_clean-00013", 92 | "m5_script1_clean-00017", 93 | "m6_script1_clean-00031", 94 | "m6_script1_clean-00033", 95 | "m6_script1_clean-00009", 96 | "m6_script1_clean-00010", 97 | "m6_script1_clean-00022", 98 | "m6_script1_clean-00016", 99 | "m6_script1_clean-00029", 100 | "m6_script1_clean-00012", 101 | "m6_script1_clean-00018", 102 | "m6_script1_clean-00023" 103 | ] 104 | } -------------------------------------------------------------------------------- /clpcnet/assets/partition/ravdess-hifi.json: -------------------------------------------------------------------------------- 1 | { 2 | "test": [ 3 | "03-01-01-01-02-02-06", 4 | "03-01-08-01-02-02-13", 5 | "03-01-06-01-02-02-08", 6 | "03-01-06-01-01-01-07", 7 | "03-01-07-01-02-01-15", 8 | "03-01-04-01-02-01-19", 9 | "03-01-06-01-01-02-21", 10 | "03-01-05-01-01-02-22", 11 | "03-01-08-01-01-02-02", 12 | "03-01-02-01-02-01-07", 13 | "03-01-08-01-01-02-19", 14 | "03-01-04-01-02-02-12", 15 | "03-01-02-01-01-01-17", 16 | "03-01-03-01-02-02-16", 17 | "03-01-05-01-02-02-16", 18 | "03-01-04-01-02-01-02", 19 | "03-01-04-01-01-02-15", 20 | "03-01-02-01-02-02-15", 21 | "03-01-03-01-02-02-24", 22 | "03-01-02-01-01-02-01", 23 | 
"03-01-06-01-02-01-08", 24 | "03-01-04-01-02-01-03", 25 | "03-01-02-01-02-01-17", 26 | "03-01-08-01-01-01-06", 27 | "03-01-01-01-01-02-01", 28 | "03-01-01-01-02-01-11", 29 | "03-01-05-01-02-01-07", 30 | "03-01-04-01-02-02-08", 31 | "03-01-06-01-01-01-01", 32 | "03-01-03-01-01-01-17", 33 | "03-01-02-01-02-02-17", 34 | "03-01-02-01-01-02-15", 35 | "03-01-06-01-01-01-17", 36 | "03-01-05-01-01-02-05", 37 | "03-01-05-01-02-01-01", 38 | "03-01-02-01-02-01-13", 39 | "03-01-05-01-02-01-19", 40 | "03-01-03-01-01-01-10", 41 | "03-01-08-01-02-02-03", 42 | "03-01-05-01-01-02-11", 43 | "03-01-07-01-01-02-22", 44 | "03-01-03-01-01-02-07", 45 | "03-01-03-01-01-02-22", 46 | "03-01-03-01-01-02-10", 47 | "03-01-01-01-01-02-08", 48 | "03-01-08-01-02-01-15", 49 | "03-01-07-01-01-02-11", 50 | "03-01-07-01-01-01-02", 51 | "03-01-03-01-01-01-16", 52 | "03-01-02-01-01-01-24", 53 | "03-01-06-01-01-01-13", 54 | "03-01-06-01-02-01-17", 55 | "03-01-05-01-02-01-24", 56 | "03-01-03-01-01-02-01", 57 | "03-01-03-01-02-02-02", 58 | "03-01-01-01-01-02-21", 59 | "03-01-04-01-02-02-01", 60 | "03-01-02-01-01-01-12", 61 | "03-01-08-01-01-01-02", 62 | "03-01-06-01-01-02-01", 63 | "03-01-01-01-02-02-12", 64 | "03-01-03-01-01-01-15", 65 | "03-01-01-01-01-01-03", 66 | "03-01-08-01-02-02-01", 67 | "03-01-08-01-02-02-18", 68 | "03-01-04-01-02-02-23", 69 | "03-01-01-01-01-01-19", 70 | "03-01-03-01-02-02-05", 71 | "03-01-05-01-01-02-17", 72 | "03-01-01-01-01-02-24", 73 | "03-01-01-01-02-01-19", 74 | "03-01-08-01-01-02-23", 75 | "03-01-01-01-02-02-24", 76 | "03-01-01-01-01-01-10", 77 | "03-01-03-01-02-02-06", 78 | "03-01-05-01-02-02-24", 79 | "03-01-05-01-02-01-06", 80 | "03-01-02-01-01-01-18", 81 | "03-01-07-01-02-02-12", 82 | "03-01-08-01-01-02-05", 83 | "03-01-07-01-02-02-11", 84 | "03-01-07-01-01-01-12", 85 | "03-01-08-01-01-01-16", 86 | "03-01-07-01-01-01-13", 87 | "03-01-06-01-02-01-02", 88 | "03-01-03-01-02-02-13", 89 | "03-01-07-01-02-01-22", 90 | "03-01-01-01-02-01-08", 91 | "03-01-04-01-02-02-21", 92 | "03-01-01-01-01-01-06", 93 | "03-01-04-01-01-02-13", 94 | "03-01-06-01-01-02-22", 95 | "03-01-08-01-01-02-07", 96 | "03-01-03-01-01-01-05", 97 | "03-01-01-01-01-02-19", 98 | "03-01-08-01-02-01-07", 99 | "03-01-04-01-02-01-18", 100 | "03-01-05-01-02-02-11", 101 | "03-01-01-01-02-01-15", 102 | "03-01-01-01-02-01-22", 103 | "03-01-06-01-01-02-23", 104 | "03-01-03-01-02-01-24", 105 | "03-01-05-01-01-02-12", 106 | "03-01-01-01-02-01-02", 107 | "03-01-06-01-02-02-13", 108 | "03-01-01-01-02-01-23", 109 | "03-01-03-01-02-02-22", 110 | "03-01-07-01-02-01-02", 111 | "03-01-08-01-02-01-03", 112 | "03-01-03-01-01-01-12", 113 | "03-01-03-01-01-01-08", 114 | "03-01-06-01-01-02-03", 115 | "03-01-01-01-02-02-05", 116 | "03-01-04-01-02-01-16", 117 | "03-01-04-01-02-01-01", 118 | "03-01-04-01-02-01-07", 119 | "03-01-07-01-01-01-22", 120 | "03-01-01-01-02-02-11", 121 | "03-01-01-01-02-02-23", 122 | "03-01-01-01-02-02-15", 123 | "03-01-05-01-02-02-10", 124 | "03-01-07-01-02-01-10", 125 | "03-01-04-01-01-01-15", 126 | "03-01-02-01-01-02-21", 127 | "03-01-03-01-01-02-24", 128 | "03-01-07-01-01-01-07", 129 | "03-01-06-01-01-01-06", 130 | "03-01-03-01-02-01-08", 131 | "03-01-06-01-01-02-06", 132 | "03-01-01-01-01-02-03", 133 | "03-01-08-01-02-01-10", 134 | "03-01-06-01-02-01-05", 135 | "03-01-06-01-02-02-05", 136 | "03-01-02-01-02-02-08", 137 | "03-01-03-01-02-02-08", 138 | "03-01-05-01-01-01-08", 139 | "03-01-02-01-02-02-18", 140 | "03-01-06-01-01-02-16", 141 | "03-01-02-01-02-01-18", 142 | "03-01-01-01-02-01-05", 143 | "03-01-01-01-01-01-23", 144 | 
"03-01-01-01-02-02-17", 145 | "03-01-06-01-02-02-11", 146 | "03-01-07-01-02-01-21", 147 | "03-01-05-01-02-01-22", 148 | "03-01-01-01-01-01-16", 149 | "03-01-01-01-01-02-11", 150 | "03-01-04-01-01-01-13", 151 | "03-01-07-01-01-02-24", 152 | "03-01-07-01-02-02-22", 153 | "03-01-02-01-02-01-10", 154 | "03-01-06-01-02-01-23", 155 | "03-01-02-01-01-01-03", 156 | "03-01-04-01-01-02-01", 157 | "03-01-02-01-01-02-18", 158 | "03-01-05-01-02-02-18", 159 | "03-01-08-01-01-02-18", 160 | "03-01-03-01-02-01-19", 161 | "03-01-06-01-02-01-22", 162 | "03-01-06-01-02-01-12", 163 | "03-01-03-01-01-02-03", 164 | "03-01-06-01-01-02-13", 165 | "03-01-06-01-02-02-19", 166 | "03-01-02-01-02-02-07", 167 | "03-01-02-01-01-02-10", 168 | "03-01-03-01-01-02-23", 169 | "03-01-02-01-02-01-01", 170 | "03-01-07-01-01-01-11", 171 | "03-01-06-01-01-01-15", 172 | "03-01-02-01-02-02-12", 173 | "03-01-05-01-02-02-17", 174 | "03-01-02-01-01-01-07", 175 | "03-01-02-01-02-01-03", 176 | "03-01-03-01-02-01-23", 177 | "03-01-06-01-02-01-16", 178 | "03-01-08-01-01-01-07", 179 | "03-01-04-01-01-01-18", 180 | "03-01-07-01-01-02-02", 181 | "03-01-02-01-01-02-24", 182 | "03-01-04-01-02-02-15", 183 | "03-01-06-01-02-02-16", 184 | "03-01-02-01-01-01-08", 185 | "03-01-04-01-01-01-21", 186 | "03-01-01-01-01-02-22", 187 | "03-01-05-01-01-01-07", 188 | "03-01-03-01-01-01-18", 189 | "03-01-04-01-02-01-23", 190 | "03-01-03-01-01-01-07", 191 | "03-01-06-01-02-01-11", 192 | "03-01-06-01-01-02-15", 193 | "03-01-07-01-01-01-10", 194 | "03-01-06-01-02-02-03", 195 | "03-01-03-01-02-01-05", 196 | "03-01-03-01-01-01-11", 197 | "03-01-05-01-02-01-17", 198 | "03-01-02-01-01-01-10", 199 | "03-01-05-01-01-01-05", 200 | "03-01-05-01-01-01-24", 201 | "03-01-07-01-02-02-24", 202 | "03-01-03-01-02-02-07", 203 | "03-01-05-01-01-01-01", 204 | "03-01-05-01-02-01-13", 205 | "03-01-03-01-02-02-21", 206 | "03-01-08-01-01-01-23", 207 | "03-01-02-01-01-01-01", 208 | "03-01-05-01-01-02-02", 209 | "03-01-04-01-01-01-02", 210 | "03-01-07-01-01-02-12", 211 | "03-01-07-01-02-01-18", 212 | "03-01-05-01-01-02-21", 213 | "03-01-08-01-01-02-06", 214 | "03-01-03-01-02-01-06", 215 | "03-01-06-01-02-02-10", 216 | "03-01-01-01-01-01-22", 217 | "03-01-01-01-02-01-03", 218 | "03-01-06-01-01-01-22", 219 | "03-01-03-01-01-02-06", 220 | "03-01-03-01-02-01-01", 221 | "03-01-01-01-02-02-19", 222 | "03-01-01-01-02-02-02", 223 | "03-01-01-01-01-01-11", 224 | "03-01-05-01-01-02-15", 225 | "03-01-07-01-02-02-16", 226 | "03-01-05-01-01-01-23", 227 | "03-01-06-01-01-01-16", 228 | "03-01-03-01-01-01-23", 229 | "03-01-07-01-01-01-17", 230 | "03-01-03-01-01-01-03", 231 | "03-01-05-01-02-01-12", 232 | "03-01-08-01-02-02-06", 233 | "03-01-08-01-01-02-15", 234 | "03-01-01-01-02-01-10", 235 | "03-01-04-01-02-01-15", 236 | "03-01-07-01-02-01-16", 237 | "03-01-04-01-01-02-17", 238 | "03-01-07-01-01-02-13", 239 | "03-01-01-01-02-01-13", 240 | "03-01-03-01-02-01-13", 241 | "03-01-03-01-01-01-13", 242 | "03-01-04-01-02-01-08", 243 | "03-01-01-01-02-02-10", 244 | "03-01-05-01-02-01-11", 245 | "03-01-05-01-01-01-11", 246 | "03-01-01-01-01-02-16", 247 | "03-01-04-01-02-02-16", 248 | "03-01-04-01-01-01-17", 249 | "03-01-02-01-02-02-21", 250 | "03-01-02-01-02-02-23", 251 | "03-01-02-01-02-02-01", 252 | "03-01-06-01-02-02-12", 253 | "03-01-01-01-02-02-21", 254 | "03-01-03-01-01-01-22", 255 | "03-01-05-01-01-02-19", 256 | "03-01-01-01-02-02-18", 257 | "03-01-04-01-02-02-18", 258 | "03-01-05-01-02-02-23", 259 | "03-01-01-01-01-02-23", 260 | "03-01-01-01-02-01-16", 261 | "03-01-07-01-01-02-16", 262 | 
"03-01-03-01-02-01-02", 263 | "03-01-05-01-01-01-19", 264 | "03-01-05-01-02-01-21", 265 | "03-01-04-01-01-02-18", 266 | "03-01-03-01-01-02-13", 267 | "03-01-02-01-02-01-02", 268 | "03-01-02-01-02-02-10", 269 | "03-01-03-01-01-02-12", 270 | "03-01-03-01-02-02-10", 271 | "03-01-03-01-02-02-03", 272 | "03-01-03-01-02-02-19", 273 | "03-01-05-01-01-01-21", 274 | "03-01-03-01-02-01-11", 275 | "03-01-08-01-01-01-19", 276 | "03-01-07-01-02-02-01" 277 | ] 278 | } -------------------------------------------------------------------------------- /clpcnet/assets/text/ravdess/01.txt: -------------------------------------------------------------------------------- 1 | Kids are talking by the door 2 | -------------------------------------------------------------------------------- /clpcnet/assets/text/ravdess/02.txt: -------------------------------------------------------------------------------- 1 | Dogs are sitting by the door 2 | -------------------------------------------------------------------------------- /clpcnet/config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | ############################################################################### 5 | # Configuration 6 | ############################################################################### 7 | 8 | 9 | # Run inference using checkpoint and settings from the original LPCNet 10 | ORIGINAL_LPCNET = False 11 | 12 | # Ablations 13 | ABLATE_CREPE = False 14 | ABLATE_PITCH_REPR = False 15 | ABLATE_SAMPLING = False 16 | ABLATE_SAMPLING_TAIL = False 17 | 18 | # Settings for using original lpcnet checkpoints 19 | if ORIGINAL_LPCNET: 20 | ABLATE_CREPE = True 21 | ABLATE_PITCH_REPR = True 22 | ABLATE_SAMPLING = True 23 | ABLATE_SAMPLING_TAIL = False 24 | 25 | # Directories 26 | ASSETS_DIR = Path(__file__).parent / 'assets' 27 | DATA_DIR = Path(__file__).parent.parent / 'data' 28 | RUNS_DIR = Path(__file__).parent.parent / 'runs' 29 | CACHE_DIR = RUNS_DIR / 'cache' 30 | CHECKPOINT_DIR = RUNS_DIR / 'checkpoints' 31 | EVAL_DIR = RUNS_DIR / 'eval' 32 | LOG_DIR = RUNS_DIR / 'log' 33 | 34 | # Default pretrained checkpoint 35 | if ORIGINAL_LPCNET: 36 | DEFAULT_CHECKPOINT = ASSETS_DIR / 'checkpoints' / 'original.h5' 37 | else: 38 | DEFAULT_CHECKPOINT = ASSETS_DIR / 'checkpoints' / 'model.h5' 39 | 40 | # Pitch representation 41 | PITCH_BINS = 256 42 | FMAX = 550. # Hz 43 | # 63 Hz is hard minimum imposed when using non-uniform pitch bins 44 | FMIN = 63. if ABLATE_PITCH_REPR else 50. 
# Hz 45 | 46 | # DSP parameters 47 | HOPSIZE = 160 # samples 48 | BLOCK_SIZE = 640 # samples 49 | LPC_ORDER = 16 50 | MAX_SAMPLE_VALUE = 32768 51 | PCM_LEVELS = 256 52 | PREEMPHASIS_COEF = 0.85 53 | SAMPLE_RATE = 16000 # Hz 54 | 55 | # Training parameters 56 | AVERAGE_STEPS_PER_EPOCH = 436925 # batches per epoch 57 | BATCH_SIZE = 64 # items per batch 58 | FEATURE_CHUNK_SIZE = 15 # frames per item in batch 59 | LEARNING_RATE = 1e-3 60 | PCM_CHUNK_SIZE = HOPSIZE * FEATURE_CHUNK_SIZE # samples per item in batch 61 | STEPS = 45000000 62 | WEIGHT_DECAY = 5e-5 63 | 64 | # Neural network sizes 65 | SPECTRAL_FEATURE_SIZE = 38 66 | EMBEDDING_SIZE = 128 67 | GRU_A_SIZE = 384 68 | GRU_B_SIZE = 16 69 | 70 | # Number of features on disk 71 | TOTAL_FEATURE_SIZE = 55 72 | 73 | # Feature indices 74 | PITCH_IDX = 36 75 | CORRELATION_IDX = 37 76 | -------------------------------------------------------------------------------- /clpcnet/convert.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | import clpcnet 6 | 7 | 8 | ############################################################################### 9 | # Constants 10 | ############################################################################### 11 | 12 | 13 | # Define these explicitly, as they are used millions of times 14 | INV_LOG_256 = 1. / math.log(256) 15 | LOG_256_OVER_128 = math.log(256) / 128. 16 | LI_TO_MU_SCALE = (clpcnet.PCM_LEVELS - 1.) / clpcnet.MAX_SAMPLE_VALUE 17 | MU_TO_LI_SCALE = clpcnet.MAX_SAMPLE_VALUE / (clpcnet.PCM_LEVELS - 1.) 18 | 19 | 20 | ############################################################################### 21 | # Mulaw encoding and decoding 22 | ############################################################################### 23 | 24 | 25 | def linear_to_mulaw(linear): 26 | """Mu-law encode signal""" 27 | # Convert from [-MAX_SAMPLE_VALUE + 1, MAX_SAMPLE_VALUE] to [-126.5, 126.5] 28 | linear *= LI_TO_MU_SCALE 29 | 30 | # Mu-law encode 31 | mulaw = np.sign(linear) * (128 * np.log(1 + np.abs(linear)) * INV_LOG_256) 32 | 33 | # Shift to [0, 255] 34 | mulaw = np.round(mulaw) + 128 35 | 36 | # Clip 37 | return np.clip(mulaw, 0, 255).astype(np.int16) 38 | 39 | 40 | def mulaw_to_linear(mulaw): 41 | """Decode mu-law signal""" 42 | # Zero-center 43 | mulaw = mulaw.astype(np.int16) - 128 44 | 45 | # Convert to linear 46 | linear = np.sign(mulaw) * (np.exp(np.abs(mulaw) * LOG_256_OVER_128) - 1) 47 | 48 | # Scale to [-MAX_SAMPLE_VALUE + 1, MAX_SAMPLE_VALUE] 49 | return linear * MU_TO_LI_SCALE 50 | 51 | 52 | ############################################################################### 53 | # Pitch representations 54 | ############################################################################### 55 | 56 | 57 | def bins_to_hz(bins, 58 | fmin=clpcnet.FMIN, 59 | fmax=clpcnet.FMAX, 60 | pitch_bins=clpcnet.PITCH_BINS): 61 | logmin, logmax = np.log2(fmin), np.log2(fmax) 62 | 63 | # Scale to base-2 log-space 64 | loghz = bins.astype(float) / (pitch_bins - 1) * (logmax - logmin) + logmin 65 | 66 | # Convert to hz 67 | return 2 ** loghz 68 | 69 | 70 | def epochs_to_bins(epochs, sample_rate=clpcnet.SAMPLE_RATE): 71 | """Convert pitch in normalized pitch epochs to quantized bins""" 72 | return hz_to_bins(epochs_to_hz(epochs, sample_rate)) 73 | 74 | 75 | def epochs_to_hz(epochs, sample_rate=clpcnet.SAMPLE_RATE): 76 | """Convert pitch in normalized pitch epochs to Hz""" 77 | return sample_rate / (50 * epochs + 100.1) 78 | 79 | 80 | def 
epochs_to_length(epochs): 81 | """Convert normalized pitch epochs to samples per period""" 82 | return (50 * epochs + 100.1).astype('int16') 83 | 84 | 85 | def hz_to_bins(hz, 86 | fmin=clpcnet.FMIN, 87 | fmax=clpcnet.FMAX, 88 | pitch_bins=clpcnet.PITCH_BINS): 89 | logmin, logmax = np.log2(fmin), np.log2(fmax) 90 | 91 | # Clip pitch in base-2 log-space 92 | loghz = np.clip(np.log2(hz), logmin, logmax) 93 | 94 | # Scale to 0, 255 95 | return \ 96 | ((loghz - logmin) / (logmax - logmin) * (pitch_bins - 1)).astype(int) 97 | 98 | 99 | def hz_to_epochs(hz, sample_rate=clpcnet.SAMPLE_RATE): 100 | """Convert pitch in Hz to normalized epochs""" 101 | return (sample_rate / hz - 100.1) / 50. 102 | 103 | 104 | def hz_to_length(hz, sample_rate=clpcnet.SAMPLE_RATE): 105 | """Convert pitch in Hz to number of samples per period""" 106 | return (sample_rate / hz).astype('int16') 107 | 108 | 109 | def length_to_epochs(length): 110 | """Convert pitch in number of samples per period to normalized epochs""" 111 | return (length - 100.1) / 50. 112 | 113 | 114 | def length_to_hz(length, sample_rate=clpcnet.SAMPLE_RATE): 115 | """Convert pitch in number of samples per period to Hz""" 116 | return sample_rate / length 117 | 118 | 119 | def seconds_to_frames(seconds): 120 | """Convert time in seconds to number of frames""" 121 | return 1 + int(seconds * clpcnet.SAMPLE_RATE / clpcnet.HOPSIZE) 122 | 123 | 124 | def seconds_to_samples(seconds): 125 | """Convert time in seconds to number of samples""" 126 | return clpcnet.HOPSIZE * seconds_to_frames(seconds) 127 | -------------------------------------------------------------------------------- /clpcnet/data.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import itertools 3 | import json 4 | from pathlib import Path 5 | 6 | import clpcnet 7 | 8 | 9 | ############################################################################### 10 | # Functional interface - file access 11 | ############################################################################### 12 | 13 | 14 | def files(name, directory, partition=None): 15 | """Get audio filenames""" 16 | return resolve(name).files(directory, partition) 17 | 18 | 19 | def partition_file(name): 20 | """Get name of partition file""" 21 | return resolve(name).partition_file() 22 | 23 | 24 | def partitions(name): 25 | """Get split of stems into partitions""" 26 | return resolve(name).partitions() 27 | 28 | 29 | def stems(name, partition=None): 30 | """Get stems""" 31 | return resolve(name).stems(partition) 32 | 33 | 34 | ############################################################################### 35 | # Functional interface - conversions 36 | ############################################################################### 37 | 38 | 39 | def file_to_stem(name, file): 40 | """Convert file to stem""" 41 | return resolve(name).file_to_stem(file) 42 | 43 | 44 | def stem_to_file(name, directory, stem): 45 | """Convert stem to file""" 46 | return resolve(name).stem_to_file(directory, stem) 47 | 48 | 49 | ############################################################################### 50 | # Base dataset 51 | ############################################################################### 52 | 53 | 54 | class Dataset(abc.ABC): 55 | 56 | ########################################################################### 57 | # File access 58 | ########################################################################### 59 | 60 | @classmethod 61 | def files(cls, directory, partition=None): 62 | 
"""Get filenames""" 63 | # Get stems 64 | stems = cls.stems(partition) 65 | 66 | # Convert to files 67 | return [cls.stem_to_file(directory, stem) for stem in stems] 68 | 69 | @classmethod 70 | def partition_file(cls): 71 | """Get name of partition file""" 72 | return clpcnet.ASSETS_DIR / 'partition' / f'{cls.name}.json' 73 | 74 | @classmethod 75 | def partitions(cls): 76 | """Get split of stems into partitions""" 77 | with open(cls.partition_file()) as file: 78 | return json.load(file) 79 | 80 | @classmethod 81 | def stems(cls, partition=None): 82 | """Get stems""" 83 | # Get partitions 84 | partitions = cls.partitions() 85 | 86 | # Return all stems 87 | if partition is None: 88 | return itertools.chain(*partitions.values()) 89 | 90 | # Return stems of a given partition 91 | return partitions[partition] 92 | 93 | ########################################################################### 94 | # Conversions 95 | ########################################################################### 96 | 97 | @staticmethod 98 | @abc.abstractmethod 99 | def file_to_stem(file): 100 | """Convert file to stem""" 101 | pass 102 | 103 | @staticmethod 104 | @abc.abstractmethod 105 | def stem_to_file(directory, stem): 106 | """Convert stem to file""" 107 | pass 108 | 109 | 110 | ############################################################################### 111 | # Datasets 112 | ############################################################################### 113 | 114 | 115 | class Daps(Dataset): 116 | 117 | name = 'daps' 118 | 119 | @staticmethod 120 | def file_to_stem(file): 121 | """Convert daps filename to stem""" 122 | return file.stem[:-4] 123 | 124 | @staticmethod 125 | def stem_to_file(directory, stem): 126 | """Convert daps stem to file""" 127 | return Path(directory, 'clean', f'{stem}.wav') 128 | 129 | 130 | class DapsSegmented(Dataset): 131 | 132 | name = 'daps-segmented' 133 | 134 | @staticmethod 135 | def file_to_stem(file): 136 | """Convert daps-segmented filename to stem""" 137 | return file.stem 138 | 139 | @staticmethod 140 | def stem_to_file(directory, stem): 141 | """Convert daps-segmented stem to filen""" 142 | return Path(directory, f'{stem[:-6]}-sentences', f'{stem}.wav') 143 | 144 | 145 | class RavdessHifi(Dataset): 146 | 147 | name = 'ravdess-hifi' 148 | 149 | @staticmethod 150 | def file_to_stem(file): 151 | """Convert ravdess filename to stem""" 152 | return file.stem 153 | 154 | @staticmethod 155 | def stem_to_file(directory, stem): 156 | """Convert ravdess stem to filename""" 157 | return directory / f'Actor_{stem[-2:]}' / f'{stem}.wav' 158 | 159 | 160 | class RavdessVariable(RavdessHifi): 161 | 162 | name = 'ravdess-variable' 163 | 164 | 165 | class Vctk(Dataset): 166 | 167 | name = 'vctk' 168 | 169 | @staticmethod 170 | def file_to_stem(file): 171 | """Convert vctk filename to stem""" 172 | return file.stem[:-5] 173 | 174 | @staticmethod 175 | def stem_to_file(directory, stem): 176 | """Convert vctk stem to file""" 177 | return Path(directory, 178 | 'wav48_silence_trimmed', 179 | stem.split('_')[0], 180 | f'{stem}_mic2.flac') 181 | 182 | 183 | ############################################################################### 184 | # Utilities 185 | ############################################################################### 186 | 187 | 188 | def resolve(name): 189 | """Resolve name of dataset to static template""" 190 | if name == 'daps': 191 | return Daps 192 | elif name == 'daps-segmented': 193 | return DapsSegmented 194 | elif name == 'ravdess-hifi': 195 | return RavdessHifi 
196 | elif name == 'ravdess-variable': 197 | return RavdessVariable 198 | elif name == 'vctk': 199 | return Vctk 200 | raise ValueError(f'Dataset {name} is not defined') 201 | -------------------------------------------------------------------------------- /clpcnet/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from . import dtw 2 | from . import duration 3 | from . import pitch 4 | from . import plot 5 | from . import prosody 6 | -------------------------------------------------------------------------------- /clpcnet/evaluate/dtw.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import scipy.interpolate 4 | 5 | import clpcnet 6 | 7 | 8 | ############################################################################### 9 | # Dtw alignment evaluation 10 | ############################################################################### 11 | 12 | 13 | def from_files(source_files, target_files): 14 | """Evaluate dtw alignment score""" 15 | dtw = DtwAlignmentScore() 16 | 17 | # Evaluate each pair of files 18 | for source_file, target_file in zip(source_files, target_files): 19 | dtw.update(clpcnet.load.audio(source_file), 20 | clpcnet.load.audio(target_file)) 21 | 22 | # Compute aggregate dtw alignment score over files 23 | return dtw() 24 | 25 | 26 | class DtwAlignmentScore: 27 | """Batch update dtw alignment score""" 28 | 29 | def __init__(self): 30 | self.distance_sum = 0. 31 | self.count = 0 32 | 33 | def __call__(self): 34 | """Return the mean per-step cosine distance along the alignment path""" 35 | distance = self.distance_sum / self.count 36 | return distance 37 | 38 | def update(self, source, target): 39 | """Compute the dtw alignment score""" 40 | # Compute mel features 41 | source_feats = features(source) 42 | target_feats = features(target) 43 | 44 | # Resample target features 45 | interp_fn = scipy.interpolate.interp1d( 46 | np.arange(target_feats.shape[1]), 47 | target_feats) 48 | target_feats_interp = interp_fn( 49 | np.linspace(0, target_feats.shape[1] - 1, source_feats.shape[1])) 50 | 51 | # Perform alignment 52 | D, path = librosa.sequence.dtw(target_feats_interp + 1e-10, 53 | source_feats + 1e-10, 54 | backtrack=True, 55 | metric='cosine') 56 | 57 | # Update metrics 58 | self.distance_sum += D[path[0, 0], path[0, 1]] 59 | self.count += len(path) 60 | 61 | 62 | ############################################################################### 63 | # Utilities 64 | ############################################################################### 65 | 66 | 67 | def features(audio): 68 | """Compute spectral features to use for dtw alignment""" 69 | # Compute mfcc without energy 70 | return librosa.feature.mfcc(audio, sr=clpcnet.SAMPLE_RATE)[1:] 71 | 72 | -------------------------------------------------------------------------------- /clpcnet/evaluate/duration.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pypar 3 | 4 | 5 | ############################################################################### 6 | # Duration evaluation 7 | ############################################################################### 8 | 9 | 10 | def from_files(source_files, target_files): 11 | """Evaluate phoneme duration rmse""" 12 | rmse = DurationRmse() 13 | 14 | # Evaluate each pair of files 15 | for source_file, target_file in zip(source_files, target_files): 16 | source = pypar.Alignment(source_file) 17 | 
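# pypar.Alignment is assumed here to load a saved forced alignment from
# disk (e.g., the alignment JSON written by alignment.save in
# clpcnet.evaluate.gather)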
target = pypar.Alignment(target_file) 18 | rmse.update(source, target) 19 | 20 | # Compute aggregate rmse over files 21 | return rmse() 22 | 23 | 24 | class DurationRmse: 25 | """Batch update rmse""" 26 | 27 | def __init__(self): 28 | self.sum = 0. 29 | self.count = 0 30 | 31 | def __call__(self): 32 | """Return the rmse value""" 33 | return np.sqrt(self.sum / self.count) 34 | 35 | def update(self, source, target): 36 | """Compute the rmse of the phoneme durations""" 37 | source_durations = np.array([p.duration() for p in source.phonemes()]) 38 | target_durations = np.array([p.duration() for p in target.phonemes()]) 39 | source_mask, target_mask = self.mask(source.phonemes(), 40 | target.phonemes()) 41 | 42 | # First and last are very often long silences with large error 43 | differences = source_durations[source_mask][1:-1] - \ 44 | target_durations[target_mask][1:-1] 45 | 46 | self.sum += (differences ** 2).sum() 47 | self.count += differences.size 48 | 49 | @staticmethod 50 | def mask(source, target): 51 | """Retrieve the mask over values to use for evaluation""" 52 | source_mask = np.full(len(source), True) 53 | target_mask = np.full(len(target), True) 54 | 55 | source_idx, target_idx = 0, 0 56 | while source_idx < len(source) or target_idx < len(target): 57 | 58 | # Handle only one alignment ending with silence 59 | if target_idx >= len(target): 60 | source_mask[source_idx] = False 61 | source_idx += 1 62 | continue 63 | if source_idx >= len(source): 64 | target_mask[target_idx] = False 65 | target_idx += 1 66 | continue 67 | 68 | # Phonemes match 69 | if str(source[source_idx]) == str(target[target_idx]): 70 | source_idx += 1 71 | target_idx += 1 72 | continue 73 | 74 | # Phonemes don't match 75 | if str(source[source_idx]) == pypar.SILENCE: 76 | source_mask[source_idx] = False 77 | source_idx += 1 78 | elif str(target[target_idx]) == pypar.SILENCE: 79 | target_mask[target_idx] = False 80 | target_idx += 1 81 | else: 82 | raise ValueError('Phonemes don\'t match!') 83 | 84 | return source_mask, target_mask 85 | -------------------------------------------------------------------------------- /clpcnet/evaluate/gather.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import json 4 | import random 5 | import shutil 6 | from pathlib import Path 7 | 8 | import numpy as np 9 | import pyfoal 10 | import soundfile 11 | import tqdm 12 | 13 | import clpcnet 14 | 15 | 16 | ############################################################################### 17 | # Gather 18 | ############################################################################### 19 | 20 | 21 | def from_files_to_files(examples, output_directory, gpu=None): 22 | """Gather files for evaluation""" 23 | output_directory.mkdir(exist_ok=True, parents=True) 24 | iterator = tqdm.tqdm(examples, 25 | desc='Setting up evaluation directory', 26 | dynamic_ncols=True) 27 | for example in iterator: 28 | stem = f'{example.stem}-{"" if example.seen else "un"}seen' 29 | prefix = output_directory / stem 30 | 31 | # Copy audio 32 | dst_audio_file = f'{prefix}.wav' 33 | soundfile.write(dst_audio_file, 34 | clpcnet.load.audio(example.audio_file), 35 | clpcnet.SAMPLE_RATE) 36 | 37 | # Estimate pitch 38 | pitch, periodicity = clpcnet.pitch.from_file(example.audio_file, gpu) 39 | 40 | # Scale pitch 41 | low = np.clip(.71 * pitch, clpcnet.FMIN, clpcnet.FMAX) 42 | high = np.clip(1.41 * pitch, clpcnet.FMIN, clpcnet.FMAX) 43 | 44 | # Save original pitch and periodicity and scaled 
pitch 45 | np.save(f'{prefix}-pitch.npy', pitch) 46 | np.save(f'{prefix}-ps-71-pitch.npy', low) 47 | np.save(f'{prefix}-ps-141-pitch.npy', high) 48 | np.save(f'{prefix}-periodicity.npy', periodicity) 49 | 50 | if example.text_file is not None: 51 | # Copy text 52 | dst_text_file = f'{prefix}.txt' 53 | shutil.copy2(example.text_file, dst_text_file) 54 | 55 | # Force align 56 | alignment = pyfoal.from_file(str(dst_text_file), str(dst_audio_file)) 57 | 58 | # Time-stretch alignment by constant factor 59 | slow = stretch_alignment(alignment, .5) 60 | fast = stretch_alignment(alignment, 2.) 61 | 62 | # Save alignments 63 | alignment.save(f'{prefix}.json') 64 | slow.save(f'{prefix}-ts-50.json') 65 | fast.save(f'{prefix}-ts-200.json') 66 | 67 | 68 | ############################################################################### 69 | # Example selection 70 | ############################################################################### 71 | 72 | 73 | def daps(directory, stems): 74 | """Select evaluation examples from daps""" 75 | selected = [stem for stem in stems if '_script5_' in str(stem) 76 | if int(stem.split('_')[0][1:]) < 5] 77 | return [Example(stem, directory / 'clean' / f'{stem}.wav', None, False) 78 | for stem in selected] 79 | 80 | 81 | def daps_segmented(directory, stems): 82 | """Select evaluation examples from daps""" 83 | examples = [] 84 | for stem in stems: 85 | # Get files 86 | audio_file = clpcnet.data.stem_to_file('daps-segmented', 87 | directory, 88 | stem) 89 | text_file = audio_file.with_suffix('.txt') 90 | 91 | # Create example 92 | examples.append(Example(stem, audio_file, text_file, False)) 93 | 94 | return examples 95 | 96 | 97 | def ravdess_hifi(directory, stems): 98 | """Select evaluation samples from ravdess""" 99 | # Get deterministic but random set of stems 100 | random.seed(0) 101 | stems = random.sample(stems, 100) 102 | 103 | # Create examples 104 | text_directory = clpcnet.ASSETS_DIR / 'text' / 'ravdess' 105 | return [Example(stem, 106 | clpcnet.data.stem_to_file('ravdess-hifi', directory, stem), 107 | text_directory / f'{stem.split("-")[4]}.txt', 108 | False) 109 | for stem in stems] 110 | 111 | 112 | def vctk(directory, unseen, seen): 113 | """Select evaluation examples from vctk""" 114 | # Load speaker info 115 | with open(directory / 'speaker-info.txt') as file: 116 | lines = file.readlines() 117 | speakers = [clpcnet.partition.VCTKSpeaker(line) for line in lines[1:]] 118 | speakers = [s for s in speakers if s.id != 'p362'] 119 | 120 | # Pick a few speakers 121 | random.seed(0) 122 | unseen_speakers = sample_speakers(speakers, unseen) 123 | seen_speakers = sample_speakers(speakers, seen) 124 | 125 | # For each speaker, pick a few files 126 | selected = [] 127 | stems = unseen + seen 128 | for speaker in unseen_speakers + seen_speakers: 129 | speaker_stems = [s for s in stems if speaker in s] 130 | selected.extend(random.sample(speaker_stems, 4)) 131 | 132 | # Get absolute paths to audio and text files 133 | audio_directory = directory / 'wav48_silence_trimmed' 134 | audio_files = [audio_directory / s.split('_')[0] / f'{s}_mic2.flac' 135 | for s in selected] 136 | text_directory = directory / 'txt' 137 | text_files = [text_directory / s.split('_')[0] / f'{s}.txt' 138 | for s in selected] 139 | 140 | iterator = enumerate(zip(selected, audio_files, text_files)) 141 | return [Example(s, audio, text, i >= len(selected) // 2) 142 | for i, (s, audio, text) in iterator] 143 | 144 | 145 | 
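# A minimal sketch of the constant-factor stretching performed above,
# assuming pypar's duration() / update() API used by stretch_alignment
# below (paths are hypothetical):
#
#     alignment = pyfoal.from_file('speech.txt', 'speech.wav')
#     slow = stretch_alignment(alignment, .5)  # durations doubled -> '-ts-50'
#     fast = stretch_alignment(alignment, 2.)  # durations halved -> '-ts-200'
#
# Each phoneme duration is scaled by 1 / rate, so '-ts-50' plays at half
# speed and '-ts-200' at double speed.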
############################################################################### 146 | # Utilities 147 | ############################################################################### 148 | 149 | 150 | class Example: 151 | 152 | def __init__(self, stem, audio_file, text_file, seen): 153 | self.stem = stem 154 | self.audio_file = audio_file 155 | self.text_file = text_file 156 | self.seen = seen 157 | 158 | 159 | def sample_speakers(speakers, stems, n=8): 160 | """Sample from a list of speakers""" 161 | # Get relevant speakers 162 | relevant = set(s.split('_')[0] for s in stems) 163 | stem_speakers = [s for s in speakers if s.id in relevant] 164 | 165 | # Shuffle 166 | random.shuffle(stem_speakers) 167 | 168 | # Pick first n // 2 of each gender 169 | male = [s.id for s in stem_speakers if s.gender == 'M'] 170 | female = [s.id for s in stem_speakers if s.gender == 'F'] 171 | return male[:n // 2] + female [:n // 2] 172 | 173 | 174 | def stretch_alignment(alignment, rate): 175 | """Stretch the alignment by the given rate""" 176 | alignment = copy.deepcopy(alignment) 177 | durations = [(1. / rate) * p.duration() for p in alignment.phonemes()] 178 | alignment.update(durations=durations) 179 | return alignment 180 | 181 | 182 | ############################################################################### 183 | # Entry point 184 | ############################################################################### 185 | 186 | 187 | def main(): 188 | """Create a directory of files for evaluation""" 189 | # Parse command-line arguments 190 | args = parse_args() 191 | 192 | # Get test partition stems 193 | partition_file = clpcnet.ASSETS_DIR / 'partition' / f'{args.dataset}.json' 194 | with open(partition_file) as file: 195 | partition = json.load(file) 196 | 197 | # Get paths to selected examples 198 | if args.dataset == 'daps': 199 | test_unseen = partition['test'] 200 | examples = daps(args.directory, test_unseen) 201 | elif args.dataset == 'daps-segmented': 202 | test_unseen = partition['test'] 203 | examples = daps_segmented(args.directory, test_unseen) 204 | elif args.dataset == 'ravdess-hifi': 205 | test_unseen = partition['test'] 206 | examples = ravdess_hifi(args.directory, test_unseen) 207 | elif args.dataset == 'vctk': 208 | test_unseen, test_seen = partition['test'], partition['test-seen'] 209 | examples = vctk(args.directory, test_unseen, test_seen) 210 | else: 211 | raise ValueError(f'No dataset {args.dataset}') 212 | 213 | # Gather files for evaluation 214 | from_files_to_files(examples, args.output_directory, args.gpu) 215 | 216 | 217 | def parse_args(): 218 | """Parse command-line arguments""" 219 | parser = argparse.ArgumentParser() 220 | parser.add_argument( 221 | '--dataset', 222 | default='vctk', 223 | help='The dataset to gather evaluation files from') 224 | parser.add_argument( 225 | '--directory', 226 | type=Path, 227 | default=clpcnet.DATA_DIR, 228 | help='The root directory of the dataset') 229 | parser.add_argument( 230 | '--output_directory', 231 | type=Path, 232 | default=clpcnet.EVAL_DIR / 'objective' / 'constant', 233 | help='The output evaluation directory') 234 | parser.add_argument( 235 | '--gpu', 236 | type=int, 237 | default=None, 238 | help='The gpu to use for pitch estimation') 239 | 240 | # Extend directories with dataset name 241 | args = parser.parse_args() 242 | args.directory = args.directory / args.dataset 243 | args.output_directory = args.output_directory / args.dataset / 'data' 244 | 245 | return args 246 | 247 | 248 | if __name__ == '__main__': 249 | 
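# Generation for each pair is dispatched to a worker process; all tasks
# complete before pitch estimation runs, via pool.close() / pool.join()
# below.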
main() 250 | -------------------------------------------------------------------------------- /clpcnet/evaluate/objective/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/clpcnet/evaluate/objective/__init__.py -------------------------------------------------------------------------------- /clpcnet/evaluate/objective/variable.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import multiprocessing as mp 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pyfoal 8 | import soundfile 9 | import torch 10 | import tqdm 11 | 12 | import clpcnet 13 | 14 | 15 | ############################################################################### 16 | # Constants 17 | ############################################################################### 18 | 19 | 20 | DEFAULT_DIRECTORY = clpcnet.DATA_DIR / 'ravdess-hifi' 21 | 22 | 23 | ############################################################################### 24 | # Variable-rate pitch shifting 25 | ############################################################################### 26 | 27 | 28 | def evaluate(directory=DEFAULT_DIRECTORY, 29 | run='clpcnet', 30 | checkpoint=clpcnet.DEFAULT_CHECKPOINT, 31 | gpu=None): 32 | """Evaluate variable-rate pitch shifting on ravdess""" 33 | # Get list of examples to generate 34 | with open(clpcnet.data.partition_file('ravdess-variable')) as file: 35 | pairs = json.load(file)['test'] 36 | 37 | # Setup output directory 38 | output_directory = clpcnet.EVAL_DIR / \ 39 | 'objective' / \ 40 | 'variable' / \ 41 | 'ravdess-hifi' / \ 42 | run 43 | output_directory.mkdir(exist_ok=True, parents=True) 44 | 45 | # Setup multiprocessing 46 | pool = mp.get_context('spawn').Pool() 47 | 48 | # Iterate over pairs 49 | iterator = tqdm.tqdm( 50 | pairs, 51 | total=len(pairs), 52 | dynamic_ncols=True, 53 | desc='Generating variable-ratio examples') 54 | for pair in iterator: 55 | 56 | # Load text 57 | statement = pair[0].split('-')[4] 58 | text_file = clpcnet.ASSETS_DIR / 'text' / 'ravdess' / f'{statement}.txt' 59 | with open(text_file) as file: 60 | text = file.read() 61 | 62 | # Load audio 63 | source_file = clpcnet.data.stem_to_file('ravdess-variable', 64 | directory, 65 | pair[0]) 66 | target_file = clpcnet.data.stem_to_file('ravdess-variable', 67 | directory, 68 | pair[1]) 69 | source = clpcnet.load.audio(source_file) 70 | target = clpcnet.load.audio(target_file) 71 | 72 | # Compute pitch 73 | source_pitch, source_periodicity = clpcnet.pitch.from_audio(source, gpu) 74 | target_pitch, target_periodicity = clpcnet.pitch.from_audio(target, gpu) 75 | 76 | # Compute alignment 77 | source_alignment = pyfoal.align(text, source, clpcnet.SAMPLE_RATE) 78 | target_alignment = pyfoal.align(text, target, clpcnet.SAMPLE_RATE) 79 | 80 | # Align periodicity for evaluation 81 | aligned_periodicity = clpcnet.pitch.align(target_periodicity, 82 | source_periodicity, 83 | target_alignment, 84 | source_alignment) 85 | 86 | # Output file prefix 87 | prefix = output_directory / f'{pair[0]}_{pair[1]}' 88 | output_file = prefix.parent / (prefix.stem + '-transfer.wav') 89 | 90 | # Perform pitch shifting 91 | args = (output_file, source) 92 | kwargs = {'source_alignment': source_alignment, 93 | 'target_alignment': target_alignment, 94 | 'target_pitch': target_pitch, 95 | 'checkpoint_file': checkpoint, 96 | 'verbose': False} 97 | 
pool.apply_async(clpcnet.to_file, args, kwargs) 98 | # clpcnet.to_file(*args, **kwargs) 99 | 100 | # Save stuff 101 | np.save(prefix.parent / (prefix.stem + '-source.npy'), source_pitch) 102 | np.save(prefix.parent / (prefix.stem + '-target.npy'), target_pitch) 103 | np.save(prefix.parent / (prefix.stem + '-aligned.npy'), aligned_periodicity) 104 | np.save(prefix.parent / (prefix.stem + '-sourceharm.npy'), 105 | source_periodicity) 106 | np.save(prefix.parent / (prefix.stem + '-targetharm.npy'), 107 | target_periodicity) 108 | source_alignment.save(prefix.parent / (prefix.stem + '-source.json')) 109 | target_alignment.save(prefix.parent / (prefix.stem + '-target.json')) 110 | with open(prefix.with_suffix('.txt'), 'w') as file: 111 | file.write(text) 112 | soundfile.write(f'{prefix}-source.wav', source, clpcnet.SAMPLE_RATE) 113 | soundfile.write(f'{prefix}-target.wav', target, clpcnet.SAMPLE_RATE) 114 | 115 | # Close multiprocessing pool and wait for processes to finish 116 | pool.close() 117 | pool.join() 118 | 119 | # Pitch estimation 120 | files = list(output_directory.glob('*-transfer.wav')) 121 | prefixes = [f.parent / f.stem for f in files] 122 | clpcnet.pitch.from_files_to_files(files, prefixes, gpu) 123 | 124 | # Forced alignment 125 | pyfoal.from_files_to_files( 126 | [f.parent / (f.stem[:-9] + '.txt') for f in files], 127 | files, 128 | [f.with_suffix('.json') for f in files]) 129 | 130 | # Get pitch files to evaluate 131 | source_pitch_files = sorted(output_directory.glob('*-pitch.npy')) 132 | target_pitch_files = sorted(output_directory.glob('*-target.npy')) 133 | source_periodicity_files = sorted(output_directory.glob('*-periodicity.npy')) 134 | target_periodicity_files = sorted(output_directory.glob('*-aligned.npy')) 135 | 136 | # Evaluate pitch 137 | rmse, precision, recall, f1, gpe_20, gpe_50, hist = \ 138 | clpcnet.evaluate.pitch.from_files(source_pitch_files, 139 | target_pitch_files, 140 | source_periodicity_files, 141 | target_periodicity_files) 142 | run_results = { 143 | 'f1': f1, 144 | 'precision': precision, 145 | 'recall': recall, 146 | 'rmse_cents': rmse, 147 | 'gpe_20': gpe_20, 148 | 'gpe_50': gpe_50} 149 | hist_file = output_directory / f'{run}.png' 150 | clpcnet.evaluate.plot.write_histogram(hist_file, hist) 151 | 152 | # Evaluate time-stretch 153 | duration_dict = {} 154 | 155 | # Get duration files to evaluate 156 | source_duration_files = sorted(output_directory.glob('*-transfer.json')) 157 | target_duration_files = sorted(output_directory.glob('*-target.json')) 158 | 159 | # Forced alignment rmse metric 160 | duration_rmse = clpcnet.evaluate.duration.from_files( 161 | source_duration_files, target_duration_files) 162 | duration_dict['force-align'] = {'rmse_seconds': duration_rmse} 163 | 164 | # DTW metrics 165 | source_audio_files = [f.with_suffix('.wav') for f in source_duration_files] 166 | target_audio_files = [f.with_suffix('.wav') for f in target_duration_files] 167 | distance = clpcnet.evaluate.dtw.from_files(source_audio_files, 168 | target_audio_files) 169 | duration_dict['dtw'] = {'distance': distance} 170 | run_results.update(duration_dict) 171 | 172 | # Load results file 173 | try: 174 | with open(output_directory / 'results.json') as file: 175 | results = json.load(file) 176 | except FileNotFoundError: 177 | results = {} 178 | 179 | # Update results 180 | results[run] = run_results 181 | 182 | # Write results 183 | with open(output_directory / 'results.json', 'w') as file: 184 | json.dump(results, file, indent=4) 185 | 186 | 187 | 
############################################################################### 188 | # Entry point 189 | ############################################################################### 190 | 191 | 192 | def parse_args(): 193 | """Parse command-line arguments""" 194 | parser = argparse.ArgumentParser() 195 | parser.add_argument( 196 | '--directory', 197 | type=Path, 198 | default=DEFAULT_DIRECTORY, 199 | help='Root directory of the ravdess dataset') 200 | parser.add_argument( 201 | '--run', 202 | default='clpcnet', 203 | help='The evaluation run') 204 | parser.add_argument( 205 | '--checkpoint', 206 | type=Path, 207 | default=clpcnet.DEFAULT_CHECKPOINT, 208 | help='The model checkpoint') 209 | parser.add_argument( 210 | '--gpu', 211 | type=int, 212 | default=None, 213 | help='The index of the gpu to use') 214 | return parser.parse_args() 215 | 216 | 217 | if __name__ == '__main__': 218 | evaluate(**vars(parse_args())) 219 | -------------------------------------------------------------------------------- /clpcnet/evaluate/pitch.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import torch 5 | import torchcrepe 6 | import tqdm 7 | 8 | import clpcnet 9 | 10 | 11 | ############################################################################### 12 | # Pitch evaluation 13 | ############################################################################### 14 | 15 | 16 | def from_files(source_pitch_files, 17 | target_pitch_files, 18 | source_periodicity_files, 19 | target_periodicity_files): 20 | """Evaluate pitch rmse in voiced regions and f1 of voiced/unvoiced""" 21 | metrics = PitchMetrics() 22 | 23 | # Voiced/unvoiced thresholding fn 24 | threshold = torchcrepe.threshold.Hysteresis() 25 | 26 | # Evaluate each pair of files 27 | iterator = zip(source_pitch_files, 28 | target_pitch_files, 29 | source_periodicity_files, 30 | target_periodicity_files) 31 | iterator = tqdm.tqdm(iterator, desc='Evaluating pitch', dynamic_ncols=True) 32 | for source_pitch_file, \ 33 | target_pitch_file, \ 34 | source_periodicity_file, \ 35 | target_periodicity_file in iterator: 36 | 37 | # Load files 38 | source_pitch = np.load(source_pitch_file) 39 | target_pitch = np.load(target_pitch_file) 40 | source_periodicity = np.load(source_periodicity_file) 41 | target_periodicity = np.load(target_periodicity_file) 42 | 43 | # Convert to torch 44 | source_pitch = torch.tensor(source_pitch)[None] 45 | target_pitch = torch.tensor(target_pitch)[None] 46 | source_periodicity = torch.tensor(source_periodicity)[None] 47 | target_periodicity = torch.tensor(target_periodicity)[None] 48 | 49 | # Threshold 50 | source = threshold(source_pitch, source_periodicity) 51 | target = threshold(target_pitch, target_periodicity) 52 | 53 | # Bound pitch 54 | source = torch.clamp(source, clpcnet.FMIN, clpcnet.FMAX) 55 | target = torch.clamp(target, clpcnet.FMIN, clpcnet.FMAX) 56 | 57 | # Compute metrics 58 | metrics.update(source, target) 59 | 60 | # Compute aggregate metrics over files 61 | return metrics() 62 | 63 | 64 | class PitchMetrics: 65 | """Batch update pitch metrics""" 66 | 67 | gpe_20_threshold = 20. / 1200. 68 | gpe_50_threshold = 50. / 1200. 69 | 70 | def __init__(self, gpe_threshold=50): 71 | self.true_positives = 0 72 | self.false_positives = 0 73 | self.false_negatives = 0 74 | self.sum = 0. 
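        # Gross pitch error (GPE) counters; the class-level thresholds above
        # are 20 and 50 cents expressed in octaves (1200 cents per octave),
        # matching the base-2 log-pitch differences accumulated in update()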
75 | self.gpe_20_count = 0 76 | self.gpe_50_count = 0 77 | self.count = 0 78 | self.differences = [] 79 | 80 | def __call__(self): 81 | """Compute the aggregate rmse, precision, recall, and f1""" 82 | precision = \ 83 | self.true_positives / (self.true_positives + self.false_positives) 84 | recall = \ 85 | self.true_positives / (self.true_positives + self.false_negatives) 86 | f1 = 2 * precision * recall / (precision + recall) 87 | rmse = 1200 * math.sqrt(self.sum / self.count) 88 | gpe_20 = self.gpe_20_count / self.count 89 | gpe_50 = self.gpe_50_count / self.count 90 | differences = 1200 * torch.cat(self.differences) 91 | return rmse, precision, recall, f1, gpe_20, gpe_50, differences 92 | 93 | def update(self, source, target): 94 | """Update the rmse, precision, recall, and f1""" 95 | source_voiced = ~torch.isnan(source) 96 | target_voiced = ~torch.isnan(target) 97 | overlap = source_voiced & target_voiced 98 | differences = torch.log2(source[overlap]) - torch.log2(target[overlap]) 99 | self.true_positives += overlap.sum().item() 100 | self.false_positives += (~source_voiced & target_voiced).sum().item() 101 | self.false_negatives += (source_voiced & ~target_voiced).sum().item() 102 | self.sum += (differences ** 2).sum().item() 103 | self.gpe_20_count += (differences.abs() > self.gpe_20_threshold).sum().item() 104 | self.gpe_50_count += (differences.abs() > self.gpe_50_threshold).sum().item() 105 | self.count += source.numel() 106 | self.differences.append(differences) 107 | -------------------------------------------------------------------------------- /clpcnet/evaluate/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def write_histogram(file, histogram): 5 | """Create and write pitch histogram""" 6 | plt.figure() 7 | plt.hist(histogram.numpy(), bins=50) 8 | plt.title('Log pitch error distribution in voiced regions') 9 | plt.xlabel('Log pitch deviation') 10 | plt.ylabel('Frequency') 11 | plt.savefig(file) 12 | plt.close() 13 | -------------------------------------------------------------------------------- /clpcnet/evaluate/prosody.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import random 3 | import warnings 4 | 5 | import numpy as np 6 | import pyfoal 7 | import pypar 8 | import soundfile 9 | 10 | import clpcnet 11 | 12 | 13 | ############################################################################### 14 | # Prosody transfer representation 15 | ############################################################################### 16 | 17 | 18 | class ProsodyTransfer: 19 | """Representation for a prosody transfer task""" 20 | 21 | def __init__(self, name, text, source_audio, target_audio, gpu=None): 22 | self.name = name 23 | self.text = text 24 | self.source_audio = source_audio 25 | self.target_audio = target_audio 26 | self.gpu = gpu 27 | 28 | def is_valid(self): 29 | """Check if phoneme alignments match""" 30 | # Get phoneme alignments 31 | source = self.source_alignment() 32 | target = self.target_alignment() 33 | 34 | # Get phonemes 35 | source_phonemes = source.phonemes() 36 | target_phonemes = target.phonemes() 37 | 38 | # Length and phoneme checks 39 | iterator = zip(source_phonemes, target_phonemes) 40 | if len(source_phonemes) != len(target_phonemes) or \ 41 | any([str(s) != str(t) for s, t in iterator]): 42 | return False 43 | 44 | # Relative rate check 45 | rates = np.array(pypar.compare.per_phoneme_rate(source, target)) 
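        # Reject pairs in which any phoneme's relative duration ratio falls
        # outside [0.25, 4]; such pairs would require extreme time-stretching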
46 | if any(rates > 4.) or any(rates < .25): 47 | return False 48 | 49 | # Get pitch 50 | source_pitch, source_harm = self.source_pitch(return_periodicity=True) 51 | target_pitch = self.target_pitch() 52 | 53 | # Invert target 54 | aligned_pitch = clpcnet.pitch.align(source_pitch, 55 | target_pitch, 56 | source, 57 | target) 58 | 59 | # Threshold 60 | source_pitch = clpcnet.pitch.threshold(source_pitch, source_harm) 61 | 62 | # Extract voiced 63 | voiced = ~np.isnan(source_pitch) 64 | source = source_pitch[voiced] 65 | target = aligned_pitch[voiced] 66 | 67 | # Pitch range check 68 | if any(source > 400) or any(source < 65) or \ 69 | any(target > 400) or any(target < 65): 70 | return False 71 | 72 | # Pitch shift range check 73 | rate = np.abs(target / source) 74 | return all(rate <= 2.5) and all(rate >= .4) 75 | 76 | 77 | @classmethod 78 | def from_file(cls, prefix, gpu=None): 79 | """Load from disk""" 80 | # Load text 81 | with open(prefix.with_suffix('.txt')) as file: 82 | text = file.read() 83 | 84 | # Load audio 85 | source_audio = clpcnet.load.audio( 86 | prefix.parent / (prefix.stem + '-source.wav')) 87 | target_audio = clpcnet.load.audio( 88 | prefix.parent / (prefix.stem + '-target.wav')) 89 | 90 | # Make transfer 91 | return cls(prefix.stem, text, source_audio, target_audio, gpu) 92 | 93 | def save(self, directory): 94 | """Save audio files to directory""" 95 | prefix = directory / f'{self.name}' 96 | 97 | # Save text 98 | with open(prefix.parent / (prefix.stem + '.txt'), 'w') as file: 99 | file.write(self.text) 100 | 101 | # Save audio 102 | soundfile.write(prefix.parent / (prefix.stem + '-source.wav'), 103 | self.source_audio, 104 | clpcnet.SAMPLE_RATE) 105 | soundfile.write(prefix.parent / (prefix.stem + '-target.wav'), 106 | self.target_audio, 107 | clpcnet.SAMPLE_RATE) 108 | 109 | def source_alignment(self): 110 | """Retrieve the source alignment""" 111 | if not hasattr(self, '_source_alignment'): 112 | self._source_alignment = pyfoal.align(self.text, 113 | self.source_audio, 114 | clpcnet.SAMPLE_RATE) 115 | return self._source_alignment 116 | 117 | def source_pitch(self, return_periodicity=False): 118 | """Retrieve the source pitch""" 119 | if not hasattr(self, '_source_pitch'): 120 | self._source_pitch = clpcnet.pitch.from_audio(self.source_audio, 121 | self.gpu) 122 | return \ 123 | self._source_pitch if return_periodicity else self._source_pitch[0] 124 | 125 | def target_alignment(self): 126 | """Retrieve the target alignment""" 127 | if not hasattr(self, '_target_alignment'): 128 | self._target_alignment = pyfoal.align(self.text, 129 | self.target_audio, 130 | clpcnet.SAMPLE_RATE) 131 | return self._target_alignment 132 | 133 | def target_pitch(self, return_periodicity=False): 134 | """Retrieve the target pitch""" 135 | if not hasattr(self, '_target_pitch'): 136 | self._target_pitch = clpcnet.pitch.from_audio(self.target_audio, 137 | self.gpu) 138 | return \ 139 | self._target_pitch if return_periodicity else self._target_pitch[0] 140 | 141 | 142 | ############################################################################### 143 | # Dataset generators 144 | ############################################################################### 145 | 146 | 147 | def ravdess_generator(directory, gpu=None): 148 | """Generator over examples in ravdess dataset""" 149 | # Get audio files 150 | files = sorted(directory.glob('Actor_*/*.wav')) 151 | 152 | # Get file metadata 153 | metadata = [RavdessFileMetadata(f) for f in files] 154 | 155 | # Filter out high intensity 156 | metadata 
= [m for m in metadata if m.intensity == 1]
157 | 
158 |     # Statement text
159 |     text = {1: 'Kids are talking by the door',
160 |             2: 'Dogs are sitting by the door'}
161 | 
162 |     # We make five matches per statement per speaker. There are 20 speakers
163 |     # that satisfy this given our filtering, for a total of 200 matches.
164 |     for speaker in range(1, 25):
165 | 
166 |         # Skip speakers that cannot produce 5 matches
167 |         if speaker in [4, 9, 14, 20]:
168 |             continue
169 | 
170 |         for statement in range(1, 3):
171 | 
172 |             # Get relevant files
173 |             candidates = [
174 |                 m for m in metadata
175 |                 if m.actor == speaker and m.statement == statement]
176 | 
177 |             # Iterate over unique pairs in random order
178 |             matches = 0
179 |             iterator = list(itertools.combinations(candidates, 2))
180 |             random.shuffle(iterator)
181 |             for sample_a, sample_b in iterator:
182 | 
183 |                 # Create match
184 |                 transfer = ProsodyTransfer(
185 |                     f'{sample_a.file.stem}_{sample_b.file.stem}',
186 |                     text[statement],
187 |                     clpcnet.load.audio(sample_a.file),
188 |                     clpcnet.load.audio(sample_b.file),
189 |                     gpu=gpu)
190 | 
191 |                 # Check if phoneme alignments match
192 |                 if transfer.is_valid():
193 |                     yield transfer
194 | 
195 |                     # Check if we've made enough matches
196 |                     matches += 1
197 |                     if matches == 5:
198 |                         break
199 | 
200 |             # Warn if we couldn't find enough matches
201 |             if matches != 5:
202 |                 warnings.warn(f'Can only find {matches} of 5 matches')
203 |                 # raise ValueError(f'Can only find {matches} of 5 matches')
204 | 
205 | 
206 | ###############################################################################
207 | # Utilities
208 | ###############################################################################
209 | 
210 | 
211 | class RavdessFileMetadata:
212 |     """Parses the filename into metadata"""
213 | 
214 |     def __init__(self, file):
215 |         self.file = file
216 | 
217 |         entries = file.stem.split('-')
218 |         self.modality = int(entries[0])
219 |         self.channel = int(entries[1])
220 |         self.emotion = int(entries[2])
221 |         self.intensity = int(entries[3])
222 |         self.statement = int(entries[4])
223 |         self.repetition = int(entries[5])
224 |         self.actor = int(entries[6])
225 | 
-------------------------------------------------------------------------------- /clpcnet/evaluate/subjective/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/clpcnet/evaluate/subjective/__init__.py -------------------------------------------------------------------------------- /clpcnet/evaluate/subjective/constant.py: -------------------------------------------------------------------------------- 1 | import argparse
2 | import multiprocessing as mp
3 | from pathlib import Path
4 | 
5 | import soundfile
6 | 
7 | import clpcnet
8 | 
9 | 
10 | ###############################################################################
11 | # Constants
12 | ###############################################################################
13 | 
14 | 
15 | DURATION_RATIOS = [50, 71, 100, 141, 200]
16 | PITCH_RATIOS = [67, 80, 100, 125, 150]
17 | 
18 | 
19 | ###############################################################################
20 | # Subjective evaluation generation
21 | ###############################################################################
22 | 
23 | 
24 | def generate(directory,
25 |              run='clpcnet',
26 |              checkpoint=clpcnet.DEFAULT_CHECKPOINT,
27 |              gpu=None):
28 |     """Prepare files for subjective evaluation on daps"""
29 |     # Get daps files for evaluation
30 |     files = clpcnet.data.files('daps-segmented', directory, 'test')
31 | 
32 |     # Setup output directory
33 |     output_directory = clpcnet.EVAL_DIR / \
34 |                        'subjective' / \
35 |                        'constant' / \
36 |                        'daps-segmented'
37 |     output_directory.mkdir(exist_ok=True, parents=True)
38 | 
39 |     # Setup multiprocessing
40 |     pool = mp.get_context('spawn').Pool()
41 | 
42 |     # Generate pitch-shifting examples
43 |     generate_pitch(output_directory / 'constant-pitch',
44 |                    files,
45 |                    pool,
46 |                    run,
47 |                    checkpoint,
48 |                    gpu)
49 | 
50 |     # Generate time-stretching examples
51 |     generate_duration(output_directory / 'constant-duration',
52 |                       files,
53 |                       pool,
54 |                       run,
55 |                       checkpoint)
56 | 
57 |     # Close the pool and wait until processes finish
58 |     pool.close()
59 |     pool.join()
60 | 
61 |     # Convert to mp3
62 |     wavfiles = list(output_directory.rglob('*.wav'))
63 |     clpcnet.mp3.convert_files(wavfiles)
64 | 
65 |     # Remove wav files
66 |     for file in wavfiles:
67 |         file.unlink()
68 | 
69 | 
70 | ###############################################################################
71 | # Constant-rate duration generation
72 | ###############################################################################
73 | 
74 | 
75 | def generate_duration(output_directory,
76 |                       files,
77 |                       pool,
78 |                       run='clpcnet',
79 |                       checkpoint=clpcnet.DEFAULT_CHECKPOINT):
80 |     """Prepare constant-rate time-stretching files for subjective evaluation"""
81 |     # Setup output directory
82 |     original_directory = output_directory / 'original'
83 |     original_directory.mkdir(exist_ok=True, parents=True)
84 | 
85 |     # Iterate over utterances
86 |     for file in files:
87 | 
88 |         # Write original audio
89 |         original_file = \
90 |             original_directory / \
91 |             f'constant-duration_original_{file.stem.replace("_", "-")}.wav'
92 |         soundfile.write(original_file,
93 |                         clpcnet.load.audio(file),
94 |                         clpcnet.SAMPLE_RATE)
95 | 
96 |         # Constant shifting with lpcnet
97 |         pool.apply_async(generate_duration_lpcnet,
98 |                          (file, output_directory, run, checkpoint))
99 |         # generate_duration_lpcnet(file, output_directory, run, checkpoint)
100 | 
101 | 
102 | def generate_duration_lpcnet(file,
103 |                              output_directory,
104 |                              run='clpcnet',
105 |                              checkpoint=clpcnet.DEFAULT_CHECKPOINT):
106 |     """Generate examples using lpcnet"""
107 |     for ratio in DURATION_RATIOS:
108 | 
109 |         # Get run name
110 |         name = f'{run}-{ratio:03d}'
111 | 
112 |         # Make output directory
113 |         directory = output_directory / name
114 |         directory.mkdir(exist_ok=True, parents=True)
115 | 
116 |         # Generate
117 |         output_file = directory / \
118 |             f'constant-duration_{name}_{file.stem.replace("_", "-")}.wav'
119 |         clpcnet.from_file_to_file(file,
120 |                                   output_file,
121 |                                   constant_stretch=ratio / 100.,
122 |                                   checkpoint_file=checkpoint,
123 |                                   verbose=False)
124 | 
125 | 
126 | ###############################################################################
127 | # Constant-rate pitch generation
128 | ###############################################################################
129 | 
130 | 
131 | def generate_pitch(output_directory,
132 |                    files,
133 |                    pool,
134 |                    run='clpcnet',
135 |                    checkpoint=clpcnet.DEFAULT_CHECKPOINT,
136 |                    gpu=None):
137 |     """Prepare constant-rate pitch-shifting files for subjective evaluation"""
138 |     # Setup output directory
139 |     original_directory = output_directory / 'original'
140 |     original_directory.mkdir(exist_ok=True, parents=True)
141 | 
142 |     # Iterate over utterances
143 |     for file in files:
144 | 
145 |         # Write original audio
146 |         original_file = \
147 |             original_directory / \
148 | 
f'constant-pitch_original_{file.stem.replace("_", "-")}.wav' 149 | soundfile.write(original_file, 150 | clpcnet.load.audio(file), 151 | clpcnet.SAMPLE_RATE) 152 | 153 | # Constant shifting with lpcnet 154 | pool.apply_async(generate_pitch_lpcnet, 155 | (file, output_directory, run, checkpoint)) 156 | # generate_pitch_lpcnet(file, output_directory, run, checkpoint) 157 | 158 | 159 | def generate_pitch_lpcnet(file, 160 | output_directory, 161 | run='clpcnet', 162 | checkpoint=clpcnet.DEFAULT_CHECKPOINT): 163 | """Generate examples using lpcnet""" 164 | for ratio in PITCH_RATIOS: 165 | 166 | # Get run name 167 | name = f'{run}-{ratio:03d}' 168 | 169 | # Make output directory 170 | directory = output_directory / name 171 | directory.mkdir(exist_ok=True, parents=True) 172 | 173 | # Generate 174 | output_file = \ 175 | directory / \ 176 | f'constant-pitch_{name}_{file.stem.replace("_", "-")}.wav' 177 | clpcnet.from_file_to_file(file, 178 | output_file, 179 | constant_shift=ratio / 100., 180 | checkpoint_file=checkpoint, 181 | verbose=False) 182 | 183 | 184 | ############################################################################### 185 | # Entry point 186 | ############################################################################### 187 | 188 | 189 | def parse_args(): 190 | """Parse command-line arguments""" 191 | parser = argparse.ArgumentParser() 192 | parser.add_argument( 193 | '--directory', 194 | type=Path, 195 | default=clpcnet.DATA_DIR / 'daps-segmented', 196 | help='The root directory of the segmented daps dataset') 197 | parser.add_argument( 198 | '--run', 199 | default='clpcnet', 200 | help='The evaluation run') 201 | parser.add_argument( 202 | '--checkpoint', 203 | type=Path, 204 | default=clpcnet.DEFAULT_CHECKPOINT, 205 | help='The checkpoint to use') 206 | parser.add_argument( 207 | '--gpu', 208 | type=int, 209 | default=None, 210 | help='The gpu to use for pitch estimation') 211 | 212 | return parser.parse_args() 213 | 214 | 215 | if __name__ == '__main__': 216 | generate(**vars(parse_args())) 217 | -------------------------------------------------------------------------------- /clpcnet/evaluate/subjective/variable.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import multiprocessing as mp 4 | from pathlib import Path 5 | 6 | import pyfoal 7 | import soundfile 8 | import tqdm 9 | 10 | import clpcnet 11 | 12 | 13 | ############################################################################### 14 | # Constants 15 | ############################################################################### 16 | 17 | 18 | DEFAULT_DIRECTORY = clpcnet.DATA_DIR / 'ravdess-hifi' 19 | DEFAULT_OUTPUT_DIRECTORY = clpcnet.EVAL_DIR / \ 20 | 'subjective' / \ 21 | 'variable' / \ 22 | 'ravdess-hifi' 23 | 24 | 25 | ############################################################################### 26 | # Variable-rate pitch shifting 27 | ############################################################################### 28 | 29 | 30 | def evaluate(directory=DEFAULT_DIRECTORY, 31 | output_directory=DEFAULT_OUTPUT_DIRECTORY, 32 | run='clpcnet', 33 | checkpoint=clpcnet.DEFAULT_CHECKPOINT, 34 | gpu=None): 35 | """Evaluate variable-rate pitch shifting on ravdess""" 36 | # Get list of examples to generate 37 | with open(clpcnet.data.partition_file('ravdess-variable')) as file: 38 | pairs = json.load(file)['test'] 39 | 40 | # Setup output directory 41 | original_directory = output_directory / 'original' 42 | run_directory = 
output_directory / run 43 | original_directory.mkdir(exist_ok=True, parents=True) 44 | run_directory.mkdir(exist_ok=True, parents=True) 45 | 46 | # Setup multiprocessing 47 | pool = mp.get_context('spawn').Pool() 48 | 49 | # Iterate over pairs 50 | for pair in tqdm.tqdm(pairs): 51 | 52 | # Load text 53 | statement = pair[0].split('-')[4] 54 | text_file = clpcnet.ASSETS_DIR / 'text' / 'ravdess' / f'{statement}.txt' 55 | with open(text_file) as file: 56 | text = file.read() 57 | 58 | # Load audio 59 | source_file = clpcnet.data.stem_to_file('ravdess-variable', 60 | directory, 61 | pair[0]) 62 | target_file = clpcnet.data.stem_to_file('ravdess-variable', 63 | directory, 64 | pair[1]) 65 | source = clpcnet.load.audio(source_file) 66 | target = clpcnet.load.audio(target_file) 67 | 68 | # Compute pitch 69 | target_pitch, _ = clpcnet.pitch.from_audio(target, gpu) 70 | 71 | # Compute alignment 72 | source_alignment = pyfoal.align(text, source, clpcnet.SAMPLE_RATE) 73 | target_alignment = pyfoal.align(text, target, clpcnet.SAMPLE_RATE) 74 | 75 | # Output file template 76 | template = 'variable_{}_' + f'{pair[0]}-{pair[1]}.wav' 77 | 78 | # Generate with clpcnet 79 | clpcnet_file = run_directory / template.format(run) 80 | args = (clpcnet_file, source) 81 | kwargs = {'source_alignment': source_alignment, 82 | 'target_alignment': target_alignment, 83 | 'target_pitch': target_pitch, 84 | 'checkpoint_file': checkpoint, 85 | 'verbose': False} 86 | pool.apply_async(clpcnet.to_file, args, kwargs) 87 | # clpcnet.to_file(*args, **kwargs) 88 | 89 | # Write original file 90 | original_file = original_directory / template.format('original') 91 | soundfile.write(original_file, target, clpcnet.SAMPLE_RATE) 92 | 93 | # Close multiprocessing pool and wait for processes to finish 94 | pool.close() 95 | pool.join() 96 | 97 | # Convert to mp3 98 | wavfiles = list(output_directory.rglob('*.wav')) 99 | clpcnet.mp3.convert_files(wavfiles) 100 | 101 | # Remove wav files 102 | for file in wavfiles: 103 | file.unlink() 104 | 105 | 106 | ############################################################################### 107 | # Entry point 108 | ############################################################################### 109 | 110 | 111 | def parse_args(): 112 | """Parse command-line arguments""" 113 | parser = argparse.ArgumentParser() 114 | parser.add_argument( 115 | '--directory', 116 | type=Path, 117 | default=DEFAULT_DIRECTORY, 118 | help='Root directory of the ravdess dataset') 119 | parser.add_argument( 120 | '--output_directory', 121 | type=Path, 122 | default=DEFAULT_OUTPUT_DIRECTORY, 123 | help='The location to store files for subjective evaluation') 124 | parser.add_argument( 125 | '--run', 126 | default='clpcnet', 127 | help='The evaluation run') 128 | parser.add_argument( 129 | '--checkpoint', 130 | type=Path, 131 | default=clpcnet.DEFAULT_CHECKPOINT, 132 | help='The checkpoint to use') 133 | parser.add_argument( 134 | '--gpu', 135 | type=int, 136 | default=None, 137 | help='The index of the gpu to use') 138 | return parser.parse_args() 139 | 140 | 141 | if __name__ == '__main__': 142 | evaluate(**vars(parse_args())) 143 | -------------------------------------------------------------------------------- /clpcnet/load.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import soundfile 3 | 4 | import clpcnet 5 | 6 | 7 | def audio(file): 8 | """Load audio from disk 9 | 10 | Arguments 11 | file : string 12 | The audio file to load 13 | 14 | Returns 15 | 
audio : np.array(shape=(samples,))
16 |             The audio
17 |     """
18 |     # Load
19 |     audio, sample_rate = soundfile.read(file)
20 | 
21 |     # Convert to mono if necessary
22 |     if audio.ndim == 2:
23 |         if audio.shape[1] == 2:
24 |             audio = audio.mean(1)
25 |         else:
26 |             audio = audio.squeeze()
27 | 
28 |     # Resample
29 |     return clpcnet.preprocess.resample(audio, sample_rate)
30 | 
31 | 
32 | def features(file):
33 |     """Load frame-rate features from disk for inference
34 | 
35 |     Arguments
36 |         file : string
37 |             The feature file
38 | 
39 |     Returns
40 |         features : np.array(shape=(frames, clpcnet.TOTAL_FEATURE_SIZE))
41 |     """
42 |     # Load test features
43 |     features = np.fromfile(file, dtype=np.float32)
44 | 
45 |     # shape=(time, channels)
46 |     features = np.reshape(features, (-1, clpcnet.TOTAL_FEATURE_SIZE))
47 | 
48 |     # Zero-out unused bark-scale coefficients
49 |     features[:, 18:36] = 0
50 |     return features[None]
51 | 
52 | 
53 | def model(file=clpcnet.DEFAULT_CHECKPOINT, gpu=None):
54 |     """Setup the LPCNet model for inference
55 | 
56 |     Arguments
57 |         file : string
58 |             The model weight file
59 |         gpu : int or None
60 |             The index of the gpu to use
61 |     """
62 |     # Bind to generate function
63 |     clpcnet.from_features.session = clpcnet.Session(file, gpu)
64 | 
65 | 
66 | def yin(file):
67 |     """Load yin pitch and periodicity from file"""
68 |     # Load features
69 |     yin_features = features(file)
70 | 
71 |     # Slice yin pitch and periodicity
72 |     return yin_features[0, :, clpcnet.PITCH_IDX], \
73 |            yin_features[0, :, clpcnet.CORRELATION_IDX]
74 | 
-------------------------------------------------------------------------------- /clpcnet/loudness.py: -------------------------------------------------------------------------------- 1 | import warnings
2 | 
3 | import librosa
4 | import numpy as np
5 | 
6 | import clpcnet
7 | 
8 | 
9 | ###############################################################################
10 | # A-weighted loudness
11 | ###############################################################################
12 | 
13 | 
14 | def a_weighted(audio, n_fft=1024, min_db=-100.):
15 |     """Retrieve the per-frame loudness"""
16 |     # Cache weights so long as n_fft doesn't change
17 |     if not hasattr(a_weighted, 'weights') or \
18 |        (hasattr(a_weighted, 'n_fft') and a_weighted.n_fft != n_fft):
19 |         a_weighted.weights = perceptual_weights(n_fft)
20 |         a_weighted.n_fft = n_fft
21 | 
22 |     # Take stft
23 |     stft = librosa.stft(audio,
24 |                         n_fft=n_fft,
25 |                         hop_length=clpcnet.HOPSIZE,
26 |                         win_length=n_fft,
27 |                         pad_mode='constant')
28 | 
29 |     # Compute magnitude on db scale
30 |     db = librosa.amplitude_to_db(np.abs(stft))
31 | 
32 |     # Apply A-weighting
33 |     weighted = db + a_weighted.weights
34 | 
35 |     # Threshold
36 |     weighted[weighted < min_db] = min_db
37 | 
38 |     # Average over weighted frequencies
39 |     return weighted.mean(axis=0)
40 | 
41 | 
42 | def perceptual_weights(n_fft=1024, ref_db=20.):
43 |     """A-weighted frequency-dependent perceptual loudness weights"""
44 |     frequencies = librosa.fft_frequencies(sr=clpcnet.SAMPLE_RATE, n_fft=n_fft)
45 | 
46 |     # A warning is raised for nearly inaudible frequencies, but it ends up
47 |     # defaulting to -100 db. That default is fine for our purposes.
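    # librosa.A_weighting returns a per-frequency gain in dB; subtracting
    # ref_db shifts the reference level, and the trailing [:, None] lets the
    # weights broadcast over the (frequencies, frames) magnitude array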
48 |     with warnings.catch_warnings():
49 |         warnings.simplefilter('ignore', RuntimeWarning)
50 |         return librosa.A_weighting(frequencies)[:, None] - ref_db
51 | 
52 | 
53 | ###############################################################################
54 | # Utilities
55 | ###############################################################################
56 | 
57 | 
58 | def limit(audio, delay=40, attack_coef=.9, release_coef=.9995, threshold=.99):
59 |     """Apply a limiter to prevent clipping"""
60 |     # Delay compensation
61 |     audio = np.pad(audio, (0, delay - 1))
62 | 
63 |     current_gain = 1.
64 |     delay_index = 0
65 |     delay_line = np.zeros(delay)
66 |     envelope = 0
67 | 
68 |     for idx, sample in enumerate(audio):
69 |         # Update signal history
70 |         delay_line[delay_index] = sample
71 |         delay_index = (delay_index + 1) % delay
72 | 
73 |         # Calculate envelope
74 |         envelope = max(abs(sample), envelope * release_coef)
75 | 
76 |         # Calculate gain
77 |         target_gain = threshold / envelope if envelope > threshold else 1.
78 |         current_gain = \
79 |             current_gain * attack_coef + target_gain * (1 - attack_coef)
80 | 
81 |         # Apply gain
82 |         audio[idx] = delay_line[delay_index] * current_gain
83 | 
84 |     return audio[delay - 1:]
85 | 
-------------------------------------------------------------------------------- /clpcnet/model.py: -------------------------------------------------------------------------------- 1 | '''Copyright (c) 2018 Mozilla
2 | Modified by Max Morrison
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions
6 | are met:
7 | 
8 | - Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 | 
11 | - Redistributions in binary form must reproduce the above copyright
12 | notice, this list of conditions and the following disclaimer in the
13 | documentation and/or other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | ''' 27 | import functools 28 | import math 29 | import os 30 | import sys 31 | 32 | # Import keras without printing backend 33 | stderr = sys.stderr 34 | sys.stderr = open(os.devnull, 'w') 35 | import keras 36 | sys.stderr = stderr 37 | 38 | import numpy as np 39 | from keras import backend as K 40 | from keras.layers import Concatenate, Input, Reshape 41 | 42 | import clpcnet 43 | 44 | 45 | ############################################################################### 46 | # LPCNet model construction 47 | ############################################################################### 48 | 49 | 50 | def model(training=False, use_gpu=True): 51 | """Build the LPCNet model""" 52 | 53 | ########################################################################### 54 | # Inputs 55 | ########################################################################### 56 | 57 | # Signal, prediction, and excitation inputs 58 | sample_rate_feats = Input(shape=(None, 3)) 59 | 60 | # Bark-scale coefficients and pitch correlation 61 | spectral_feats = Input(shape=(None, clpcnet.SPECTRAL_FEATURE_SIZE)) 62 | 63 | # Pitch period 64 | pitch = Input(shape=(None, 1)) 65 | 66 | ########################################################################### 67 | # Create graph 68 | ########################################################################### 69 | 70 | # Build and link frame-rate network 71 | frame_rate_feats = frame_rate_network(spectral_feats, pitch, training) 72 | 73 | # Build and add sample-rate network 74 | probabilities, decoder_model = sample_rate_network( 75 | frame_rate_feats, sample_rate_feats, use_gpu) 76 | 77 | # Build lpcnet model 78 | model = keras.models.Model([sample_rate_feats, spectral_feats, pitch], 79 | probabilities) 80 | 81 | # Build encoder model 82 | encoder_model = encoder(spectral_feats, pitch, frame_rate_feats) 83 | 84 | return model, encoder_model, decoder_model 85 | 86 | 87 | ############################################################################### 88 | # Model components 89 | ############################################################################### 90 | 91 | 92 | def decoder(sample_rate_feats, 93 | sample_rate_embedding, 94 | gru_a, 95 | gru_b, 96 | dual_dense): 97 | """Build the LPCNet decoder""" 98 | 99 | ########################################################################### 100 | # Inputs 101 | ########################################################################### 102 | 103 | # Frame-rate features upsampled to the sampling rate 104 | upsampled = Input(shape=(None, 128)) 105 | 106 | # GRU A initial state 107 | gru_a_init = Input(shape=(clpcnet.GRU_A_SIZE,)) 108 | 109 | # GRU B initial state 110 | gru_b_init = Input(shape=(clpcnet.GRU_B_SIZE,)) 111 | 112 | ########################################################################### 113 | # Link 114 | ########################################################################### 115 | 116 | # Concatenate sample-rate and upsampled frame-rate features 117 | all_sample_rate_feats = Concatenate()([sample_rate_embedding, upsampled]) 118 | 119 | # Add sample-rate gru A to graph 120 | activation, gru_a_state = gru_a(all_sample_rate_feats, 121 | initial_state=gru_a_init) 122 | 123 | # Residual connection between upsampled features and rnn output 124 | # Note: this is NOT in the original LPCNet paper, but is in the code 125 | activation = Concatenate()([activation, upsampled]) 126 | 127 | # Add sample-rate gru B to graph 128 | activation, gru_b_state = gru_b(activation, 129 | initial_state=gru_b_init) 130 | 131 | # 
Add dual fully-connected layer to graph 132 | probabilities = dual_dense(activation) 133 | 134 | # Specify model start and end points 135 | inputs = [sample_rate_feats, upsampled, gru_a_init, gru_b_init] 136 | outputs = [probabilities, gru_a_state, gru_b_state] 137 | 138 | return keras.models.Model(inputs, outputs) 139 | 140 | 141 | def encoder(spectral_feats, pitch, frame_rate_feats): 142 | """Create the LPCNet encoder""" 143 | return keras.models.Model([spectral_feats, pitch], frame_rate_feats) 144 | 145 | 146 | def frame_rate_network(spectral_feats, pitch, training=False): 147 | """Create the LPCNet frame-rate network""" 148 | 149 | ########################################################################### 150 | # Build 151 | ########################################################################### 152 | 153 | # Pitch embedding table 154 | pitch_embedding_table = keras.layers.Embedding( 155 | clpcnet.PITCH_BINS, 64, name='embed_pitch') 156 | 157 | # 1d convolutions 158 | conv_fn = functools.partial(keras.layers.Conv1D, 159 | 128, 160 | 3, 161 | padding='valid' if training else 'same', 162 | activation='tanh') 163 | conv1, conv2 = conv_fn(name='feature_conv1'), conv_fn(name='feature_conv2') 164 | 165 | # Dense layers 166 | dense_fn = functools.partial(keras.layers.Dense, 128, activation='tanh') 167 | dense1 = dense_fn(name='feature_dense1') 168 | dense2 = dense_fn(name='feature_dense2') 169 | 170 | ########################################################################### 171 | # Link 172 | ########################################################################### 173 | 174 | # Embed pitch 175 | pitch_embedding = Reshape((-1, 64))(pitch_embedding_table(pitch)) 176 | 177 | # Join frame-rate features 178 | features = Concatenate()([spectral_feats, pitch_embedding]) 179 | 180 | # Convolution layer forward pass 181 | activation = conv2(conv1(features)) 182 | 183 | # Dense layer forward pass 184 | # Note: The residual connection shown in the paper was later found 185 | # to be harmful. Therefore, it is omitted. 
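    # The full frame-rate path is: concat(spectral features, 64-dim pitch
    # embedding) -> conv1 -> conv2 -> dense1 -> dense2, producing the 128-dim
    # per-frame conditioning features consumed by the sample-rate network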
186 | return dense2(dense1(activation)) 187 | 188 | 189 | def sample_rate_network(frame_rate_feats, sample_rate_feats, use_gpu=True): 190 | """Create the LPCNet sample-rate network""" 191 | 192 | ########################################################################### 193 | # Build 194 | ########################################################################### 195 | 196 | # PCM sample embedding table 197 | sample_rate_embedding_table = keras.layers.Embedding( 198 | clpcnet.PCM_LEVELS, 199 | clpcnet.EMBEDDING_SIZE, 200 | embeddings_initializer=sample_rate_embedding_initializer, 201 | name='embed_sig') 202 | 203 | # Upsampler 204 | repeat = keras.layers.Lambda( 205 | lambda x: K.repeat_elements(x, clpcnet.HOPSIZE, 1)) 206 | 207 | # Get gru function based on compute 208 | if use_gpu: 209 | gru_fn = functools.partial( 210 | keras.layers.CuDNNGRU, return_sequences=True, return_state=True) 211 | else: 212 | gru_fn = functools.partial(keras.layers.GRU, 213 | return_sequences=True, 214 | return_state=True, 215 | recurrent_activation='sigmoid', 216 | reset_after='true') 217 | 218 | # Gru layers 219 | gru_a = gru_fn(clpcnet.GRU_A_SIZE, name='gru_a') 220 | gru_b = gru_fn(clpcnet.GRU_B_SIZE, name='gru_b') 221 | 222 | # Dual fully-connected layer 223 | dual_dense = DualDense( 224 | clpcnet.PCM_LEVELS, activation='softmax', name='dual_fc') 225 | 226 | ########################################################################### 227 | # Link 228 | ########################################################################### 229 | 230 | # Embed Audio 231 | sample_rate_embedding = sample_rate_embedding_table(sample_rate_feats) 232 | sample_rate_embedding = \ 233 | Reshape((-1, 3 * clpcnet.EMBEDDING_SIZE))(sample_rate_embedding) 234 | 235 | # Upsample the frame-rate features to the sampling rate 236 | upsampled = repeat(frame_rate_feats) # Residual connection --------------- 237 | # | 238 | # Concatenate sample-rate and upsampled frame-rate features # | 239 | all_sample_rate_feats = Concatenate()( # | 240 | [sample_rate_embedding, upsampled]) # | 241 | # | 242 | # Add sample-rate gru A to graph. 
# | 243 | activation = gru_a(all_sample_rate_feats)[0] # | 244 | # | 245 | # Residual connection between upsampled features and rnn output # | 246 | # Note: this is NOT in the original LPCNet paper # | 247 | activation = Concatenate()([activation, upsampled]) # <------------------- 248 | 249 | # Add sample-rate gru B to graph 250 | activation = gru_b(activation)[0] 251 | 252 | # Add dual fully-connected layer to graph 253 | probabilities = dual_dense(activation) 254 | 255 | # Reuse components to build decoder model 256 | decoder_model = decoder(sample_rate_feats, 257 | sample_rate_embedding, 258 | gru_a, 259 | gru_b, 260 | dual_dense) 261 | 262 | return probabilities, decoder_model 263 | 264 | 265 | ############################################################################### 266 | # Custom keras layer 267 | ############################################################################### 268 | 269 | 270 | class DualDense(keras.layers.Layer): 271 | """Dual fully-connected layer""" 272 | 273 | channels = 2 274 | 275 | def __init__(self, output_size, activation=None, name=None): 276 | super().__init__(name=name) 277 | self.output_size = output_size 278 | self.activation = keras.activations.get(activation) 279 | 280 | # Network weights 281 | self.kernel, self.bias, self.factor = None, None, None 282 | 283 | def build(self, input_shape): 284 | """Initialize the DualDense layer weights""" 285 | assert len(input_shape) >= 2 286 | 287 | # Kernel 288 | kernel_shape = (self.output_size, input_shape[-1], self.channels) 289 | self.kernel = self.add_weight( 290 | name='kernel', 291 | shape=kernel_shape, 292 | initializer=keras.initializers.get('glorot_uniform'), 293 | regularizer=keras.regularizers.get(None), 294 | constraint=keras.constraints.get(None)) 295 | 296 | # Bias 297 | bias_shape = (self.output_size, self.channels) 298 | self.bias = self.add_weight( 299 | name='bias', 300 | shape=bias_shape, 301 | initializer=keras.initializers.get('zeros'), 302 | regularizer=keras.regularizers.get(None), 303 | constraint=keras.constraints.get(None)) 304 | 305 | # Learned scale factor 306 | self.factor = self.add_weight( 307 | name='factor', 308 | shape=bias_shape, 309 | initializer=keras.initializers.get('ones'), 310 | regularizer=keras.regularizers.get(None), 311 | constraint=keras.constraints.get(None)) 312 | 313 | def call(self, inputs): 314 | """Forward pass through the DualDense layer""" 315 | # Pass through two linear maps 316 | output = K.dot(inputs, self.kernel) + self.bias 317 | 318 | # Scaled tanh nonlinearity 319 | output = K.tanh(output) * self.factor 320 | 321 | # Sum over the two channels of dual-dense layer 322 | output = K.sum(output, axis=-1) 323 | 324 | # Apply optional output activation 325 | return self.activation(output) 326 | 327 | 328 | ############################################################################### 329 | # Custom embedding initializer 330 | ############################################################################### 331 | 332 | 333 | def sample_rate_embedding_initializer(shape, dtype=None): 334 | """Initializer for the sample-rate feature embedding table""" 335 | # Get output shape 336 | shape = (np.prod(shape[:-1]), shape[-1]) 337 | 338 | # Initialize as uniform noise in [-sqrt(3), sqrt(3)] 339 | weights = np.random.uniform(-1.7321, 1.7321, shape) 340 | 341 | # Add a unique offset to each weight such that the embedding 342 | # table is encouraged to be ordered 343 | line = np.arange(-.5 * shape[0] + .5, .5 * shape[0] - .4) 344 | line *= math.sqrt(12) / shape[0] 
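    # The offsets form a zero-centered ramp with step sqrt(12) / N over the
    # N embedding rows, spanning roughly [-sqrt(3), sqrt(3)], so each row is
    # biased toward a distinct, monotonically increasing mean value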
345 |     return weights + np.reshape(line, (shape[0], 1))
346 | 
-------------------------------------------------------------------------------- /clpcnet/mp3.py: -------------------------------------------------------------------------------- 1 | import argparse
2 | import glob
3 | import multiprocessing as mp
4 | import os
5 | import shutil
6 | import subprocess
7 | from pathlib import Path
8 | 
9 | 
10 | ###############################################################################
11 | # Convert audio to mp3
12 | ###############################################################################
13 | 
14 | 
15 | def convert_file(input_file, output_file=None, verbose=False):
16 |     """Convert audio file to mp3"""
17 |     # Default output filename is same as input but with MP3 extension;
18 |     # this must be resolved before the hyphen handling below reads
19 |     # output_file.stem
20 |     if output_file is None:
21 |         output_file = input_file.with_suffix('.mp3')
22 | 
23 |     # Handle input files starting with hyphen
24 |     clean_input = False
25 |     if input_file.stem.startswith('-'):
26 |         dummy_file = input_file.parent / input_file.name[1:]
27 |         shutil.copyfile(input_file, dummy_file)
28 |         input_file = dummy_file
29 |         clean_input = True
30 | 
31 |     # Handle output files starting with hyphen
32 |     clean_output = False
33 |     if output_file.stem.startswith('-'):
34 |         output_file = output_file.parent / output_file.name[1:]
35 |         clean_output = True
36 | 
37 |     # Convert
38 |     args = [
39 |         'ffmpeg',
40 |         '-y',
41 |         '-i',
42 |         str(input_file),
43 |         '-b:a',
44 |         '320k',
45 |         str(output_file)]
46 |     process = subprocess.Popen(
47 |         args,
48 |         stdout=subprocess.PIPE,
49 |         stderr=subprocess.PIPE,
50 |         universal_newlines=True)
51 |     stdout, stderr = process.communicate()
52 | 
53 |     # Maybe print
54 |     if verbose or process.returncode != 0:
55 |         print(stdout)
56 |         print(stderr)
57 | 
58 |     # Clean-up input files starting with hyphen
59 |     if clean_input:
60 |         os.remove(input_file)
61 | 
62 |     # Clean-up output files starting with hyphen
63 |     if clean_output:
64 |         os.replace(output_file, output_file.parent / ('-' + output_file.name))
65 | 
66 | 
67 | def convert_files(input_files, output_files=None):
68 |     """Convert audio files to mp3"""
69 |     # Convert to paths
70 |     input_files = [Path(file) for file in input_files]
71 | 
72 |     # Default output filename is same as input but with MP3 extension
73 |     if output_files is None:
74 |         output_files = [file.with_suffix('.mp3') for file in input_files]
75 | 
76 |     # Multiprocess conversion
77 |     with mp.Pool() as pool:
78 |         pool.starmap(convert_file, zip(input_files, output_files))
79 | 
80 |     # for input_file, output_file in zip(input_files, output_files):
81 |     #     convert_file(input_file, output_file)
82 | 
83 | 
84 | ###############################################################################
85 | # Entry point
86 | ###############################################################################
87 | 
88 | 
89 | def expand_files(files):
90 |     """Expands a wildcard to a list of paths for Windows compatibility"""
91 |     # Split at whitespace
92 |     files = files.split()
93 | 
94 |     # Handle wildcard expansion
95 |     if len(files) == 1 and '*' in files[0]:
96 |         files = glob.glob(files[0])
97 | 
98 |     # Return file list; conversion to Path happens in convert_files
99 |     return files
100 | 
101 | 
102 | def parse_args():
103 |     """Parse command-line arguments"""
104 |     parser = argparse.ArgumentParser()
105 | 
106 |     # Handle wildcards across platforms
107 |     if os.name == 'nt':
108 |         parser.add_argument(
109 |             '--input_files',
110 |             type=expand_files,
111 |             help='The audio files to convert to mp3')
112 |     else:
113 |         parser.add_argument(
114 |             '--input_files',
nargs='+', 114 | help='The audio files to convert to mp3') 115 | 116 | parser.add_argument( 117 | '--output_files', 118 | type=Path, 119 | nargs='+', 120 | help='The corresponding output files. ' + 121 | 'Uses same filename with mp3 extension by default') 122 | return parser.parse_args() 123 | 124 | 125 | if __name__ == '__main__': 126 | convert_files(**vars(parse_args())) 127 | -------------------------------------------------------------------------------- /clpcnet/partition.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import functools 3 | import itertools 4 | import json 5 | import random 6 | from pathlib import Path 7 | 8 | import tqdm 9 | 10 | import clpcnet 11 | 12 | 13 | ############################################################################### 14 | # Partition 15 | ############################################################################### 16 | 17 | 18 | def daps_segmented(directory): 19 | """Partition daps-segmented dataset""" 20 | files = list(directory.rglob('*.wav')) 21 | 22 | # Get files corresponding to each selected speaker 23 | speaker_files = { 24 | s: [f for f in files if f.stem.split('_')[0] == s] 25 | for s in ['f1', 'f3', 'f4', 'f5', 'f6', 'm1', 'm3', 'm4', 'm5', 'm6']} 26 | 27 | # Deterministic but random selection 28 | random.seed(0) 29 | test_files = itertools.chain( 30 | *[random.sample(f, 10) for f in speaker_files.values()]) 31 | 32 | return {'test': [f.stem for f in test_files]} 33 | 34 | 35 | def ravdess_hifi(directory): 36 | """Partition ravdess dataset""" 37 | partition_file = clpcnet.ASSETS_DIR / 'partition' / 'ravdess-variable.json' 38 | with open(partition_file) as file: 39 | pairs = json.load(file)['test'] 40 | stems = set(list(itertools.chain(*pairs))) 41 | return {'test': list(stems)} 42 | 43 | 44 | def ravdess_variable(directory, gpu=None): 45 | """Partition ravdess dataset into prosody transfer pairs""" 46 | pairs = [] 47 | generator = clpcnet.evaluate.prosody.ravdess_generator(directory, gpu) 48 | for transfer in tqdm.tqdm(generator): 49 | pairs.append(transfer.name.split('_')) 50 | return {'test': pairs} 51 | 52 | 53 | def vctk(directory, rejects=['p341_101']): 54 | """Partition vctk dataset""" 55 | # Load speaker info 56 | with open(directory / 'speaker-info.txt') as file: 57 | lines = file.readlines() 58 | speakers = [VCTKSpeaker(line) for line in lines[1:]] 59 | 60 | # Filter out speakers where mic 2 is not available 61 | speakers = [s for s in speakers if s.id not in ['p280', 'p315']] 62 | 63 | # Shuffle speakers 64 | random.seed(0) 65 | random.shuffle(speakers) 66 | 67 | # Partition speakers 68 | male = [s.id for s in speakers if s.gender == 'M'] 69 | female = [s.id for s in speakers if s.gender == 'F'] 70 | train_speaker = male[:-4] + female[:-4] 71 | test_speaker = male[-4:] + female[-4:] 72 | 73 | # Get file lists relative to root directory 74 | text_directory = directory / 'txt' 75 | train_files = chain_list_files(text_directory, train_speaker) 76 | test_files = chain_list_files(text_directory, test_speaker) 77 | 78 | # Require mic 2 be available 79 | train_files = vctk_mic_check(train_files) 80 | test_files = vctk_mic_check(test_files) 81 | 82 | # Move some train files to a separate test partition of seen speakers 83 | test_seen_speaker = male[:10] + female[:10] 84 | test_seen_files = [ 85 | random.sample([f for f in train_files 86 | if s in f.stem and f.stem not in rejects], 5) 87 | for s in test_seen_speaker] 88 | test_seen_files = 
list(itertools.chain(*test_seen_files)) 89 | train_files = [f for f in train_files if f not in test_seen_files] 90 | 91 | # Pack partition dictionary 92 | return { 93 | 'train': sorted([f.stem for f in train_files 94 | if f.stem not in rejects]), 95 | 'test': sorted([f.stem for f in test_files 96 | if f.stem not in rejects]), 97 | 'test-seen': sorted([f.stem for f in test_seen_files])} 98 | 99 | 100 | ############################################################################### 101 | # Utilities 102 | ############################################################################### 103 | 104 | 105 | class VCTKSpeaker: 106 | 107 | def __init__(self, line): 108 | line = self.strip_comment(line) 109 | self.id, _, self.gender = line.split()[:3] 110 | 111 | @staticmethod 112 | def strip_comment(line): 113 | comment_index = line.find('(') 114 | return line[:comment_index] if comment_index != -1 else line 115 | 116 | 117 | def chain_list_files(directory, subdirectories): 118 | """List files in all subdirectories""" 119 | return list(itertools.chain( 120 | *[(directory / sd).glob('**/*') for sd in subdirectories])) 121 | 122 | 123 | def vctk_mic_check(files): 124 | """Filter files by whether mic 2 is available""" 125 | directory = files[0].parent.parent.parent / 'wav48_silence_trimmed' 126 | result = [] 127 | for file in files: 128 | speaker = file.parent.name 129 | if (directory / speaker / f'{file.stem}_mic2.flac').exists(): 130 | result.append(file) 131 | return result 132 | 133 | 134 | ############################################################################### 135 | # Entry point 136 | ############################################################################### 137 | 138 | 139 | def main(): 140 | """Partition dataset""" 141 | # Parse command-line arguments 142 | args = parse_args() 143 | 144 | # Get partitioning function 145 | if args.dataset == 'daps-segmented': 146 | partition_fn = daps_segmented 147 | elif args.dataset == 'ravdess-hifi': 148 | partition_fn = ravdess_hifi 149 | elif args.dataset == 'ravdess-variable': 150 | partition_fn = functools.partial(ravdess_variable, gpu=args.gpu) 151 | elif args.dataset == 'vctk': 152 | partition_fn = vctk 153 | else: 154 | raise ValueError(f'No dataset {args.dataset}') 155 | 156 | # Partition 157 | partition = partition_fn(args.directory) 158 | 159 | # Save to disk 160 | with open(clpcnet.data.partition_file(args.dataset), 'w') as file: 161 | json.dump(partition, file, indent=4) 162 | 163 | 164 | def parse_args(): 165 | """Parse command-line arguments""" 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument( 168 | '--dataset', 169 | default='vctk', 170 | help='The name of the dataset') 171 | parser.add_argument( 172 | '--directory', 173 | type=Path, 174 | default=clpcnet.DATA_DIR, 175 | help='The data directory') 176 | parser.add_argument( 177 | '--gpu', 178 | type=int, 179 | default=None, 180 | help='The gpu to use') 181 | 182 | # Extend directory with dataset name 183 | args = parser.parse_args() 184 | dataset = \ 185 | 'ravdess-hifi' if args.dataset == 'ravdess-variable' else args.dataset 186 | args.directory = args.directory / dataset 187 | 188 | return args 189 | 190 | 191 | if __name__ == '__main__': 192 | main() 193 | -------------------------------------------------------------------------------- /clpcnet/pitch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tempfile 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pypar 7 | import torch 
8 | import torchcrepe
9 | import tqdm
10 | 
11 | import clpcnet
12 | 
13 | 
14 | ###############################################################################
15 | # Pitch methods
16 | ###############################################################################
17 | 
18 | 
19 | def crepe(audio, gpu=None):
20 |     """Preprocess crepe pitch from audio"""
21 |     # Highpass
22 |     audio = clpcnet.preprocess.highpass(audio)
23 | 
24 |     # Estimate pitch
25 |     pitch, periodicity = torchcrepe.predict(
26 |         torch.tensor(audio.copy(), dtype=torch.float)[None],
27 |         sample_rate=clpcnet.SAMPLE_RATE,
28 |         fmin=clpcnet.FMIN,
29 |         fmax=clpcnet.FMAX,
30 |         model='full',
31 |         return_periodicity=True,
32 |         batch_size=1024,
33 |         device='cpu' if gpu is None else f'cuda:{gpu}')
34 | 
35 |     # Detach from graph
36 |     pitch = pitch.cpu().squeeze().numpy()
37 |     periodicity = periodicity.cpu().squeeze().numpy()
38 | 
39 |     # Set low energy frames to unvoiced
40 |     periodicity[clpcnet.loudness.a_weighted(audio) < -60.] = 0.
41 | 
42 |     return pitch, periodicity
43 | 
44 | 
45 | def yin(audio):
46 |     """Preprocess yin pitch from audio"""
47 |     with tempfile.TemporaryDirectory() as directory:
48 |         prefix = Path(directory) / 'tmp'
49 | 
50 |         # Preprocess and save to disk
51 |         clpcnet.preprocess.from_audio_to_file(audio, prefix)
52 | 
53 |         # Load features
54 |         features = clpcnet.load.features(f'{prefix}-frames.f32')
55 | 
56 |         # Extract pitch and periodicity
57 |         pitch = features[0, :, clpcnet.PITCH_IDX]
58 |         periodicity = features[0, :, clpcnet.CORRELATION_IDX]
59 | 
60 |         # Convert to Hz
61 |         pitch = clpcnet.convert.epochs_to_hz(pitch)
62 | 
63 |         # Bound
64 |         pitch[pitch > clpcnet.FMAX] = clpcnet.FMAX
65 |         pitch[pitch < clpcnet.FMIN] = clpcnet.FMIN
66 | 
67 |         # Scale periodicity to [0, 1]
68 |         return pitch, (periodicity + .4) / .8
69 | 
70 | 
71 | ###############################################################################
72 | # Interface
73 | ###############################################################################
74 | 
75 | 
76 | def from_audio(audio, gpu=None):
77 |     """Preprocess pitch from audio"""
78 |     if clpcnet.ABLATE_CREPE:
79 |         return yin(audio)
80 |     return crepe(audio, gpu)
81 | 
82 | 
83 | def from_audio_to_file(audio, prefix, gpu=None):
84 |     """Perform pitch estimation on audio and save to disk"""
85 |     # Perform pitch estimation
86 |     pitch, periodicity = from_audio(audio, gpu)
87 | 
88 |     # Save to disk
89 |     np.save(f'{prefix}-pitch.npy', pitch)
90 |     np.save(f'{prefix}-periodicity.npy', periodicity)
91 | 
92 | 
93 | def from_dataset_to_files(dataset,
94 |                           directory,
95 |                           cache,
96 |                           gpu=None):
97 |     """Perform pitch estimation on dataset and save to disk"""
98 |     # Get filenames
99 |     files = clpcnet.data.files(dataset, directory, 'train')
100 | 
101 |     # Get prefixes
102 |     prefixes = [
103 |         cache / f'{clpcnet.data.file_to_stem(dataset, file)}-r100'
104 |         for file in files]
105 | 
106 |     # Perform pitch estimation
107 |     from_files_to_files(files, prefixes, gpu)
108 | 
109 | 
110 | def from_file(file, gpu=None):
111 |     """Preprocess crepe pitch from file"""
112 |     # Load and estimate pitch
113 |     return from_audio(clpcnet.load.audio(file), gpu)
114 | 
115 | 
116 | def from_file_to_file(file, prefix, gpu=None):
117 |     """Preprocess crepe pitch from file and save to disk"""
118 |     pitch, periodicity = from_file(file, gpu)
119 |     np.save(f'{prefix}-pitch.npy', pitch)
120 |     np.save(f'{prefix}-periodicity.npy', periodicity)
121 | 
122 | 
123 | def from_files_to_files(files, prefixes, gpu=None):
124 |     """Preprocess pitch from files and save to disk"""
125 | 
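    # Each file is loaded, its pitch and periodicity are estimated via
    # from_file, and the results are saved as <prefix>-pitch.npy and
    # <prefix>-periodicity.npy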
iterator = zip(files, prefixes) 126 | iterator = tqdm.tqdm(iterator, desc='pitch estimation', dynamic_ncols=True) 127 | for file, prefix in iterator: 128 | from_file_to_file(file, prefix, gpu) 129 | 130 | 131 | ############################################################################### 132 | # Utilities 133 | ############################################################################### 134 | 135 | 136 | def align(source, target, source_alignment, target_alignment): 137 | """Align target pitch with source by inverting the alignment""" 138 | # Get relative rates for each frame 139 | rates = pypar.compare.per_frame_rate(source_alignment, 140 | target_alignment, 141 | clpcnet.SAMPLE_RATE, 142 | clpcnet.HOPSIZE) 143 | 144 | # Get interpolation indices 145 | indices = np.cumsum(np.array(rates)) 146 | 147 | # Interpolate 148 | return np.interp(indices, np.arange(len(target)), target) 149 | 150 | 151 | def threshold(pitch, periodicity): 152 | """Threshold pitch via periodicity contour""" 153 | return torchcrepe.threshold.Hysteresis()( 154 | torch.tensor(pitch)[None], 155 | torch.tensor(periodicity)[None]).squeeze().numpy() 156 | 157 | 158 | ############################################################################### 159 | # Entry point 160 | ############################################################################### 161 | 162 | 163 | def parse_args(): 164 | """Parse command-line arguments""" 165 | parser = argparse.ArgumentParser() 166 | parser.add_argument( 167 | '--dataset', 168 | default='vctk', 169 | help='The dataset to perform pitch tracking on') 170 | parser.add_argument( 171 | '--directory', 172 | type=Path, 173 | default=clpcnet.DATA_DIR, 174 | help='The data directory') 175 | parser.add_argument( 176 | '--cache', 177 | type=Path, 178 | default=clpcnet.CACHE_DIR, 179 | help='The cache directory') 180 | parser.add_argument( 181 | '--gpu', 182 | type=int, 183 | default=None, 184 | help='The gpu to use for pitch tracking') 185 | 186 | # Extend directories with dataset name 187 | args = parser.parse_args() 188 | args.directory = args.directory / args.dataset 189 | args.cache = args.cache / args.dataset 190 | 191 | return args 192 | 193 | 194 | if __name__ == '__main__': 195 | from_dataset_to_files(**vars(parse_args())) 196 | -------------------------------------------------------------------------------- /clpcnet/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /clpcnet/preprocess/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import clpcnet 5 | 6 | 7 | ############################################################################### 8 | # Entry point 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | '--dataset', 17 | default='vctk', 18 | help='The dataset to preprocess') 19 | parser.add_argument( 20 | '--directory', 21 | type=Path, 22 | default=clpcnet.DATA_DIR, 23 | help='The data directory') 24 | parser.add_argument( 25 | '--cache', 26 | type=Path, 27 | default=clpcnet.CACHE_DIR, 28 | help='The cache directory') 29 | 30 | # Extend directories with dataset name 31 | args = parser.parse_args() 32 | args.directory = args.directory / 
args.dataset 33 | args.cache = args.cache / args.dataset 34 | 35 | return args 36 | 37 | 38 | if __name__ == '__main__': 39 | clpcnet.preprocess.from_dataset_to_files(**vars(parse_args())) 40 | -------------------------------------------------------------------------------- /clpcnet/preprocess/augment.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing as mp 3 | import os 4 | import random 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import soundfile 9 | import tqdm 10 | 11 | import clpcnet 12 | 13 | 14 | ############################################################################### 15 | # Constants 16 | ############################################################################### 17 | 18 | 19 | ALLOWED_SCALES = [50, 67, 75, 80, 125, 133, 150, 200] 20 | DATASET = 'vctk' 21 | PASSES = 8 22 | 23 | 24 | ############################################################################### 25 | # Data augmentation 26 | ############################################################################### 27 | 28 | 29 | def dataset(dataset=DATASET, 30 | directory=clpcnet.DATA_DIR / DATASET, 31 | cache=clpcnet.CACHE_DIR / DATASET, 32 | allowed_scales=ALLOWED_SCALES, 33 | passes=PASSES, 34 | gpu=None): 35 | """Perform data augmentation for a given dataset""" 36 | # Compute the current histogram from pitch files in cache and determine 37 | # for each example which scales have been used 38 | counts, scales = count_cache(dataset, cache) 39 | 40 | # Get list of audio files 41 | files = clpcnet.data.files(dataset, directory, 'train') 42 | random.seed(0) 43 | random.shuffle(files) 44 | 45 | # Preprocessing workers 46 | feature_pool = mp.Pool(min(os.cpu_count() - 1, 2)) 47 | pitch_pool = mp.Pool(1) 48 | 49 | # Iterate over dataset 50 | for i in range(passes): 51 | iterator = tqdm.tqdm(files, 52 | dynamic_ncols=True, 53 | desc=f'augmentation pass {i}') 54 | for file in iterator: 55 | 56 | # Load pitch 57 | stem = clpcnet.data.file_to_stem(dataset, file) 58 | pitch = np.load(cache / f'{stem}-r100-pitch.npy') 59 | periodicity = np.load(cache / f'{stem}-r100-periodicity.npy') 60 | 61 | # Threshold pitch 62 | pitch = clpcnet.pitch.threshold(pitch, periodicity) 63 | 64 | # Select scale to use that maximizes entropy 65 | scale, counts = select_scale(pitch[~np.isnan(pitch)], 66 | counts, 67 | allowed_scales, 68 | scales[stem]) 69 | 70 | # No unused scale for this file 71 | if scale is None: 72 | continue 73 | 74 | # Load audio 75 | audio, sample_rate = soundfile.read(file) 76 | 77 | # Scale audio 78 | scaled = clpcnet.preprocess.resample(audio, 79 | (scale / 100.) * sample_rate, 80 | sample_rate) 81 | 82 | # Resample to lpcnet sample rate 83 | scaled = clpcnet.preprocess.resample(scaled, sample_rate) 84 | 85 | # Preprocess 86 | prefix = f'{cache / stem}-r{scale:03}' 87 | feature_pool.apply_async(clpcnet.preprocess.from_audio_to_file, 88 | (scaled, prefix)) 89 | pitch_pool.apply_async(clpcnet.pitch.from_audio_to_file, 90 | (scaled, prefix, gpu)) 91 | # clpcnet.pitch.from_audio_to_file(scaled, prefix, gpu) 92 | 93 | # Mark scale as used 94 | scales[stem].append(scale) 95 | 96 | # Close worker pools 97 | feature_pool.close() 98 | pitch_pool.close() 99 | 100 | # Wait for preprocessing to finish 101 | feature_pool.join() 102 | pitch_pool.join() 103 | 
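# The two resample calls above implement pitch-shifting by relabeling:
# treating audio as if recorded at (scale / 100) * sample_rate and resampling
# back multiplies pitch by scale / 100 and duration by 100 / scale. A
# standalone sketch of the same identity (the sine test signal is an
# illustrative assumption):
#
#     import numpy as np
#     import resampy
#     sr = 16000
#     tone = np.sin(2 * np.pi * 220. * np.arange(sr) / sr)  # 220 Hz, 1 second
#     shifted = resampy.resample(tone, 2. * sr, sr)         # scale = 200
#     # shifted is a ~440 Hz tone lasting ~0.5 seconds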
104 | 105 | ############################################################################### 106 | # Utilities 107 | ############################################################################### 108 | 109 | 110 | def count_cache(dataset, cache): 111 | """Compute pitch histogram and used scales of examples in cache""" 112 | counts = np.zeros(clpcnet.PITCH_BINS, dtype=int) 113 | scales = {} 114 | 115 | # Loop over pitch files 116 | for file in cache.glob('*-pitch.npy'): 117 | 118 | # Load pitch 119 | pitch = np.load(file) 120 | periodicity = np.load(str(file).replace('-pitch.npy', 121 | '-periodicity.npy')) 122 | 123 | # Add pitch to histogram 124 | counts += count_pitch(clpcnet.pitch.threshold(pitch, periodicity)) 125 | 126 | # Add scale to used set 127 | stem = file.stem[:-11]  # Strip the '-rXXX-pitch' suffix 128 | if stem not in scales: 129 | scales[stem] = [] 130 | scales[stem].append(int(file.stem[-9:-6]))  # The three scale digits 131 | 132 | return counts, scales 133 | 134 | 135 | def count_pitch(pitch): 136 | """Compute pitch histogram on pitch in Hz""" 137 | bins = clpcnet.convert.hz_to_bins(pitch[~np.isnan(pitch)]) 138 | return np.bincount(bins, minlength=clpcnet.PITCH_BINS) 139 | 140 | 141 | def entropy(counts): 142 | """Compute the entropy of the categorical distribution defined by counts""" 143 | # Compute categorical distribution parameters 144 | distribution = counts / counts.sum(keepdims=True) 145 | 146 | # Compute entropy contribution of each category 147 | contribution = distribution * np.log2(distribution) 148 | contribution[np.isnan(contribution)] = 0. 149 | 150 | return - (1. / np.log2(len(distribution))) * contribution.sum() 151 | 152 | 153 | def scale_pitch(pitch, scale): 154 | """Scale pitch by scale factor""" 155 | # Scale 156 | scale_min = clpcnet.FMIN / pitch.min() 157 | scale_max = clpcnet.FMAX / pitch.max() 158 | scale = scale_min if scale < scale_min else scale 159 | scale = scale_max if scale > scale_max else scale 160 | pitch = scale * pitch.copy() 161 | 162 | # Interpolate 163 | scaled = np.interp(np.arange(0, len(pitch), scale), 164 | np.arange(len(pitch)), 165 | pitch) 166 | 167 | return scaled, int(100 * scale) 168 | 169 | 170 | def select_scale(pitch, counts, allowed_scales, used_scales): 171 | """ 172 | Shift the pitch by each allowed scale and select the scale whose updated 173 | pitch histogram has maximal entropy. If a scale moves the pitch outside 174 | (50, 550), the closest in-range scale is used instead. Scales already used for this file are skipped. 175 | """ 176 | best_entropy, best_scale, best_counts = None, None, counts 177 | for scale in set(allowed_scales) - set(used_scales): 178 | 179 | # Scale pitch 180 | scaled, scale = scale_pitch(pitch, scale / 100.) 181 | 182 | # If scale was clipped, make sure we can still use it 183 | if scale in used_scales: 184 | continue 185 | 186 | # Get pitch histogram 187 | scale_counts = counts + count_pitch(scaled) 188 | 189 | # Measure entropy for this scale 190 | scale_entropy = entropy(scale_counts) 191 | 192 | # Select scale if it maximizes entropy; track the best histogram 193 | # separately so earlier candidates do not skew later comparisons 194 | if best_entropy is None or scale_entropy > best_entropy: 195 | best_entropy, best_scale = scale_entropy, scale 196 | best_counts = scale_counts 197 | 198 | return best_scale, best_counts 199 | 
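# A worked micro-example of the entropy criterion in select_scale (array
# values are illustrative): starting from counts = [2, 0], a candidate that
# adds [0, 2] yields the distribution [.5, .5] (normalized entropy 1.0),
# while one that adds [2, 0] yields [1., 0.] (entropy 0.0), so the candidate
# filling the underpopulated pitch bin is selected:
#
#     import numpy as np
#     entropy(np.array([2, 0]) + np.array([0, 2]))  # 1.0
#     entropy(np.array([2, 0]) + np.array([2, 0]))  # 0.0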
200 | 201 | ############################################################################### 202 | # Entry point 203 | ############################################################################### 204 | 205 | 206 | def parse_args(): 207 | """Parse command-line arguments""" 208 | parser = argparse.ArgumentParser() 209 | 210 | parser.add_argument( 211 | '--dataset', 212 | default=DATASET, 213 | help='The name of the dataset') 214 | parser.add_argument( 215 | '--directory', 216 | default=clpcnet.DATA_DIR, 217 | type=Path, 218 | help='The data directory') 219 | parser.add_argument( 220 | '--cache', 221 | default=clpcnet.CACHE_DIR, 222 | type=Path, 223 | help='The cache directory') 224 | parser.add_argument( 225 | '--allowed_scales', 226 | nargs='+', 227 | type=float, 228 | default=ALLOWED_SCALES, 229 | help='The allowable scale values for resampling') 230 | parser.add_argument( 231 | '--passes', 232 | type=int, 233 | default=PASSES, 234 | help='The number of augmentation passes to make over the dataset') 235 | parser.add_argument( 236 | '--gpu', 237 | type=int, 238 | default=None, 239 | help='The index of the gpu to use') 240 | 241 | # Extend directories with dataset name 242 | args = parser.parse_args() 243 | args.directory = args.directory / args.dataset 244 | args.cache = args.cache / args.dataset 245 | 246 | return args 247 | 248 | 249 | if __name__ == '__main__': 250 | dataset(**vars(parse_args())) 251 | -------------------------------------------------------------------------------- /clpcnet/preprocess/core.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import subprocess 3 | import tempfile 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import resampy 8 | import scipy.signal 9 | 10 | import clpcnet 11 | 12 | 13 | __all__ = ['from_audio', 14 | 'from_audio_to_file', 15 | 'from_dataset_to_files', 16 | 'from_file_to_file', 17 | 'from_files_to_files', 18 | 'clip', 19 | 'highpass', 20 | 'pad', 21 | 'preemphasis', 22 | 'resample'] 23 | 24 | 25 | ############################################################################### 26 | # Preprocessing transforms 27 | ############################################################################### 28 | 29 | 30 | def clip(audio, threshold=.99): 31 | """Scale audio so its peak does not exceed threshold""" 32 | maximum = np.abs(audio).max() 33 | return audio * threshold / maximum if maximum > threshold else audio 34 | 35 | 36 | def highpass(audio, sample_rate=clpcnet.SAMPLE_RATE, cutoff=65., order=5): 37 | """Highpass audio""" 38 | # Get filter coefficients 39 | b, a = scipy.signal.butter( 40 | order, cutoff / (sample_rate / 2), btype='high') 41 | 42 | # Filter 43 | return scipy.signal.filtfilt(b, a, audio) 44 | 45 | 46 | def pad(audio): 47 | """Pad the audio to be a multiple of the block size""" 48 | padding = 2 * clpcnet.BLOCK_SIZE - (audio.size % clpcnet.BLOCK_SIZE) 49 | return np.pad(audio, (clpcnet.HOPSIZE // 2, padding)) 50 | 51 | 
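# The function below is the one-tap preemphasis filter
# y[n] = x[n] - coefficient * x[n - 1]; LPCNet-style vocoders undo it at
# synthesis time with the matching one-pole deemphasis filter. An equivalent
# vectorized sketch using scipy.signal (imported above):
#
#     emphasized = scipy.signal.lfilter([1., -coefficient], [1.], audio)
#     restored = scipy.signal.lfilter([1.], [1., -coefficient], emphasized)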
52 | def preemphasis(audio, coefficient=clpcnet.PREEMPHASIS_COEF): 53 | """Apply preemphasis filter""" 54 | result = np.zeros_like(audio) 55 | memory = 0. 56 | for i in range(len(audio)): 57 | result[i] = audio[i] + memory 58 | memory = -coefficient * audio[i] 59 | return result 60 | 61 | 62 | def resample(audio, sample_rate, target_rate=clpcnet.SAMPLE_RATE): 63 | """Resample audio""" 64 | if sample_rate != target_rate: 65 | return resampy.resample(audio, sample_rate, target_rate) 66 | return audio 67 | 68 | 69 | ############################################################################### 70 | # Preprocess data 71 | ############################################################################### 72 | 73 | 74 | def from_dataset_to_files(dataset, directory, cache): 75 | """Preprocess dataset""" 76 | # Get filenames 77 | files = clpcnet.data.files(dataset, directory, 'train') 78 | 79 | # Get prefixes 80 | prefixes = [ 81 | cache / f'{clpcnet.data.file_to_stem(dataset, file)}-r100' 82 | for file in files] 83 | 84 | # Create cache 85 | cache.mkdir(exist_ok=True, parents=True) 86 | 87 | # Preprocess audio files 88 | clpcnet.preprocess.from_files_to_files(files, prefixes) 89 | 90 | 91 | def from_audio(audio): 92 | """Preprocess audio""" 93 | # Preprocess to a file in a temporary directory 94 | with tempfile.TemporaryDirectory() as directory: 95 | prefix = Path(directory) / 'features' 96 | 97 | # Preprocess 98 | from_audio_to_file(audio, prefix) 99 | 100 | # Load features 101 | return clpcnet.load.features(f'{prefix}-frames.f32') 102 | 103 | 104 | def from_audio_to_file(audio, prefix): 105 | """Preprocess audio and save to disk""" 106 | # Get number of frames before padding 107 | frames = 1 + int(len(audio) // clpcnet.HOPSIZE) 108 | 109 | # Transform 110 | audio = clpcnet.loudness.limit(preemphasis(highpass(pad(audio)))) 111 | 112 | # Convert to 16-bit int 113 | audio = (audio * clpcnet.MAX_SAMPLE_VALUE).astype(np.int16) 114 | 115 | # Write audio to temporary storage and preprocess 116 | with tempfile.TemporaryDirectory() as directory: 117 | file = Path(directory) / 'audio.s16' 118 | 119 | # Save to disk 120 | audio.tofile(file) 121 | 122 | # Preprocess from file 123 | from_binary_file_to_file(file, prefix, frames) 124 | 125 | 126 | def from_file_to_file(file, prefix): 127 | """Load, preprocess, and save to disk""" 128 | from_audio_to_file(clpcnet.load.audio(file), prefix) 129 | 130 | 131 | def from_files_to_files(files, prefixes): 132 | """Load, preprocess, and save many files""" 133 | with mp.Pool() as pool: 134 | pool.starmap(from_file_to_file, zip(files, prefixes)) 135 | 136 | 137 | ############################################################################### 138 | # Utilities 139 | ############################################################################### 140 | 141 | 142 | def from_binary_file_to_file(file, prefix, frames): 143 | """Preprocess from binary s16 file""" 144 | # Write intermediate output to temporary file 145 | with tempfile.TemporaryDirectory() as directory: 146 | frame_file = f'{directory}-frames.f32' 147 | sample_file = f'{directory}-samples.u8' 148 | 149 | # Preprocess in C 150 | args = [str(Path(__file__).parent.parent.parent / 'bin' / 'preprocess'), 151 | str(file), 152 | frame_file, 153 | sample_file] 154 | subprocess.Popen(args).wait() 155 | 156 | # Truncate to original number of frames 157 | features = np.fromfile(frame_file, dtype=np.float32) 158 | features = features[:frames * clpcnet.TOTAL_FEATURE_SIZE] 159 | features.tofile(f'{prefix}-frames.f32') 160 | samples 
= np.fromfile(sample_file, dtype=np.uint8) 161 | samples = samples[:4 * frames * clpcnet.HOPSIZE] 162 | samples.tofile(f'{prefix}-samples.u8') 163 | -------------------------------------------------------------------------------- /clpcnet/session.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import sys 4 | 5 | # Import keras without printing backend 6 | stderr = sys.stderr 7 | sys.stderr = open(os.devnull, 'w') 8 | import keras 9 | sys.stderr = stderr 10 | 11 | import tensorflow as tf 12 | 13 | import clpcnet 14 | 15 | 16 | ############################################################################### 17 | # Tensorflow session management 18 | ############################################################################### 19 | 20 | 21 | class Session: 22 | 23 | def __init__(self, file, gpu=None): 24 | self.file = file 25 | self.gpu = gpu 26 | 27 | # Tensorflow setup 28 | if gpu is None: 29 | config = tf.compat.v1.ConfigProto(device_count={'GPU': 0}) 30 | else: 31 | gpu_options = tf.compat.v1.GPUOptions(allow_growth=True) 32 | config = tf.compat.v1.ConfigProto(gpu_options=gpu_options) 33 | 34 | self.session = tf.compat.v1.Session(config=config) 35 | self.graph = tf.compat.v1.get_default_graph() 36 | 37 | # Keras setup 38 | keras.backend.set_session(self.session) 39 | 40 | # Device management 41 | device = 'CPU' if gpu is None else 'GPU' 42 | number = '0' if gpu is None else str(gpu) 43 | self.device = f'/{device}:{number}' 44 | 45 | # Build LPCNet 46 | model, encoder, decoder = clpcnet.model(use_gpu=gpu is not None) 47 | optimizer = keras.optimizers.Adam(clpcnet.LEARNING_RATE, 48 | amsgrad=True, 49 | decay=clpcnet.WEIGHT_DECAY) 50 | model.compile(optimizer=optimizer, 51 | loss='sparse_categorical_crossentropy', 52 | metrics=['sparse_categorical_accuracy']) 53 | 54 | # Load pretrained weights 55 | model.load_weights(file) 56 | 57 | # Bind model components for inference 58 | self.encoder = encoder 59 | self.decoder = decoder 60 | 61 | @contextlib.contextmanager 62 | def context(self): 63 | """Context manager for tensorflow setup""" 64 | with tf.device(self.device): 65 | with self.graph.as_default(): 66 | yield 67 | -------------------------------------------------------------------------------- /clpcnet/world.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pypar 3 | import pyworld 4 | import scipy 5 | import soundfile 6 | import torch 7 | 8 | import clpcnet 9 | 10 | 11 | ############################################################################### 12 | # WORLD constants 13 | ############################################################################### 14 | 15 | 16 | ALLOWED_RANGE = .8 17 | 18 | 19 | ############################################################################### 20 | # Pitch-shifting and time-stretching with WORLD 21 | ############################################################################### 22 | 23 | 24 | def from_audio(audio, 25 | source_alignment=None, 26 | target_alignment=None, 27 | target_pitch=None, 28 | constant_stretch=None, 29 | constant_shift=None): 30 | """Pitch-shifting and time-stretching with WORLD""" 31 | # World parameterization 32 | audio = audio.squeeze().numpy() 33 | pitch, spectrogram, aperiodicity = analyze(audio) 34 | 35 | # Variable-ratio pitch-shifting 36 | if target_pitch is not None: 37 | target_pitch = target_pitch.squeeze().numpy() 38 | 39 | if (len(target_pitch) != len(pitch) and 40 | 
source_alignment is None and 41 | target_alignment is None): 42 | raise ValueError( 43 | f'Source pitch of length {len(pitch)} incompatible ' + 44 | f'with target pitch of length {len(target_pitch)}.') 45 | pitch = target_pitch.astype(np.float64) 46 | 47 | # Constant-ratio pitch-shifting 48 | if constant_shift is not None: 49 | pitch *= constant_shift 50 | 51 | # Variable-ratio time-stretching 52 | if source_alignment is not None and target_alignment is not None: 53 | 54 | # Align spectrogram and aperiodicity 55 | spectrogram = clpcnet.pitch.align(None, spectrogram, target_alignment, source_alignment) 56 | aperiodicity = clpcnet.pitch.align(None, aperiodicity, target_alignment, source_alignment) 57 | 58 | # Constant-ratio time-stretching 59 | if constant_stretch is not None: 60 | 61 | # Get new duration 62 | duration = len(audio) / clpcnet.SAMPLE_RATE / constant_stretch 63 | 64 | # Stretch features 65 | pitch, spectrogram, aperiodicity = linear_time_stretch( 66 | pitch, spectrogram, aperiodicity, duration) 67 | 68 | # Synthesize using modified parameters 69 | vocoded = pyworld.synthesize(pitch, 70 | spectrogram, 71 | aperiodicity, 72 | clpcnet.SAMPLE_RATE, 73 | clpcnet.HOPSIZE / clpcnet.SAMPLE_RATE * 1000.) 74 | 75 | # Return synthesized audio 76 | return vocoded 77 | 78 | 79 | def from_file_to_file(input_file, 80 | output_file, 81 | source_alignment_file=None, 82 | target_alignment_file=None, 83 | target_pitch_file=None, 84 | constant_stretch=None, 85 | constant_shift=None): 86 | """Perform pitch-shifting and time-stretching with WORLD on files""" 87 | source = torch.tensor(clpcnet.load.audio(input_file))[None] 88 | 89 | # Load source alignment 90 | if source_alignment_file is not None: 91 | source_alignment = pypar.Alignment(source_alignment_file) 92 | else: 93 | source_alignment = None 94 | 95 | # Load target alignment 96 | if target_alignment_file is not None: 97 | target_alignment = pypar.Alignment(target_alignment_file) 98 | else: 99 | target_alignment = None 100 | 101 | # Load target pitch 102 | if target_pitch_file is not None: 103 | target_pitch = torch.tensor(np.load(target_pitch_file))[None] 104 | else: 105 | target_pitch = None 106 | 107 | to_file(source, 108 | output_file, 109 | source_alignment, 110 | target_alignment, 111 | target_pitch, 112 | constant_stretch, 113 | constant_shift) 114 | 115 | 116 | def to_file(source, 117 | output_file, 118 | source_alignment=None, 119 | target_alignment=None, 120 | target_pitch=None, 121 | constant_stretch=None, 122 | constant_shift=None): 123 | """Perform pitch-shifting and time-stretching with WORLD and save""" 124 | vocoded = from_audio(source, 125 | source_alignment, 126 | target_alignment, 127 | target_pitch, 128 | constant_stretch, 129 | constant_shift) 130 | soundfile.write(output_file, vocoded, clpcnet.SAMPLE_RATE) 131 | 132 | 133 | ############################################################################### 134 | # Vocoding utilities 135 | ############################################################################### 136 | 137 | 138 | def analyze(audio): 139 | """Convert an audio signal to WORLD parameter representation 140 | Arguments 141 | audio : np.array(shape=(samples,)) 142 | The audio being analyzed 143 | Returns 144 | pitch : np.array(shape=(frames,)) 145 | The pitch contour 146 | spectrogram : np.array(shape=(frames, channels)) 147 | The audio spectrogram 148 | aperiodicity : np.array(shape=(frames, channels)) 149 | The band aperiodicity envelope 150 | """ 151 | # Cast to double 152 | audio = audio.astype(np.float64) 153 | 154 | # Hopsize in milliseconds 155 | frame_period = clpcnet.HOPSIZE / clpcnet.SAMPLE_RATE * 1000. 156 | 157 | # Pitch 158 | pitch, time = pyworld.dio(audio, 159 | clpcnet.SAMPLE_RATE, 160 | frame_period=frame_period, 161 | f0_floor=clpcnet.FMIN, 162 | f0_ceil=clpcnet.FMAX, 163 | allowed_range=ALLOWED_RANGE) 164 | pitch = pyworld.stonemask(audio, pitch, time, clpcnet.SAMPLE_RATE) 165 | 166 | # Spectrogram 167 | spectrogram = pyworld.cheaptrick(audio, pitch, time, clpcnet.SAMPLE_RATE) 168 | 169 | # Aperiodicity 170 | aperiodicity = pyworld.d4c(audio, pitch, time, clpcnet.SAMPLE_RATE) 171 | 172 | return pitch, spectrogram, aperiodicity 173 | 
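# A minimal analysis/synthesis roundtrip sketch using analyze above (the
# noise input is an illustrative assumption; any mono float array at
# clpcnet.SAMPLE_RATE works):
#
#     audio = np.random.uniform(-1., 1., clpcnet.SAMPLE_RATE)
#     pitch, spectrogram, aperiodicity = analyze(audio)
#     resynthesized = pyworld.synthesize(
#         pitch, spectrogram, aperiodicity, clpcnet.SAMPLE_RATE,
#         clpcnet.HOPSIZE / clpcnet.SAMPLE_RATE * 1000.)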
174 | 175 | def linear_time_stretch(prev_pitch, 176 | prev_spectrogram, 177 | prev_aperiodicity, 178 | duration): 179 | """Apply time stretch in WORLD parameter space 180 | Arguments 181 | prev_pitch : np.array(shape=(frames,)) 182 | The pitch to be stretched 183 | prev_spectrogram : np.array(shape=(frames, frequencies)) 184 | The spectrogram to be stretched 185 | prev_aperiodicity : np.array(shape=(frames, frequencies)) 186 | The aperiodicity to be stretched 187 | duration : float 188 | The new duration in seconds 189 | """ 190 | # Number of frames before and after 191 | prev_frames = len(prev_pitch) 192 | next_frames = clpcnet.convert.seconds_to_frames(duration) 193 | 194 | # Time-aligned grid before and after 195 | prev_grid = np.linspace(0, prev_frames - 1, prev_frames) 196 | next_grid = np.linspace(0, prev_frames - 1, next_frames) 197 | 198 | # Apply time stretch to pitch 199 | pitch = linear_time_stretch_pitch( 200 | prev_pitch, prev_grid, next_grid, next_frames) 201 | 202 | # Allocate spectrogram and aperiodicity buffers 203 | frequencies = prev_spectrogram.shape[1] 204 | spectrogram = np.zeros((next_frames, frequencies)) 205 | aperiodicity = np.zeros((next_frames, frequencies)) 206 | 207 | # Apply time stretch to all channels of spectrogram and aperiodicity 208 | for i in range(frequencies): 209 | spectrogram[:, i] = np.interp( 210 | next_grid, prev_grid, prev_spectrogram[:, i]) 211 | aperiodicity[:, i] = np.interp( 212 | next_grid, prev_grid, prev_aperiodicity[:, i]) 213 | 214 | return pitch, spectrogram, aperiodicity 215 | 216 | 217 | def linear_time_stretch_pitch(pitch, prev_grid, next_grid, next_frames): 218 | """Perform time-stretching on pitch features""" 219 | if (pitch == 0.).all(): 220 | return np.zeros(next_frames) 221 | 222 | # Get unvoiced tokens 223 | unvoiced = pitch == 0. 224 | 225 | # Linearly interpolate unvoiced regions 226 | pitch[unvoiced] = np.interp( 227 | np.where(unvoiced)[0], np.where(~unvoiced)[0], pitch[~unvoiced]) 228 | 229 | # Apply time stretch to pitch 230 | pitch = np.interp(next_grid, prev_grid, pitch) 231 | 232 | # Apply time stretch to unvoiced sequence 233 | unvoiced = np.interp(next_grid, prev_grid, unvoiced) 234 | 235 | # Reapply unvoiced tokens 236 | pitch[unvoiced > .5] = 0. 
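    # WORLD marks unvoiced frames with zero pitch; interpolating them away
    # above keeps zeros from being averaged into voiced values during the
    # stretch, and a stretched frame is marked unvoiced here when more than
    # half of its fractional source frames were unvoiced. For example,
    # stretching [100., 0., 100.] to five frames yields
    # [100., 100., 0., 100., 100.] rather than a contour contaminated by zero.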
237 | 238 | return pitch 239 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/data/.gitkeep -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py==2.10.0 2 | keras==2.3.1 3 | librosa 4 | matplotlib 5 | numpy 6 | protobuf==3.20.1 7 | pyfoal 8 | pypar 9 | pyworld 10 | scipy 11 | soundfile 12 | tensorflow-gpu==1.15 13 | torch 14 | torchcrepe 15 | tqdm 16 | -------------------------------------------------------------------------------- /runs/cache/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/runs/cache/.gitkeep -------------------------------------------------------------------------------- /runs/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/runs/checkpoints/.gitkeep -------------------------------------------------------------------------------- /runs/eval/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/runs/eval/.gitkeep -------------------------------------------------------------------------------- /runs/log/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maxrmorrison/clpcnet/5f5809a7812c9623b2ac09c21744746b56c5029a/runs/log/.gitkeep -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from pkg_resources import parse_requirements 3 | from setuptools import setup 4 | 5 | 6 | with open('README.md') as file: 7 | long_description = file.read() 8 | 9 | 10 | with open(Path(__file__).parent / 'requirements.txt') as file: 11 | requirements = [str(req) for req in parse_requirements(file)] 12 | 13 | 14 | setup( 15 | name='clpcnet', 16 | version='0.0.1', 17 | description='Neural pitch-shifting and time-stretching with controllable lpcnet', 18 | author='Max Morrison', 19 | author_email='maxrmorrison@gmail.com', 20 | url='https://github.com/maxrmorrison/clpcnet', 21 | packages=['clpcnet'], 22 | package_data={'clpcnet': ['assets/*']}, 23 | long_description=long_description, 24 | long_description_content_type='text/markdown', 25 | keywords='speech vocoder prosody pitch-shifting time-stretching lpcnet', 26 | install_requires=requirements) 27 | -------------------------------------------------------------------------------- /src/_kiss_fft_guts.h: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2003-2004, Mark Borgerding 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 
10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 18 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 | POSSIBILITY OF SUCH DAMAGE.*/ 25 | 26 | #ifndef KISS_FFT_GUTS_H 27 | #define KISS_FFT_GUTS_H 28 | 29 | #define MIN(a,b) ((a)<(b) ? (a):(b)) 30 | #define MAX(a,b) ((a)>(b) ? (a):(b)) 31 | 32 | /* kiss_fft.h 33 | defines kiss_fft_scalar as either short or a float type 34 | and defines 35 | typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */ 36 | #include "kiss_fft.h" 37 | 38 | /* 39 | Explanation of macros dealing with complex math: 40 | 41 | C_MUL(m,a,b) : m = a*b 42 | C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise 43 | C_SUB( res, a,b) : res = a - b 44 | C_SUBFROM( res , a) : res -= a 45 | C_ADDTO( res , a) : res += a 46 | * */ 47 | #ifdef FIXED_POINT 48 | #include "arch.h" 49 | 50 | 51 | #define SAMP_MAX 2147483647 52 | #define TWID_MAX 32767 53 | #define TRIG_UPSCALE 1 54 | 55 | #define SAMP_MIN -SAMP_MAX 56 | 57 | 58 | # define S_MUL(a,b) MULT16_32_Q15(b, a) 59 | 60 | # define C_MUL(m,a,b) \ 61 | do{ (m).r = SUB32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \ 62 | (m).i = ADD32_ovflw(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0) 63 | 64 | # define C_MULC(m,a,b) \ 65 | do{ (m).r = ADD32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \ 66 | (m).i = SUB32_ovflw(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0) 67 | 68 | # define C_MULBYSCALAR( c, s ) \ 69 | do{ (c).r = S_MUL( (c).r , s ) ;\ 70 | (c).i = S_MUL( (c).i , s ) ; }while(0) 71 | 72 | # define DIVSCALAR(x,k) \ 73 | (x) = S_MUL( x, (TWID_MAX-((k)>>1))/(k)+1 ) 74 | 75 | # define C_FIXDIV(c,div) \ 76 | do { DIVSCALAR( (c).r , div); \ 77 | DIVSCALAR( (c).i , div); }while (0) 78 | 79 | #define C_ADD( res, a,b)\ 80 | do {(res).r=ADD32_ovflw((a).r,(b).r); (res).i=ADD32_ovflw((a).i,(b).i); \ 81 | }while(0) 82 | #define C_SUB( res, a,b)\ 83 | do {(res).r=SUB32_ovflw((a).r,(b).r); (res).i=SUB32_ovflw((a).i,(b).i); \ 84 | }while(0) 85 | #define C_ADDTO( res , a)\ 86 | do {(res).r = ADD32_ovflw((res).r, (a).r); (res).i = ADD32_ovflw((res).i,(a).i);\ 87 | }while(0) 88 | 89 | #define C_SUBFROM( res , a)\ 90 | do {(res).r = ADD32_ovflw((res).r,(a).r); (res).i = SUB32_ovflw((res).i,(a).i); \ 91 | }while(0) 92 | 93 | #if defined(OPUS_ARM_INLINE_ASM) 94 | #include "arm/kiss_fft_armv4.h" 95 | #endif 96 | 97 | #if defined(OPUS_ARM_INLINE_EDSP) 98 | #include "arm/kiss_fft_armv5e.h" 99 | #endif 100 | #if defined(MIPSr1_ASM) 101 | #include "mips/kiss_fft_mipsr1.h" 102 | #endif 103 | 104 | #else /* not FIXED_POINT*/ 105 | 106 | # define S_MUL(a,b) ( (a)*(b) ) 107 | #define C_MUL(m,a,b) \ 108 | do{ (m).r = 
(a).r*(b).r - (a).i*(b).i;\ 109 | (m).i = (a).r*(b).i + (a).i*(b).r; }while(0) 110 | #define C_MULC(m,a,b) \ 111 | do{ (m).r = (a).r*(b).r + (a).i*(b).i;\ 112 | (m).i = (a).i*(b).r - (a).r*(b).i; }while(0) 113 | 114 | #define C_MUL4(m,a,b) C_MUL(m,a,b) 115 | 116 | # define C_FIXDIV(c,div) /* NOOP */ 117 | # define C_MULBYSCALAR( c, s ) \ 118 | do{ (c).r *= (s);\ 119 | (c).i *= (s); }while(0) 120 | #endif 121 | 122 | #ifndef CHECK_OVERFLOW_OP 123 | # define CHECK_OVERFLOW_OP(a,op,b) /* noop */ 124 | #endif 125 | 126 | #ifndef C_ADD 127 | #define C_ADD( res, a,b)\ 128 | do { \ 129 | CHECK_OVERFLOW_OP((a).r,+,(b).r)\ 130 | CHECK_OVERFLOW_OP((a).i,+,(b).i)\ 131 | (res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \ 132 | }while(0) 133 | #define C_SUB( res, a,b)\ 134 | do { \ 135 | CHECK_OVERFLOW_OP((a).r,-,(b).r)\ 136 | CHECK_OVERFLOW_OP((a).i,-,(b).i)\ 137 | (res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \ 138 | }while(0) 139 | #define C_ADDTO( res , a)\ 140 | do { \ 141 | CHECK_OVERFLOW_OP((res).r,+,(a).r)\ 142 | CHECK_OVERFLOW_OP((res).i,+,(a).i)\ 143 | (res).r += (a).r; (res).i += (a).i;\ 144 | }while(0) 145 | 146 | #define C_SUBFROM( res , a)\ 147 | do {\ 148 | CHECK_OVERFLOW_OP((res).r,-,(a).r)\ 149 | CHECK_OVERFLOW_OP((res).i,-,(a).i)\ 150 | (res).r -= (a).r; (res).i -= (a).i; \ 151 | }while(0) 152 | #endif /* C_ADD defined */ 153 | 154 | #ifdef FIXED_POINT 155 | /*# define KISS_FFT_COS(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * cos (phase)))) 156 | # define KISS_FFT_SIN(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * sin (phase))))*/ 157 | # define KISS_FFT_COS(phase) floor(.5+TWID_MAX*cos (phase)) 158 | # define KISS_FFT_SIN(phase) floor(.5+TWID_MAX*sin (phase)) 159 | # define HALF_OF(x) ((x)>>1) 160 | #elif defined(USE_SIMD) 161 | # define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) ) 162 | # define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) ) 163 | # define HALF_OF(x) ((x)*_mm_set1_ps(.5f)) 164 | #else 165 | # define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase) 166 | # define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase) 167 | # define HALF_OF(x) ((x)*.5f) 168 | #endif 169 | 170 | #define kf_cexp(x,phase) \ 171 | do{ \ 172 | (x)->r = KISS_FFT_COS(phase);\ 173 | (x)->i = KISS_FFT_SIN(phase);\ 174 | }while(0) 175 | 176 | #define kf_cexp2(x,phase) \ 177 | do{ \ 178 | (x)->r = TRIG_UPSCALE*celt_cos_norm((phase));\ 179 | (x)->i = TRIG_UPSCALE*celt_cos_norm((phase)-32768);\ 180 | }while(0) 181 | 182 | #endif /* KISS_FFT_GUTS_H */ 183 | -------------------------------------------------------------------------------- /src/arch.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2003-2008 Jean-Marc Valin 2 | Copyright (c) 2007-2008 CSIRO 3 | Copyright (c) 2007-2009 Xiph.Org Foundation 4 | Written by Jean-Marc Valin */ 5 | /** 6 | @file arch.h 7 | @brief Various architecture definitions for CELT 8 | */ 9 | /* 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions 12 | are met: 13 | 14 | - Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | 17 | - Redistributions in binary form must reproduce the above copyright 18 | notice, this list of conditions and the following disclaimer in the 19 | documentation and/or other materials provided with the distribution. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 25 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | 34 | #ifndef ARCH_H 35 | #define ARCH_H 36 | 37 | #include "opus_types.h" 38 | #include "common.h" 39 | 40 | # if !defined(__GNUC_PREREQ) 41 | # if defined(__GNUC__)&&defined(__GNUC_MINOR__) 42 | # define __GNUC_PREREQ(_maj,_min) \ 43 | ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min)) 44 | # else 45 | # define __GNUC_PREREQ(_maj,_min) 0 46 | # endif 47 | # endif 48 | 49 | #define CELT_SIG_SCALE 32768.f 50 | 51 | #define celt_fatal(str) _celt_fatal(str, __FILE__, __LINE__); 52 | #ifdef ENABLE_ASSERTIONS 53 | #include <stdio.h> 54 | #include <stdlib.h> 55 | #ifdef __GNUC__ 56 | __attribute__((noreturn)) 57 | #endif 58 | static inline void _celt_fatal(const char *str, const char *file, int line) 59 | { 60 | fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str); 61 | abort(); 62 | } 63 | #define celt_assert(cond) {if (!(cond)) {celt_fatal("assertion failed: " #cond);}} 64 | #define celt_assert2(cond, message) {if (!(cond)) {celt_fatal("assertion failed: " #cond "\n" message);}} 65 | #else 66 | #define celt_assert(cond) 67 | #define celt_assert2(cond, message) 68 | #endif 69 | 70 | #define IMUL32(a,b) ((a)*(b)) 71 | 72 | #define MIN16(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 16-bit value. */ 73 | #define MAX16(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 16-bit value. */ 74 | #define MIN32(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 32-bit value. */ 75 | #define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */ 76 | #define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */ 77 | #define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */ 78 | #define UADD32(a,b) ((a)+(b)) 79 | #define USUB32(a,b) ((a)-(b)) 80 | 81 | /* Set this if opus_int64 is a native type of the CPU. */ 82 | /* Assume that all LP64 architectures have fast 64-bit types; also x86_64 83 | (which can be ILP32 for x32) and Win64 (which is LLP64). */ 84 | #if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64) 85 | #define OPUS_FAST_INT64 1 86 | #else 87 | #define OPUS_FAST_INT64 0 88 | #endif 89 | 90 | #define PRINT_MIPS(file) 91 | 92 | #ifdef FIXED_POINT 93 | 94 | typedef opus_int16 opus_val16; 95 | typedef opus_int32 opus_val32; 96 | typedef opus_int64 opus_val64; 97 | 98 | typedef opus_val32 celt_sig; 99 | typedef opus_val16 celt_norm; 100 | typedef opus_val32 celt_ener; 101 | 102 | #define Q15ONE 32767 103 | 104 | #define SIG_SHIFT 12 105 | /* Safe saturation value for 32-bit signals. 
Should be less than 106 | 2^31*(1-0.85) to avoid blowing up on DC at deemphasis.*/ 107 | #define SIG_SAT (300000000) 108 | 109 | #define NORM_SCALING 16384 110 | 111 | #define DB_SHIFT 10 112 | 113 | #define EPSILON 1 114 | #define VERY_SMALL 0 115 | #define VERY_LARGE16 ((opus_val16)32767) 116 | #define Q15_ONE ((opus_val16)32767) 117 | 118 | #define SCALEIN(a) (a) 119 | #define SCALEOUT(a) (a) 120 | 121 | #define ABS16(x) ((x) < 0 ? (-(x)) : (x)) 122 | #define ABS32(x) ((x) < 0 ? (-(x)) : (x)) 123 | 124 | static inline opus_int16 SAT16(opus_int32 x) { 125 | return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x; 126 | } 127 | 128 | #ifdef FIXED_DEBUG 129 | #include "fixed_debug.h" 130 | #else 131 | 132 | #include "fixed_generic.h" 133 | 134 | #ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR 135 | #include "arm/fixed_arm64.h" 136 | #elif OPUS_ARM_INLINE_EDSP 137 | #include "arm/fixed_armv5e.h" 138 | #elif defined (OPUS_ARM_INLINE_ASM) 139 | #include "arm/fixed_armv4.h" 140 | #elif defined (BFIN_ASM) 141 | #include "fixed_bfin.h" 142 | #elif defined (TI_C5X_ASM) 143 | #include "fixed_c5x.h" 144 | #elif defined (TI_C6X_ASM) 145 | #include "fixed_c6x.h" 146 | #endif 147 | 148 | #endif 149 | 150 | #else /* FIXED_POINT */ 151 | 152 | typedef float opus_val16; 153 | typedef float opus_val32; 154 | typedef float opus_val64; 155 | 156 | typedef float celt_sig; 157 | typedef float celt_norm; 158 | typedef float celt_ener; 159 | 160 | #define Q15ONE 1.0f 161 | 162 | #define NORM_SCALING 1.f 163 | 164 | #define EPSILON 1e-15f 165 | #define VERY_SMALL 1e-30f 166 | #define VERY_LARGE16 1e15f 167 | #define Q15_ONE ((opus_val16)1.f) 168 | 169 | /* This appears to be the same speed as C99's fabsf() but it's more portable. */ 170 | #define ABS16(x) ((float)fabs(x)) 171 | #define ABS32(x) ((float)fabs(x)) 172 | 173 | #define QCONST16(x,bits) (x) 174 | #define QCONST32(x,bits) (x) 175 | 176 | #define NEG16(x) (-(x)) 177 | #define NEG32(x) (-(x)) 178 | #define NEG32_ovflw(x) (-(x)) 179 | #define EXTRACT16(x) (x) 180 | #define EXTEND32(x) (x) 181 | #define SHR16(a,shift) (a) 182 | #define SHL16(a,shift) (a) 183 | #define SHR32(a,shift) (a) 184 | #define SHL32(a,shift) (a) 185 | #define PSHR32(a,shift) (a) 186 | #define VSHR32(a,shift) (a) 187 | 188 | #define PSHR(a,shift) (a) 189 | #define SHR(a,shift) (a) 190 | #define SHL(a,shift) (a) 191 | #define SATURATE(x,a) (x) 192 | #define SATURATE16(x) (x) 193 | 194 | #define ROUND16(a,shift) (a) 195 | #define SROUND16(a,shift) (a) 196 | #define HALF16(x) (.5f*(x)) 197 | #define HALF32(x) (.5f*(x)) 198 | 199 | #define ADD16(a,b) ((a)+(b)) 200 | #define SUB16(a,b) ((a)-(b)) 201 | #define ADD32(a,b) ((a)+(b)) 202 | #define SUB32(a,b) ((a)-(b)) 203 | #define ADD32_ovflw(a,b) ((a)+(b)) 204 | #define SUB32_ovflw(a,b) ((a)-(b)) 205 | #define MULT16_16_16(a,b) ((a)*(b)) 206 | #define MULT16_16(a,b) ((opus_val32)(a)*(opus_val32)(b)) 207 | #define MAC16_16(c,a,b) ((c)+(opus_val32)(a)*(opus_val32)(b)) 208 | 209 | #define MULT16_32_Q15(a,b) ((a)*(b)) 210 | #define MULT16_32_Q16(a,b) ((a)*(b)) 211 | 212 | #define MULT32_32_Q31(a,b) ((a)*(b)) 213 | 214 | #define MAC16_32_Q15(c,a,b) ((c)+(a)*(b)) 215 | #define MAC16_32_Q16(c,a,b) ((c)+(a)*(b)) 216 | 217 | #define MULT16_16_Q11_32(a,b) ((a)*(b)) 218 | #define MULT16_16_Q11(a,b) ((a)*(b)) 219 | #define MULT16_16_Q13(a,b) ((a)*(b)) 220 | #define MULT16_16_Q14(a,b) ((a)*(b)) 221 | #define MULT16_16_Q15(a,b) ((a)*(b)) 222 | #define MULT16_16_P15(a,b) ((a)*(b)) 223 | #define MULT16_16_P13(a,b) ((a)*(b)) 224 | #define 
MULT16_16_P14(a,b) ((a)*(b)) 225 | #define MULT16_32_P16(a,b) ((a)*(b)) 226 | 227 | #define DIV32_16(a,b) (((opus_val32)(a))/(opus_val16)(b)) 228 | #define DIV32(a,b) (((opus_val32)(a))/(opus_val32)(b)) 229 | 230 | #define SCALEIN(a) ((a)*CELT_SIG_SCALE) 231 | #define SCALEOUT(a) ((a)*(1/CELT_SIG_SCALE)) 232 | 233 | #define SIG2WORD16(x) (x) 234 | 235 | #endif /* !FIXED_POINT */ 236 | 237 | #ifndef GLOBAL_STACK_SIZE 238 | #ifdef FIXED_POINT 239 | #define GLOBAL_STACK_SIZE 120000 240 | #else 241 | #define GLOBAL_STACK_SIZE 120000 242 | #endif 243 | #endif 244 | 245 | #endif /* ARCH_H */ 246 | -------------------------------------------------------------------------------- /src/celt_lpc.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2009-2010 Xiph.Org Foundation 2 | Written by Jean-Marc Valin */ 3 | /* 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | - Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | - Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #include "celt_lpc.h" 28 | #include "arch.h" 29 | #include "common.h" 30 | #include "pitch.h" 31 | 32 | float _celt_lpc( 33 | opus_val16 *_lpc, /* out: [0...p-1] LPC coefficients */ 34 | opus_val16 *rc, 35 | const opus_val32 *ac, /* in: [0...p] autocorrelation values */ 36 | int p 37 | ) 38 | { 39 | int i, j; 40 | opus_val32 r; 41 | opus_val32 error = ac[0]; 42 | #ifdef FIXED_POINT 43 | opus_val32 lpc[LPC_ORDER]; 44 | #else 45 | float *lpc = _lpc; 46 | #endif 47 | 48 | RNN_CLEAR(lpc, p); 49 | RNN_CLEAR(rc, p); 50 | if (ac[0] != 0) 51 | { 52 | for (i = 0; i < p; i++) { 53 | /* Sum up this iteration's reflection coefficient */ 54 | opus_val32 rr = 0; 55 | for (j = 0; j < i; j++) 56 | rr += MULT32_32_Q31(lpc[j],ac[i - j]); 57 | rr += SHR32(ac[i + 1],3); 58 | r = -SHL32(rr,3)/error; 59 | rc[i] = r; 60 | /* Update LPC coefficients and total error */ 61 | lpc[i] = SHR32(r,3); 62 | for (j = 0; j < (i+1)>>1; j++) 63 | { 64 | opus_val32 tmp1, tmp2; 65 | tmp1 = lpc[j]; 66 | tmp2 = lpc[i-1-j]; 67 | lpc[j] = tmp1 + MULT32_32_Q31(r,tmp2); 68 | lpc[i-1-j] = tmp2 + MULT32_32_Q31(r,tmp1); 69 | } 70 | 71 | error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error); 72 | /* Bail out once we get 30 dB gain */ 73 | #ifdef FIXED_POINT 74 | if (error0); 138 | celt_assert(overlap>=0); 139 | if (overlap == 0) 140 | { 141 | xptr = x; 142 | } else { 143 | for (i=0;i 6 | #include 7 | #include 8 | 9 | 10 | float lpc_from_cepstrum(float *lpc, const float *cepstrum); 11 | 12 | #define LOG256 5.5451774445f 13 | static inline float log2_approx(float x) 14 | { 15 | int integer; 16 | float frac; 17 | union { 18 | float f; 19 | int i; 20 | } in; 21 | in.f = x; 22 | integer = (in.i>>23)-127; 23 | in.i -= integer<<23; 24 | frac = in.f - 1.5f; 25 | frac = -0.41445418f + frac*(0.95909232f 26 | + frac*(-0.33951290f + frac*0.16541097f)); 27 | return 1+integer+frac; 28 | } 29 | 30 | #define log_approx(x) (0.69315f*log2_approx(x)) 31 | 32 | /** Copy n elements from src to dst. The 0* term provides compile-time type checking */ 33 | #ifndef OVERRIDE_RNN_COPY 34 | #define RNN_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) )) 35 | #endif 36 | 37 | /** Copy n elements from src to dst, allowing overlapping regions. The 0* term 38 | provides compile-time type checking */ 39 | #ifndef OVERRIDE_RNN_MOVE 40 | #define RNN_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) )) 41 | #endif 42 | 43 | /** Set n elements of dst to zero */ 44 | #ifndef OVERRIDE_RNN_CLEAR 45 | #define RNN_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst)))) 46 | #endif 47 | 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /src/freq.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2018 Mozilla */ 2 | /* 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | - Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 18 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "arch.h" 32 | #include "celt_lpc.h" 33 | #include "common.h" 34 | #include "freq.h" 35 | #include "kiss_fft.h" 36 | #include "pitch.h" 37 | 38 | #define SQUARE(x) ((x)*(x)) 39 | 40 | static const opus_int16 eband5ms[] = { 41 | /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k*/ 42 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40 43 | }; 44 | 45 | static const float compensation[] = { 46 | 0.8f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.666667f, 0.5f, 0.5f, 0.5f, 0.333333f, 0.25f, 0.25f, 0.2f, 0.166667f, 0.173913f 47 | }; 48 | 49 | typedef struct { 50 | int init; 51 | kiss_fft_state *kfft; 52 | float half_window[OVERLAP_SIZE]; 53 | float dct_table[NB_BANDS*NB_BANDS]; 54 | } CommonState; 55 | 56 | 57 | 58 | void compute_band_energy(float *bandE, const kiss_fft_cpx *X) { 59 | int i; 60 | float sum[NB_BANDS] = {0}; 61 | for (i=0;i 33 | #include 34 | #include "arch.h" 35 | 36 | #include 37 | #define opus_alloc(x) malloc(x) 38 | #define opus_free(x) free(x) 39 | 40 | #ifdef __cplusplus 41 | extern "C" { 42 | #endif 43 | 44 | #ifdef USE_SIMD 45 | # include 46 | # define kiss_fft_scalar __m128 47 | #define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes) 48 | #else 49 | #define KISS_FFT_MALLOC opus_alloc 50 | #endif 51 | 52 | #ifdef FIXED_POINT 53 | #include "arch.h" 54 | 55 | # define kiss_fft_scalar opus_int32 56 | # define kiss_twiddle_scalar opus_int16 57 | 58 | 59 | #else 60 | # ifndef kiss_fft_scalar 61 | /* default is float */ 62 | # define kiss_fft_scalar float 63 | # define kiss_twiddle_scalar float 64 | # define KF_SUFFIX _celt_single 65 | # endif 66 | #endif 67 | 68 | typedef struct { 69 | kiss_fft_scalar r; 70 | kiss_fft_scalar i; 71 | }kiss_fft_cpx; 72 | 73 | typedef struct { 74 | kiss_twiddle_scalar r; 75 | kiss_twiddle_scalar i; 76 | }kiss_twiddle_cpx; 77 | 78 | #define MAXFACTORS 8 79 | /* e.g. an fft of length 128 has 4 factors 80 | as far as kissfft is concerned 81 | 4*4*4*2 82 | */ 83 | 84 | typedef struct arch_fft_state{ 85 | int is_supported; 86 | void *priv; 87 | } arch_fft_state; 88 | 89 | typedef struct kiss_fft_state{ 90 | int nfft; 91 | opus_val16 scale; 92 | #ifdef FIXED_POINT 93 | int scale_shift; 94 | #endif 95 | int shift; 96 | opus_int16 factors[2*MAXFACTORS]; 97 | const opus_int16 *bitrev; 98 | const kiss_twiddle_cpx *twiddles; 99 | arch_fft_state *arch_fft; 100 | } kiss_fft_state; 101 | 102 | #if defined(HAVE_ARM_NE10) 103 | #include "arm/fft_arm.h" 104 | #endif 105 | 106 | /*typedef struct kiss_fft_state* kiss_fft_cfg;*/ 107 | 108 | /** 109 | * opus_fft_alloc 110 | * 111 | * Initialize a FFT (or IFFT) algorithm's cfg/state buffer. 
112 | * 113 | * typical usage: kiss_fft_cfg mycfg=opus_fft_alloc(1024,0,NULL,NULL); 114 | * 115 | * The return value from fft_alloc is a cfg buffer used internally 116 | * by the fft routine or NULL. 117 | * 118 | * If lenmem is NULL, then opus_fft_alloc will allocate a cfg buffer using malloc. 119 | * The returned value should be free()d when done to avoid memory leaks. 120 | * 121 | * The state can be placed in a user supplied buffer 'mem': 122 | * If lenmem is not NULL and mem is not NULL and *lenmem is large enough, 123 | * then the function places the cfg in mem and the size used in *lenmem 124 | * and returns mem. 125 | * 126 | * If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough), 127 | * then the function returns NULL and places the minimum cfg 128 | * buffer size in *lenmem. 129 | * */ 130 | 131 | kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base, int arch); 132 | 133 | kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch); 134 | 135 | /** 136 | * opus_fft(cfg,in_out_buf) 137 | * 138 | * Perform an FFT on a complex input buffer. 139 | * for a forward FFT, 140 | * fin should be f[0] , f[1] , ... ,f[nfft-1] 141 | * fout will be F[0] , F[1] , ... ,F[nfft-1] 142 | * Note that each element is complex and can be accessed like 143 | f[k].r and f[k].i 144 | * */ 145 | void opus_fft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout); 146 | void opus_ifft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout); 147 | 148 | void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout); 149 | void opus_ifft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout); 150 | 151 | void opus_fft_free(const kiss_fft_state *cfg, int arch); 152 | 153 | 154 | void opus_fft_free_arch_c(kiss_fft_state *st); 155 | int opus_fft_alloc_arch_c(kiss_fft_state *st); 156 | 157 | #if !defined(OVERRIDE_OPUS_FFT) 158 | /* Is run-time CPU detection enabled on this platform? 
*/ 159 | #if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) 160 | 161 | extern int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])( 162 | kiss_fft_state *st); 163 | 164 | #define opus_fft_alloc_arch(_st, arch) \ 165 | ((*OPUS_FFT_ALLOC_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st)) 166 | 167 | extern void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])( 168 | kiss_fft_state *st); 169 | #define opus_fft_free_arch(_st, arch) \ 170 | ((*OPUS_FFT_FREE_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st)) 171 | 172 | extern void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, 173 | const kiss_fft_cpx *fin, kiss_fft_cpx *fout); 174 | #define opus_fft(_cfg, _fin, _fout, arch) \ 175 | ((*OPUS_FFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout)) 176 | 177 | extern void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, 178 | const kiss_fft_cpx *fin, kiss_fft_cpx *fout); 179 | #define opus_ifft(_cfg, _fin, _fout, arch) \ 180 | ((*OPUS_IFFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout)) 181 | 182 | #else /* else for if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */ 183 | 184 | #define opus_fft_alloc_arch(_st, arch) \ 185 | ((void)(arch), opus_fft_alloc_arch_c(_st)) 186 | 187 | #define opus_fft_free_arch(_st, arch) \ 188 | ((void)(arch), opus_fft_free_arch_c(_st)) 189 | 190 | #define opus_fft(_cfg, _fin, _fout, arch) \ 191 | ((void)(arch), opus_fft_c(_cfg, _fin, _fout)) 192 | 193 | #define opus_ifft(_cfg, _fin, _fout, arch) \ 194 | ((void)(arch), opus_ifft_c(_cfg, _fin, _fout)) 195 | 196 | #endif /* end if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */ 197 | #endif /* end if !defined(OVERRIDE_OPUS_FFT) */ 198 | 199 | #ifdef __cplusplus 200 | } 201 | #endif 202 | 203 | #endif 204 | -------------------------------------------------------------------------------- /src/lpcnet.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2018 Mozilla */ 2 | /* 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | - Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 18 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | #ifndef _LPCNET_H_ 28 | #define _LPCNET_H_ 29 | 30 | 31 | #define NB_FEATURES 38 32 | #define NB_TOTAL_FEATURES 55 33 | 34 | 35 | typedef struct LPCNetEncState LPCNetEncState; 36 | 37 | 38 | /** Gets the size of an LPCNetEncState structure. 
39 | * @returns The size in bytes. 40 | */ 41 | int lpcnet_encoder_get_size(); 42 | 43 | /** Initializes a previously allocated encoder state 44 | * The memory pointed to by st must be at least the size returned by lpcnet_encoder_get_size(). 45 | * This is intended for applications which use their own allocator instead of malloc. 46 | * @see lpcnet_encoder_create(),lpcnet_encoder_get_size() 47 | * @param [in] st LPCNetEncState*: Encoder state 48 | * @retval 0 Success 49 | */ 50 | int lpcnet_encoder_init(LPCNetEncState *st); 51 | 52 | /** Allocates and initializes an encoder state. 53 | * @returns The newly created state 54 | */ 55 | LPCNetEncState *lpcnet_encoder_create(); 56 | 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/lpcnet_enc.c: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2017-2019 Mozilla */ 2 | /* 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | - Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | - Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 18 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | */ 26 | #include <math.h> 27 | #include <stdio.h> 28 | #include <stdlib.h> 29 | #include <string.h> 30 | 31 | #include "arch.h" 32 | #include "celt_lpc.h" 33 | #include "common.h" 34 | #include "freq.h" 35 | #include "kiss_fft.h" 36 | #include "lpcnet.h" 37 | #include "lpcnet_private.h" 38 | #include "pitch.h" 39 | 40 | //#define NB_FEATURES (2*NB_BANDS+3+LPC_ORDER) 41 | 42 | int interp_search(const float *x, const float *left, const float *right, float *dist_out) 43 | { 44 | int i, k; 45 | float min_dist = 1e15; 46 | int best_pred = 0; 47 | float pred[4 * NB_BANDS]; 48 | for (i = 0; i < NB_BANDS; i++) 49 | pred[i] = pred[NB_BANDS + i] = .5 * (left[i] + right[i]); 50 | for (i = 0; i < NB_BANDS; i++) 51 | pred[2 * NB_BANDS + i] = left[i]; 52 | for (i = 0; i < NB_BANDS; i++) 53 | pred[3 * NB_BANDS + i] = right[i]; 54 | 55 | for (k = 1; k < 4; k++) 56 | { 57 | float dist = 0; 58 | for (i = 0; i < NB_BANDS; i++) 59 | dist += (x[i] - pred[k * NB_BANDS + i]) * (x[i] - pred[k * NB_BANDS + i]); 60 | dist_out[k - 1] = dist; 61 | if (dist < min_dist) 62 | { 63 | min_dist = dist; 64 | best_pred = k; 65 | } 66 | } 67 | return best_pred - 1; 68 | } 69 | 70 | int double_interp_search(float features[4][NB_TOTAL_FEATURES], const float *mem) 71 | { 72 | int i, j; 73 | int best_id = 0; 74 | float min_dist = 1e15; 75 | float dist[2][3]; 76 | interp_search(features[0], mem, features[1], dist[0]); 77 | interp_search(features[2], features[1], features[3], dist[1]); 78 | for (i = 0; i < 3; i++) 79 | { 80 | for (j = 0; j < 3; j++) 81 | { 82 | float d; 83 | int id; 84 | id = 3 * i + j; 85 | d = dist[0][i] + dist[1][j]; 86 | if (d < min_dist && id != FORBIDDEN_INTERP) 87 | { 88 | min_dist = d; 89 | best_id = id; 90 | } 91 | } 92 | } 93 | return best_id - (best_id >= FORBIDDEN_INTERP); 94 | } 95 | 96 | 97 | int lpcnet_encoder_get_size() { 98 | return sizeof(LPCNetEncState); 99 | } 100 | 101 | int lpcnet_encoder_init(LPCNetEncState *st) { 102 | memset(st, 0, sizeof(*st)); 103 | return 0; 104 | } 105 | 106 | LPCNetEncState *lpcnet_encoder_create() { 107 | LPCNetEncState *st; 108 | st = malloc(lpcnet_encoder_get_size()); 109 | lpcnet_encoder_init(st); 110 | return st; 111 | } 112 | 113 | static void frame_analysis(LPCNetEncState *st, kiss_fft_cpx *X, float *Ex, const float *in) { 114 | float x[WINDOW_SIZE]; 115 | RNN_COPY(x, st->analysis_mem, OVERLAP_SIZE); 116 | RNN_COPY(&x[OVERLAP_SIZE], in, FRAME_SIZE); 117 | RNN_COPY(st->analysis_mem, &in[FRAME_SIZE - OVERLAP_SIZE], OVERLAP_SIZE); 118 | apply_window(x); 119 | forward_transform(X, x); 120 | compute_band_energy(Ex, X); 121 | } 122 | 123 | void compute_frame_features(LPCNetEncState *st, const float *in) { 124 | float aligned_in[FRAME_SIZE]; 125 | int i; 126 | float E = 0; 127 | float Ly[NB_BANDS]; 128 | float follow, logMax; 129 | float g; 130 | kiss_fft_cpx X[FREQ_SIZE]; 131 | float Ex[NB_BANDS]; 132 | float xcorr[PITCH_MAX_PERIOD]; 133 | float ener0; 134 | int sub; 135 | float ener; 136 | RNN_COPY(aligned_in, &st->analysis_mem[OVERLAP_SIZE - TRAINING_OFFSET], TRAINING_OFFSET); 137 | 138 | // Compute bark-scale cepstrum 139 | frame_analysis(st, X, Ex, in); 140 | logMax = -2; 141 | follow = -2; 142 | for (i = 0; i < NB_BANDS; i++) 143 | { 144 | Ly[i] = log10(1e-2 + Ex[i]); 145 | Ly[i] = MAX16(logMax - 8, MAX16(follow - 2.5, Ly[i])); 146 | logMax = MAX16(logMax, Ly[i]); 147 | follow = MAX16(follow - 2.5, Ly[i]); 148 | E += Ex[i]; 149 | } 150 | 151 | // Compute coefficients from bark-scale cepstrum 152 | dct(st->features[st->pcount], Ly); 153 | st->features[st->pcount][0] -= 4; 154 | 155 | // Compute lpcs
from cepstral coefficients 156 | g = lpc_from_cepstrum(st->lpc, st->features[st->pcount]); 157 | 158 | // Store lpcs in features 159 | st->features[st->pcount][2 * NB_BANDS + 2] = log10(g); 160 | for (i = 0; i < LPC_ORDER; i++) 161 | st->features[st->pcount][2 * NB_BANDS + 3 + i] = st->lpc[i]; 162 | 163 | // Move excitation by one frame 164 | RNN_MOVE(st->exc_buf, &st->exc_buf[FRAME_SIZE], PITCH_MAX_PERIOD); 165 | 166 | // Perform yin pitch-tracking 167 | RNN_COPY(&aligned_in[TRAINING_OFFSET], in, FRAME_SIZE - TRAINING_OFFSET); 168 | for (i = 0; i < FRAME_SIZE; i++) 169 | { 170 | int j; 171 | float sum = aligned_in[i]; 172 | for (j = 0; j < LPC_ORDER; j++) 173 | sum += st->lpc[j] * st->pitch_mem[j]; 174 | RNN_MOVE(st->pitch_mem + 1, st->pitch_mem, LPC_ORDER - 1); 175 | st->pitch_mem[0] = aligned_in[i]; 176 | st->exc_buf[PITCH_MAX_PERIOD + i] = sum + .7 * st->pitch_filt; 177 | st->pitch_filt = sum; 178 | } 179 | /* Cross-correlation on half-frames. */ 180 | for (sub = 0; sub < 2; sub++) 181 | { 182 | int off = sub * FRAME_SIZE / 2; 183 | celt_pitch_xcorr(&st->exc_buf[PITCH_MAX_PERIOD + off], st->exc_buf + off, xcorr, FRAME_SIZE / 2, PITCH_MAX_PERIOD); 184 | ener0 = celt_inner_prod(&st->exc_buf[PITCH_MAX_PERIOD + off], &st->exc_buf[PITCH_MAX_PERIOD + off], FRAME_SIZE / 2); 185 | st->frame_weight[2 + 2 * st->pcount + sub] = ener0; 186 | for (i = 0; i < PITCH_MAX_PERIOD; i++) 187 | { 188 | ener = (1 + ener0 + celt_inner_prod(&st->exc_buf[i + off], &st->exc_buf[i + off], FRAME_SIZE / 2)); 189 | st->xc[2 + 2 * st->pcount + sub][i] = 2 * xcorr[i] / ener; 190 | } 191 | } 192 | } 193 | 194 | void process_superframe(LPCNetEncState *st, FILE *ffeat) { 195 | int i; 196 | int sub; 197 | int best_i; 198 | int best[10]; 199 | int pitch_prev[8][PITCH_MAX_PERIOD]; 200 | float best_a = 0; 201 | float best_b = 0; 202 | float w; 203 | float sx = 0, sxx = 0, sxy = 0, sy = 0, sw = 0; 204 | float frame_corr; 205 | int voiced; 206 | float frame_weight_sum = 1e-15; 207 | float center_pitch; 208 | int main_pitch; 209 | int modulation; 210 | for (sub = 0; sub < 8; sub++) 211 | frame_weight_sum += st->frame_weight[2 + sub]; 212 | for (sub = 0; sub < 8; sub++) 213 | st->frame_weight[2 + sub] *= (8.f / frame_weight_sum); 214 | for (sub = 0; sub < 8; sub++) 215 | { 216 | float max_path_all = -1e15; 217 | best_i = 0; 218 | for (i = 0; i < PITCH_MAX_PERIOD - 2 * PITCH_MIN_PERIOD; i++) 219 | { 220 | float xc_half = MAX16(MAX16(st->xc[2 + sub][(PITCH_MAX_PERIOD + i) / 2], st->xc[2 + sub][(PITCH_MAX_PERIOD + i + 2) / 2]), st->xc[2 + sub][(PITCH_MAX_PERIOD + i - 1) / 2]); 221 | if (st->xc[2 + sub][i] < xc_half * 1.1) 222 | st->xc[2 + sub][i] *= .8; 223 | } 224 | for (i = 0; i < PITCH_MAX_PERIOD - PITCH_MIN_PERIOD; i++) 225 | { 226 | int j; 227 | float max_prev; 228 | max_prev = st->pitch_max_path_all - 6.f; 229 | pitch_prev[sub][i] = st->best_i; 230 | for (j = IMIN(0, 4 - i); j <= 4 && i + j < PITCH_MAX_PERIOD - PITCH_MIN_PERIOD; j++) 231 | { 232 | if (st->pitch_max_path[0][i + j] > max_prev) 233 | { 234 | max_prev = st->pitch_max_path[0][i + j] - .02f * abs(j) * abs(j); 235 | pitch_prev[sub][i] = i + j; 236 | } 237 | } 238 | st->pitch_max_path[1][i] = max_prev + st->frame_weight[2 + sub] * st->xc[2 + sub][i]; 239 | if (st->pitch_max_path[1][i] > max_path_all) 240 | { 241 | max_path_all = st->pitch_max_path[1][i]; 242 | best_i = i; 243 | } 244 | } 245 | /* Renormalize. 
*/ 246 | for (i = 0; i < PITCH_MAX_PERIOD - PITCH_MIN_PERIOD; i++) 247 | st->pitch_max_path[1][i] -= max_path_all; 248 | RNN_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD); 249 | st->pitch_max_path_all = max_path_all; 250 | st->best_i = best_i; 251 | } 252 | best_i = st->best_i; 253 | frame_corr = 0; 254 | 255 | /* Backward pass. */ 256 | for (sub = 7; sub >= 0; sub--) 257 | { 258 | best[2 + sub] = PITCH_MAX_PERIOD - best_i; 259 | frame_corr += st->frame_weight[2 + sub] * st->xc[2 + sub][best_i]; 260 | best_i = pitch_prev[sub][best_i]; 261 | } 262 | 263 | frame_corr /= 8; 264 | 265 | for (sub = 2; sub < 10; sub++) 266 | { 267 | w = st->frame_weight[sub]; 268 | sw += w; 269 | sx += w * sub; 270 | sxx += w * sub * sub; 271 | sxy += w * sub * best[sub]; 272 | sy += w * best[sub]; 273 | } 274 | voiced = frame_corr >= .3; 275 | 276 | /* Linear regression to figure out the pitch contour. */ 277 | best_a = (sw * sxy - sx * sy) / (sw * sxx - sx * sx); 278 | if (voiced) 279 | { 280 | float max_a; 281 | float mean_pitch = sy / sw; 282 | 283 | /* Allow a relative variation of up to 1/4 over 8 sub-frames. */ 284 | max_a = mean_pitch / 32; 285 | best_a = MIN16(max_a, MAX16(-max_a, best_a)); 286 | } 287 | else 288 | { 289 | best_a = 0; 290 | } 291 | 292 | best_b = (sy - best_a * sx) / sw; 293 | 294 | /* Quantizing the pitch as "main" pitch + slope. */ 295 | center_pitch = best_b + 5.5 * best_a; 296 | main_pitch = (int)floor(.5 + 21. * log2(center_pitch / PITCH_MIN_PERIOD)); 297 | main_pitch = IMAX(0, IMIN(63, main_pitch)); 298 | modulation = (int)floor(.5 + 16 * 7 * best_a / center_pitch); 299 | modulation = IMAX(-3, IMIN(3, modulation)); 300 | 301 | for (sub = 0; sub < 4; sub++) 302 | { 303 | st->features[sub][2 * NB_BANDS] = .01 * (best[2 + 2 * sub] + best[2 + 2 * sub + 1] - 200); 304 | st->features[sub][2 * NB_BANDS + 1] = frame_corr - .5; 305 | } 306 | RNN_COPY(&st->xc[0][0], &st->xc[8][0], PITCH_MAX_PERIOD); 307 | RNN_COPY(&st->xc[1][0], &st->xc[9][0], PITCH_MAX_PERIOD); 308 | for (sub = 0; sub < 4; sub++) 309 | { 310 | float g = lpc_from_cepstrum(st->lpc, st->features[sub]); 311 | st->features[sub][2 * NB_BANDS + 2] = log10(g); 312 | for (i = 0; i < LPC_ORDER; i++) 313 | st->features[sub][2 * NB_BANDS + 3 + i] = st->lpc[i]; 314 | } 315 | if (ffeat) 316 | { 317 | for (i = 0; i < 4; i++) 318 | { 319 | fwrite(st->features[i], sizeof(float), NB_TOTAL_FEATURES, ffeat); 320 | } 321 | } 322 | } 323 | -------------------------------------------------------------------------------- /src/lpcnet_private.h: -------------------------------------------------------------------------------- 1 | #ifndef LPCNET_PRIVATE_H 2 | #define LPCNET_PRIVATE_H 3 | 4 | #include "celt_lpc.h" 5 | #include "common.h" 6 | #include "freq.h" 7 | #include "lpcnet.h" 8 | 9 | #define PITCH_MIN_PERIOD 32 // Shortest pitch period in samples (500 Hz at 16 kHz) 10 | #define PITCH_MAX_PERIOD 256 // Longest pitch period in samples (62.5 Hz at 16 kHz) 11 | 12 | #define PITCH_FRAME_SIZE 320 // Samples per pitch-analysis frame 13 | #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD + PITCH_FRAME_SIZE) // Excitation history plus one pitch frame 14 | 15 | #define FORBIDDEN_INTERP 7 16 | 17 | struct LPCNetEncState 18 | { 19 | float analysis_mem[OVERLAP_SIZE]; // Overlap samples carried into the next windowed FFT 20 | int pcount; // Frames accumulated toward the current 4-frame block 21 | float pitch_mem[LPC_ORDER]; // Past input samples for the LPC analysis filter 22 | float pitch_filt; // One-tap filter state applied to the LPC residual 23 | float xc[10][PITCH_MAX_PERIOD + 1]; // Normalized cross-correlation per half-frame (2 carried over + 8 current) 24 | float frame_weight[10]; // Energy-based weight per half-frame 25 | float exc_buf[PITCH_BUF_SIZE]; // Excitation history used for pitch cross-correlation 26 | float pitch_max_path[2][PITCH_MAX_PERIOD]; // Viterbi path scores (previous and current column) 27 | float pitch_max_path_all; // Best cumulative path score so far 28 | int best_i; // Index of the current best pitch candidate 29 | float lpc[LPC_ORDER]; // LPC coefficients of the current frame 30 | float features[4][NB_TOTAL_FEATURES]; // Feature vectors for the 4-frame block 31 | float sig_mem[LPC_ORDER]; // Past signal samples for sample-level prediction 32 | int exc_mem; // Previous mu-law excitation sample 33 | }; 34 |
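The state above is filled in two passes: compute_frame_features() (declared just below) populates one frame slot per call, and process_superframe() runs the pitch tracker over a completed 4-frame block and writes that block's features. A minimal driving loop, distilled from the main() of src/preprocess.c further down, might look like the following sketch; encode_block is a hypothetical name, not a function in this repository, and the FRAME_SIZE fallback mirrors the guard in src/preprocess.c:

    #include <stdio.h>
    #include "lpcnet.h"
    #include "lpcnet_private.h"

    #ifndef FRAME_SIZE
    #define FRAME_SIZE 160 /* assumed, as in src/preprocess.c */
    #endif

    /* Sketch: consume 4 * FRAME_SIZE float samples and emit one superframe. */
    static void encode_block(LPCNetEncState *st, const float *samples, FILE *ffeat) {
        for (int k = 0; k < 4; k++) {
            compute_frame_features(st, samples + k * FRAME_SIZE); /* cepstrum, LPC, xcorr */
            st->pcount++;                                         /* advance the frame slot */
        }
        process_superframe(st, ffeat); /* pitch track, then write 4 feature vectors */
        st->pcount = 0;                /* start the next block */
    }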
35 | void process_superframe(LPCNetEncState *st, FILE *ffeat); 36 | 37 | void compute_frame_features(LPCNetEncState *st, const float *in); 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /src/opus_types.h: -------------------------------------------------------------------------------- 1 | /* (C) COPYRIGHT 1994-2002 Xiph.Org Foundation */ 2 | /* Modified by Jean-Marc Valin */ 3 | /* 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | - Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | - Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 19 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | /* opus_types.h based on ogg_types.h from libogg */ 28 | 29 | /** 30 | @file opus_types.h 31 | @brief Opus reference implementation types 32 | */ 33 | #ifndef OPUS_TYPES_H 34 | #define OPUS_TYPES_H 35 | 36 | /* Use the real stdint.h if it's there (taken from Paul Hsieh's pstdint.h) */ 37 | #if (defined(__STDC__) && __STDC__ && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_)) || defined (HAVE_STDINT_H)) 38 | #include <stdint.h> 39 | 40 | typedef int16_t opus_int16; 41 | typedef uint16_t opus_uint16; 42 | typedef int32_t opus_int32; 43 | typedef uint32_t opus_uint32; 44 | #elif defined(_WIN32) 45 | 46 | # if defined(__CYGWIN__) 47 | # include <_G_config.h> 48 | typedef _G_int32_t opus_int32; 49 | typedef _G_uint32_t opus_uint32; 50 | typedef _G_int16 opus_int16; 51 | typedef _G_uint16 opus_uint16; 52 | # elif defined(__MINGW32__) 53 | typedef short opus_int16; 54 | typedef unsigned short opus_uint16; 55 | typedef int opus_int32; 56 | typedef unsigned int opus_uint32; 57 | # elif defined(__MWERKS__) 58 | typedef int opus_int32; 59 | typedef unsigned int opus_uint32; 60 | typedef short opus_int16; 61 | typedef unsigned short opus_uint16; 62 | # else 63 | /* MSVC/Borland */ 64 | typedef __int32 opus_int32; 65 | typedef unsigned __int32 opus_uint32; 66 | typedef __int16 opus_int16; 67 | typedef unsigned __int16 opus_uint16; 68 | # endif 69 | 70 | #elif defined(__MACOS__) 71 | 72 | # include <sys/types.h> 73 | typedef SInt16 opus_int16; 74 | typedef UInt16 opus_uint16; 75 | typedef SInt32 opus_int32; 76 | typedef UInt32 opus_uint32; 77 | 78 | #elif (defined(__APPLE__) && defined(__MACH__)) /* MacOS X Framework build */ 79 | 80 | # include <sys/types.h> 81 | typedef int16_t opus_int16; 82 | typedef
u_int16_t opus_uint16; 83 | typedef int32_t opus_int32; 84 | typedef u_int32_t opus_uint32; 85 | 86 | #elif defined(__BEOS__) 87 | 88 | /* Be */ 89 | # include <inttypes.h> 90 | typedef int16 opus_int16; 91 | typedef u_int16 opus_uint16; 92 | typedef int32_t opus_int32; 93 | typedef u_int32_t opus_uint32; 94 | 95 | #elif defined (__EMX__) 96 | 97 | /* OS/2 GCC */ 98 | typedef short opus_int16; 99 | typedef unsigned short opus_uint16; 100 | typedef int opus_int32; 101 | typedef unsigned int opus_uint32; 102 | 103 | #elif defined (DJGPP) 104 | 105 | /* DJGPP */ 106 | typedef short opus_int16; 107 | typedef unsigned short opus_uint16; 108 | typedef int opus_int32; 109 | typedef unsigned int opus_uint32; 110 | 111 | #elif defined(R5900) 112 | 113 | /* PS2 EE */ 114 | typedef int opus_int32; 115 | typedef unsigned opus_uint32; 116 | typedef short opus_int16; 117 | typedef unsigned short opus_uint16; 118 | 119 | #elif defined(__SYMBIAN32__) 120 | 121 | /* Symbian GCC */ 122 | typedef signed short opus_int16; 123 | typedef unsigned short opus_uint16; 124 | typedef signed int opus_int32; 125 | typedef unsigned int opus_uint32; 126 | 127 | #elif defined(CONFIG_TI_C54X) || defined (CONFIG_TI_C55X) 128 | 129 | typedef short opus_int16; 130 | typedef unsigned short opus_uint16; 131 | typedef long opus_int32; 132 | typedef unsigned long opus_uint32; 133 | 134 | #elif defined(CONFIG_TI_C6X) 135 | 136 | typedef short opus_int16; 137 | typedef unsigned short opus_uint16; 138 | typedef int opus_int32; 139 | typedef unsigned int opus_uint32; 140 | 141 | #else 142 | 143 | /* Give up, take a reasonable guess */ 144 | typedef short opus_int16; 145 | typedef unsigned short opus_uint16; 146 | typedef int opus_int32; 147 | typedef unsigned int opus_uint32; 148 | 149 | #endif 150 | 151 | #define opus_int int /* used for counters etc; at least 16 bits */ 152 | #define opus_int64 long long 153 | #define opus_int8 signed char 154 | 155 | #define opus_uint unsigned int /* used for counters etc; at least 16 bits */ 156 | #define opus_uint64 unsigned long long 157 | #define opus_uint8 unsigned char 158 | 159 | #endif /* OPUS_TYPES_H */ 160 | -------------------------------------------------------------------------------- /src/pitch.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2007-2008 CSIRO 2 | Copyright (c) 2007-2009 Xiph.Org Foundation 3 | Written by Jean-Marc Valin */ 4 | /** 5 | @file pitch.h 6 | @brief Pitch analysis 7 | */ 8 | 9 | /* 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions 12 | are met: 13 | 14 | - Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | 17 | - Redistributions in binary form must reproduce the above copyright 18 | notice, this list of conditions and the following disclaimer in the 19 | documentation and/or other materials provided with the distribution. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER 25 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | 34 | #ifndef PITCH_H 35 | #define PITCH_H 36 | 37 | #include "arch.h" 38 | 39 | void pitch_downsample(opus_val16 *x_lp, 40 | int len); 41 | 42 | void pitch_search(const opus_val16 *x_lp, opus_val16 *y, 43 | int len, int max_pitch, int *pitch); 44 | 45 | opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, 46 | int N, int *T0, int prev_period, opus_val16 prev_gain); 47 | 48 | 49 | /* OPT: This is the kernel you really want to optimize. It gets used a lot 50 | by the prefilter and by the PLC. */ 51 | static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) 52 | { 53 | int j; 54 | opus_val16 y_0, y_1, y_2, y_3; 55 | celt_assert(len>=3); 56 | y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ 57 | y_0=*y++; 58 | y_1=*y++; 59 | y_2=*y++; 60 | for (j=0;j<len-3;j+=4) -------------------------------------------------------------------------------- /src/preprocess.c: -------------------------------------------------------------------------------- 26 | #include <math.h> 27 | #include <stdio.h> 28 | #include <stdlib.h> 29 | #include <string.h> 30 | 31 | #include "lpcnet.h" 32 | #include "lpcnet_private.h" 33 | 34 | 35 | /****************************************************************************** 36 | Constants 37 | ******************************************************************************/ 38 | 39 | 40 | #ifndef FRAME_SIZE 41 | #define FRAME_SIZE 160 42 | #endif 43 | 44 | #ifndef HALF_FRAME 45 | #define HALF_FRAME (FRAME_SIZE / 2) 46 | #endif 47 | 48 | #ifndef LOG256 49 | #define LOG256 5.5451774445f 50 | #endif 51 | 52 | #ifndef LPC_ORDER 53 | #define LPC_ORDER 16 54 | #endif 55 | 56 | #ifndef NB_BANDS 57 | #define NB_BANDS 18 58 | #endif 59 | 60 | #ifndef OVERLAP_SIZE 61 | #define OVERLAP_SIZE 160 62 | #endif 63 | 64 | #ifndef PITCH_MAX_PERIOD 65 | #define PITCH_MAX_PERIOD 256 66 | #endif 67 | 68 | #ifndef WINDOW_SIZE 69 | #define WINDOW_SIZE (FRAME_SIZE + OVERLAP_SIZE) 70 | #endif 71 | 72 | #ifndef FREQ_SIZE 73 | #define FREQ_SIZE (WINDOW_SIZE / 2 + 1) 74 | #endif 75 | 76 | #ifndef log_approx 77 | #define log_approx(x) (0.69315f * log2_approx(x)) 78 | #endif 79 | 80 | #ifndef max 81 | #define max(a, b) (((a) > (b)) ? (a) : (b)) 82 | #endif 83 | 84 | #ifndef min 85 | #define min(a, b) (((a) < (b)) ? (a) : (b)) 86 | #endif 87 | 88 | 89 | /****************************************************************************** 90 | Conversions 91 | ******************************************************************************/ 92 | 93 | 94 | static inline float mulaw_to_linear(float u) { 95 | /* Convert mulaw-encoded audio to linear */ 96 | float s; 97 | float scale_1 = 32768.f / 255.f; 98 | u = u - 128; 99 | s = u >= 0 ? 1 : -1; 100 | u = fabs(u); 101 | return s * scale_1 * (exp(u / 128. * LOG256) - 1); 102 | } 103 | 104 | 105 | static inline int linear_to_mulaw(float x) { 106 | /* Convert linear audio to mulaw */ 107 | float u; 108 | float scale = 255.f / 32768.f; 109 | int s = x >= 0 ?
1 : -1; 110 | x = fabs(x); 111 | u = (s * (128 * log_approx(1 + scale * x) / LOG256)); 112 | u = 128 + u; 113 | if (u < 0) 114 | u = 0; 115 | if (u > 255) 116 | u = 255; 117 | return (int)floor(.5 + u); 118 | } 119 | 120 | 121 | /****************************************************************************** 122 | File output 123 | ******************************************************************************/ 124 | 125 | 126 | void write_audio(LPCNetEncState *st, const short *pcm, FILE *file) { 127 | // Iterate over frames in a block 128 | for (int k = 0; k < 4; k++) { 129 | 130 | // Container for sample-level features 131 | unsigned char data[4 * FRAME_SIZE]; 132 | 133 | // Iterate over samples in a frame 134 | for (int i = 0; i < FRAME_SIZE; i++) { 135 | 136 | // Compute prediction from lpc coefficients and previous samples 137 | float p = 0; 138 | for (int j = 0; j < LPC_ORDER; j++) 139 | p -= st->features[k][2 * NB_BANDS + 3 + j] * st->sig_mem[j]; 140 | 141 | // Compute excitation from sample and prediction 142 | float e = linear_to_mulaw(pcm[k * FRAME_SIZE + i] - p); 143 | 144 | // Mu-law encoded signal 145 | data[4 * i] = linear_to_mulaw(st->sig_mem[0]); 146 | 147 | // Mu-law encoded prediction 148 | data[4 * i + 1] = linear_to_mulaw(p); 149 | 150 | // Input excitation 151 | data[4 * i + 2] = st->exc_mem; 152 | 153 | // Target excitation 154 | data[4 * i + 3] = e; 155 | 156 | // Delay signal by one 157 | unsigned int size = (LPC_ORDER - 1) * sizeof(float); 158 | memmove(&st->sig_mem[1], &st->sig_mem[0], size); 159 | 160 | // Bound excitation 161 | e = min(255, max(0, e)); 162 | 163 | // Store computed values for next iteration 164 | st->sig_mem[0] = p + mulaw_to_linear(e); 165 | st->exc_mem = e; 166 | } 167 | 168 | // Write sample-rate features to disk 169 | fwrite(data, 4 * FRAME_SIZE, 1, file); 170 | } 171 | } 172 | 173 | 174 | /****************************************************************************** 175 | Entry point 176 | ******************************************************************************/ 177 | 178 | 179 | int main(int argc, char **argv) { 180 | float x[FRAME_SIZE]; 181 | FILE *output_sample_file = NULL; 182 | short pcm[FRAME_SIZE] = {0}; 183 | short pcmbuf[FRAME_SIZE * 4] = {0}; 184 | 185 | // Create encoder 186 | LPCNetEncState *st = lpcnet_encoder_create(); 187 | 188 | // Open input audio file 189 | FILE *input_file = fopen(argv[1], "rb"); 190 | 191 | // Open output feature file 192 | FILE *output_feature_file = fopen(argv[2], "wb"); 193 | 194 | // Open output file for sample-rate features and training targets 195 | if (argc == 4) output_sample_file = fopen(argv[3], "wb"); 196 | 197 | // Read in up to FRAME_SIZE samples 198 | while (fread(pcm, sizeof(short), FRAME_SIZE, input_file) == FRAME_SIZE) { 199 | 200 | // Cast to float 201 | for (int i = 0; i < FRAME_SIZE; i++) x[i] = pcm[i]; 202 | 203 | // Compute pitch, correlation, and bark-scale coefficients 204 | compute_frame_features(st, x); 205 | 206 | // Copy frame into position in 4-frame buffer 207 | memcpy(&pcmbuf[st->pcount * FRAME_SIZE], pcm, FRAME_SIZE * sizeof(*pcm)); 208 | st->pcount++; 209 | 210 | // Running on groups of 4 frames 211 | if (st->pcount == 4) { 212 | process_superframe(st, output_feature_file); 213 | 214 | // If training, write audio frame 215 | if (output_sample_file) write_audio(st, pcmbuf, output_sample_file); 216 | 217 | // Reset count 218 | st->pcount = 0; 219 | } 220 | } 221 | 222 | // Close files 223 | fclose(input_file); 224 | fclose(output_feature_file); 225 | if 
(output_sample_file) fclose(output_sample_file); 226 | 227 | // Clean-up encoder memory 228 | free(st); 229 | 230 | return 0; 231 | } 232 | --------------------------------------------------------------------------------
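Each process_superframe() call in the loop above appends four records of NB_TOTAL_FEATURES (55) float32 values to the feature file, one record per 10 ms frame at LPCNet's 16 kHz rate; per src/lpcnet_enc.c, indices 0-17 hold the bark-scale cepstrum, 36 the scaled pitch period, 37 the pitch correlation (offset by -0.5), 38 the LPC gain (log10 g), and 39-54 the LPC coefficients. A standalone reader of that layout might look like the following sketch; it is not a file in this repository:

    #include <stdio.h>

    #define NB_TOTAL_FEATURES 55 /* matches src/lpcnet.h */

    /* Sketch: count the feature frames written by the preprocessor. */
    int main(int argc, char **argv) {
        if (argc != 2) {
            fprintf(stderr, "usage: %s features.f32\n", argv[0]);
            return 1;
        }
        FILE *f = fopen(argv[1], "rb");
        if (!f) {
            perror("fopen");
            return 1;
        }
        float frame[NB_TOTAL_FEATURES];
        long count = 0;
        while (fread(frame, sizeof(float), NB_TOTAL_FEATURES, f) == NB_TOTAL_FEATURES)
            count++; /* one 55-float record per 10 ms frame */
        fclose(f);
        printf("%ld frames (%.2f s of audio)\n", count, count * 0.01);
        return 0;
    }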